"Fossies" - the Fresh Open Source Software Archive

Member "openssl-1.0.2q/crypto/aes/asm/aes-x86_64.pl" (20 Nov 2018, 74952 Bytes) of package /linux/misc/openssl-1.0.2q.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) Perl source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "aes-x86_64.pl", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.1.0g_vs_1.1.1-pre2.

    1 #!/usr/bin/env perl
    2 #
    3 # ====================================================================
    4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
    5 # project. The module is, however, dual licensed under OpenSSL and
    6 # CRYPTOGAMS licenses depending on where you obtain it. For further
    7 # details see http://www.openssl.org/~appro/cryptogams/.
    8 # ====================================================================
    9 #
   10 # Version 2.1.
   11 #
   12 # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
   13 # Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
   14 # [you'll notice a lot of resemblance], such as compressed S-boxes
   15 # in little-endian byte order, prefetch of these tables in CBC mode,
   16 # as well as avoiding L1 cache aliasing between stack frame and key
   17 # schedule and already mentioned tables, compressed Td4...
   18 #
   19 # Performance in number of cycles per processed byte for 128-bit key:
   20 #
   21 #       ECB encrypt ECB decrypt CBC large chunk
   22 # AMD64     33      43      13.0
   23 # EM64T     38      56      18.6(*)
   24 # Core 2    30      42      14.5(*)
   25 # Atom      65      86      32.1(*)
   26 #
   27 # (*) with hyper-threading off
   28 
   # Command line: [flavour] output-file.  When the first argument contains a
   # dot it is taken to be the output file name and no flavour is set.
   $flavour = shift;
   $output  = shift;
   if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

   # Win64 targets are recognised by flavour name or by an .asm output file.
   $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

   # Locate the perlasm translator next to this script or in ../../perlasm.
   # The directory is taken from the match itself, so a script path without
   # a directory part yields "" instead of whatever $1 happened to hold.
   $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
   ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
   ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
   die "can't locate x86_64-xlate.pl";

   # STDOUT is aliased to a pipe into the translator, so everything printed
   # afterwards goes through it.  Fail loudly if the pipe cannot be started
   # rather than silently producing no output.
   open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
   *STDOUT=*OUT;
   42 
   # Tuning knobs.
   # Unlike the 32-bit version, $verticalspin performs ~15% better on both
   # AMD and Intel cores.
   $verticalspin = 1;
   # See aes-586.pl for details.
   $speed_limit = 512;

   # Accumulator for the generated assembly text.
   $code = ".text\n";

   # Register map shared by all generators below.
   ($s0, $s1, $s2, $s3)        = ("%eax", "%ebx", "%ecx", "%edx");  # state words
   ($acc0, $acc1, $acc2)       = ("%esi", "%edi", "%ebp");          # 32-bit scratch
   ($mask80, $maskfe, $mask1b) = ("%rsi", "%rdi", "%rbp");          # 64-bit names of the same registers
   ($inp, $out)                = ("%r8", "%r9");
   ($t0, $t1, $t2)             = ("%r10d", "%r11d", "%r12d");
   ($rnds, $sbox, $key)        = ("%r13d", "%r14", "%r15");
   64 
   # hi($reg) -- return the high-byte alias (%ah, %bh, %ch, %dh) of a 32- or
   # 64-bit %eax/%rax .. %edx/%rdx register name.  Any other string is
   # returned unchanged.  The sub carries an empty prototype, so it is
   # invoked with the &hi(...) call form, as the generators in this file do.
   sub hi() {
       my $r = shift;
       # $1 (not \1) is the supported way to refer to a capture in the
       # replacement part of s///.
       $r =~ s/%[er]([a-d])x/%${1}h/;
       return $r;
   }
   # lo($reg) -- return the low-byte alias of a register name:
   #   %eax/%rax .. %edx/%rdx  ->  %al .. %dl
   #   %esi/%rsi, %edi/%rdi    ->  %sil, %dil
   #   %rN or %rNd             ->  %rNb
   # Names matching none of the patterns (e.g. %ebp) are returned unchanged.
   # Invoked as &lo(...) because of the empty prototype.
   sub lo() {
       my $r = shift;
       # $1 (not \1) is the supported capture reference in a replacement.
       $r =~ s/%[er]([a-d])x/%${1}l/;
       $r =~ s/%[er]([sd]i)/%${1}l/;
       $r =~ s/%(r[0-9]+)[d]?/%${1}b/;
       return $r;
   }
   # LO($reg) -- return the 32-bit name of a 64-bit register:
   #   %rax, %rbx, ...  ->  %eax, %ebx, ...
   #   %rN              ->  %rNd
   # A name that already ends in "d" (%rNd) is returned as is; previously it
   # came back as the invalid "%rNdd".  Invoked as &LO(...) because of the
   # empty prototype.
   sub LO() {
       my $r = shift;
       # $1 (not \1) is the supported capture reference in a replacement.
       $r =~ s/%r([a-z]+)/%e$1/;
       # The optional trailing "d" is consumed so the result is idempotent.
       $r =~ s/%r([0-9]+)d?/%r${1}d/;
       return $r;
   }
   # _data_word(@words) -- append one ".long w,w" line per argument to $code,
   # i.e. every 32-bit value is emitted twice in a row.  Processing stops at
   # the first undefined argument.
   sub _data_word()
   {
       foreach my $word (@_) {
           last unless defined($word);
           $code .= sprintf(".long\t0x%08x,0x%08x\n", $word, $word);
       }
   }
   # data_word(@words) -- append a single ".long" directive to $code that
   # lists all arguments as comma-separated 0x%08x values.
   #
   # An empty argument list emits nothing (the old pop/shift loop produced a
   # stray ".long 0x00000000"), and every argument is emitted; the old loop
   # silently dropped everything between the first undefined element and
   # the last one.  Invoked as &data_word(...) because of the empty prototype.
   sub data_word()
   {
       return unless @_;
       $code .= ".long\t" . join(",", map { sprintf("0x%08x", $_) } @_) . "\n";
   }
   82 
   # data_byte(@bytes) -- append a single ".byte" directive to $code that
   # lists all arguments, each reduced to its low 8 bits, as comma-separated
   # 0x%02x values.
   #
   # An empty argument list emits nothing (the old pop/shift loop produced a
   # stray ".byte 0x00"), and every argument is emitted; the old loop
   # silently dropped everything between the first undefined element and
   # the last one.  Invoked as &data_byte(...) because of the empty prototype.
   sub data_byte()
   {
       return unless @_;
       $code .= ".byte\t" . join(",", map { sprintf("0x%02x", $_ & 0xff) } @_) . "\n";
   }
   90 
   # encvert() -- append the code for one complete middle encryption round
   # to $code ("vertical" scheduling: all four columns are interleaved).
   # Takes no arguments; operates on the global register map.
   #
   # The emitted code takes every byte of $s0..$s3 (low byte, high byte, and
   # the same two again after a 16-bit right shift), uses it as an index into
   # the table at $sbox with an 8-byte stride, and XORs the 32-bit loads at
   # byte offsets 0, 3, 2 and 1 of the entry into $t0, $t1, $t2 and $t3.
   # It then advances $key by 16, loads the words at 0..12($key) into
   # $s0..$s3 and XORs the accumulators into them.
   #
   # $t3 is %r8d, the low half of $inp (%r8), so the emitted round destroys
   # $inp.  $acc0..$acc2 are used as index scratch.
   91 sub encvert()
   92 { my $t3="%r8d";    # zaps $inp!
   93 
   94 $code.=<<___;
   95     # favor 3-way issue Opteron pipeline...
   96     movzb   `&lo("$s0")`,$acc0
   97     movzb   `&lo("$s1")`,$acc1
   98     movzb   `&lo("$s2")`,$acc2
   99     mov 0($sbox,$acc0,8),$t0
  100     mov 0($sbox,$acc1,8),$t1
  101     mov 0($sbox,$acc2,8),$t2
  102 
  103     movzb   `&hi("$s1")`,$acc0
  104     movzb   `&hi("$s2")`,$acc1
  105     movzb   `&lo("$s3")`,$acc2
  106     xor 3($sbox,$acc0,8),$t0
  107     xor 3($sbox,$acc1,8),$t1
  108     mov 0($sbox,$acc2,8),$t3
  109 
  110     movzb   `&hi("$s3")`,$acc0
  111     shr \$16,$s2
  112     movzb   `&hi("$s0")`,$acc2
  113     xor 3($sbox,$acc0,8),$t2
  114     shr \$16,$s3
  115     xor 3($sbox,$acc2,8),$t3
  116 
  117     shr \$16,$s1
  118     lea 16($key),$key
  119     shr \$16,$s0
  120 
  121     movzb   `&lo("$s2")`,$acc0
  122     movzb   `&lo("$s3")`,$acc1
  123     movzb   `&lo("$s0")`,$acc2
  124     xor 2($sbox,$acc0,8),$t0
  125     xor 2($sbox,$acc1,8),$t1
  126     xor 2($sbox,$acc2,8),$t2
  127 
  128     movzb   `&hi("$s3")`,$acc0
  129     movzb   `&hi("$s0")`,$acc1
  130     movzb   `&lo("$s1")`,$acc2
  131     xor 1($sbox,$acc0,8),$t0
  132     xor 1($sbox,$acc1,8),$t1
  133     xor 2($sbox,$acc2,8),$t3
  134 
  135     mov 12($key),$s3
  136     movzb   `&hi("$s1")`,$acc1
  137     movzb   `&hi("$s2")`,$acc2
  138     mov 0($key),$s0
  139     xor 1($sbox,$acc1,8),$t2
  140     xor 1($sbox,$acc2,8),$t3
  141 
  142     mov 4($key),$s1
  143     mov 8($key),$s2
  144     xor $t0,$s0
  145     xor $t1,$s1
  146     xor $t2,$s2
  147     xor $t3,$s3
  148 ___
  149 }
  150 
  # enclastvert() -- append the code for the final encryption round to
  # $code (vertical scheduling).  Takes no arguments.
  #
  # The byte-selection pattern is the same as in encvert(), but every table
  # access contributes exactly one byte lane: the low byte comes from a
  # movzb at offset 2 of the 8-byte entry, the other three from 32-bit loads
  # at offset 0 or 2 that are masked with 0x0000ff00, 0x00ff0000 or
  # 0xff000000.  $key is not advanced; the round key is read from
  # 16+0..16+12($key) into $s0..$s3 and the accumulators $t0..$t3 are XORed
  # in.  As in encvert(), $t3 (%r8d) overwrites $inp.
  151 sub enclastvert()
  152 { my $t3="%r8d";    # zaps $inp!
  153 
  154 $code.=<<___;
  155     movzb   `&lo("$s0")`,$acc0
  156     movzb   `&lo("$s1")`,$acc1
  157     movzb   `&lo("$s2")`,$acc2
  158     movzb   2($sbox,$acc0,8),$t0
  159     movzb   2($sbox,$acc1,8),$t1
  160     movzb   2($sbox,$acc2,8),$t2
  161 
  162     movzb   `&lo("$s3")`,$acc0
  163     movzb   `&hi("$s1")`,$acc1
  164     movzb   `&hi("$s2")`,$acc2
  165     movzb   2($sbox,$acc0,8),$t3
  166     mov 0($sbox,$acc1,8),$acc1  #$t0
  167     mov 0($sbox,$acc2,8),$acc2  #$t1
  168 
  169     and \$0x0000ff00,$acc1
  170     and \$0x0000ff00,$acc2
  171 
  172     xor $acc1,$t0
  173     xor $acc2,$t1
  174     shr \$16,$s2
  175 
  176     movzb   `&hi("$s3")`,$acc0
  177     movzb   `&hi("$s0")`,$acc1
  178     shr \$16,$s3
  179     mov 0($sbox,$acc0,8),$acc0  #$t2
  180     mov 0($sbox,$acc1,8),$acc1  #$t3
  181 
  182     and \$0x0000ff00,$acc0
  183     and \$0x0000ff00,$acc1
  184     shr \$16,$s1
  185     xor $acc0,$t2
  186     xor $acc1,$t3
  187     shr \$16,$s0
  188 
  189     movzb   `&lo("$s2")`,$acc0
  190     movzb   `&lo("$s3")`,$acc1
  191     movzb   `&lo("$s0")`,$acc2
  192     mov 0($sbox,$acc0,8),$acc0  #$t0
  193     mov 0($sbox,$acc1,8),$acc1  #$t1
  194     mov 0($sbox,$acc2,8),$acc2  #$t2
  195 
  196     and \$0x00ff0000,$acc0
  197     and \$0x00ff0000,$acc1
  198     and \$0x00ff0000,$acc2
  199 
  200     xor $acc0,$t0
  201     xor $acc1,$t1
  202     xor $acc2,$t2
  203 
  204     movzb   `&lo("$s1")`,$acc0
  205     movzb   `&hi("$s3")`,$acc1
  206     movzb   `&hi("$s0")`,$acc2
  207     mov 0($sbox,$acc0,8),$acc0  #$t3
  208     mov 2($sbox,$acc1,8),$acc1  #$t0
  209     mov 2($sbox,$acc2,8),$acc2  #$t1
  210 
  211     and \$0x00ff0000,$acc0
  212     and \$0xff000000,$acc1
  213     and \$0xff000000,$acc2
  214 
  215     xor $acc0,$t3
  216     xor $acc1,$t0
  217     xor $acc2,$t1
  218 
  219     movzb   `&hi("$s1")`,$acc0
  220     movzb   `&hi("$s2")`,$acc1
  221     mov 16+12($key),$s3
  222     mov 2($sbox,$acc0,8),$acc0  #$t2
  223     mov 2($sbox,$acc1,8),$acc1  #$t3
  224     mov 16+0($key),$s0
  225 
  226     and \$0xff000000,$acc0
  227     and \$0xff000000,$acc1
  228 
  229     xor $acc0,$t2
  230     xor $acc1,$t3
  231 
  232     mov 16+4($key),$s1
  233     mov 16+8($key),$s2
  234     xor $t0,$s0
  235     xor $t1,$s1
  236     xor $t2,$s2
  237     xor $t3,$s3
  238 ___
  239 }
  240 
  # encstep($i, @s) -- append the code for output column $i of one middle
  # encryption round to $code ("horizontal" variant, selected when
  # $verticalspin is false).
  #
  #   $i  column index 0..3; selects the destination register and the
  #       round-key word at 4*$i($key)
  #   @s  the four state registers, rotated by the caller so that $s[0]
  #       supplies byte 0, $s[1] byte 1, $s[2] byte 2 and $s[3] byte 3
  #
  # Columns 0..2 are accumulated in $t0..$t2 with $acc0..$acc2 as scratch.
  # Column 3 is computed in place in $s[0]; once it is done, $t0..$t2 are
  # copied back into $s[1..3].  The call for column 0 also emits the
  # 16-byte advance of $key.
  241 sub encstep()
  242 { my ($i,@s) = @_;
  243   my $tmp0=$acc0;
  244   my $tmp1=$acc1;
  245   my $tmp2=$acc2;
        # Lexical $out shadows the global $out register name inside this sub.
  246   my $out=($t0,$t1,$t2,$s[0])[$i];
  247 
          # Column 3 is generated last, so the other three state registers
          # are no longer needed and serve as scratch themselves; this is
          # also why the "mov $s[n],$tmpN" copies are skipped for $i==3.
  248     if ($i==3) {
  249         $tmp0=$s[1];
  250         $tmp1=$s[2];
  251         $tmp2=$s[3];
  252     }
  253     $code.="    movzb   ".&lo($s[0]).",$out\n";
  254     $code.="    mov $s[2],$tmp1\n"      if ($i!=3);
  255     $code.="    lea 16($key),$key\n"    if ($i==0);
  256 
  257     $code.="    movzb   ".&hi($s[1]).",$tmp0\n";
  258     $code.="    mov 0($sbox,$out,8),$out\n";
  259 
  260     $code.="    shr \$16,$tmp1\n";
  261     $code.="    mov $s[3],$tmp2\n"      if ($i!=3);
  262     $code.="    xor 3($sbox,$tmp0,8),$out\n";
  263 
  264     $code.="    movzb   ".&lo($tmp1).",$tmp1\n";
  265     $code.="    shr \$24,$tmp2\n";
  266     $code.="    xor 4*$i($key),$out\n";
  267 
  268     $code.="    xor 2($sbox,$tmp1,8),$out\n";
  269     $code.="    xor 1($sbox,$tmp2,8),$out\n";
  270 
          # After the last column, move the three saved columns back into
          # the state registers.
  271     $code.="    mov $t0,$s[1]\n"        if ($i==3);
  272     $code.="    mov $t1,$s[2]\n"        if ($i==3);
  273     $code.="    mov $t2,$s[3]\n"        if ($i==3);
  274     $code.="\n";
  275 }
  276 
  # enclast($i, @s) -- append the code for output column $i of the final
  # encryption round to $code (horizontal variant, selected when
  # $verticalspin is false).
  #
  #   $i  column index 0..3; selects the destination register
  #   @s  the four state registers, rotated by the caller as for encstep()
  #
  # Each of the four table loads is reduced to a single byte lane with an
  # AND mask (0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000) before being
  # XORed into the destination.  No round key is applied and $key is not
  # advanced here; the caller emits the key XOR after the fourth call.
  # Register usage follows encstep(): $t0..$t2 hold columns 0..2, column 3
  # is built in $s[0] with $s[1..3] as scratch, then $t0..$t2 are copied
  # back into $s[1..3].
  277 sub enclast()
  278 { my ($i,@s)=@_;
  279   my $tmp0=$acc0;
  280   my $tmp1=$acc1;
  281   my $tmp2=$acc2;
        # Lexical $out shadows the global $out register name inside this sub.
  282   my $out=($t0,$t1,$t2,$s[0])[$i];
  283 
          # Last column: the remaining state registers double as scratch.
  284     if ($i==3) {
  285         $tmp0=$s[1];
  286         $tmp1=$s[2];
  287         $tmp2=$s[3];
  288     }
  289     $code.="    movzb   ".&lo($s[0]).",$out\n";
  290     $code.="    mov $s[2],$tmp1\n"      if ($i!=3);
  291 
  292     $code.="    mov 2($sbox,$out,8),$out\n";
  293     $code.="    shr \$16,$tmp1\n";
  294     $code.="    mov $s[3],$tmp2\n"      if ($i!=3);
  295 
  296     $code.="    and \$0x000000ff,$out\n";
  297     $code.="    movzb   ".&hi($s[1]).",$tmp0\n";
  298     $code.="    movzb   ".&lo($tmp1).",$tmp1\n";
  299     $code.="    shr \$24,$tmp2\n";
  300 
  301     $code.="    mov 0($sbox,$tmp0,8),$tmp0\n";
  302     $code.="    mov 0($sbox,$tmp1,8),$tmp1\n";
  303     $code.="    mov 2($sbox,$tmp2,8),$tmp2\n";
  304 
  305     $code.="    and \$0x0000ff00,$tmp0\n";
  306     $code.="    and \$0x00ff0000,$tmp1\n";
  307     $code.="    and \$0xff000000,$tmp2\n";
  308 
          # The copy-back of $t0..$t2 is interleaved with the final XORs.
  309     $code.="    xor $tmp0,$out\n";
  310     $code.="    mov $t0,$s[1]\n"        if ($i==3);
  311     $code.="    xor $tmp1,$out\n";
  312     $code.="    mov $t1,$s[2]\n"        if ($i==3);
  313     $code.="    xor $tmp2,$out\n";
  314     $code.="    mov $t2,$s[3]\n"        if ($i==3);
  315     $code.="\n";
  316 }
  317 
  # Emit _x86_64_AES_encrypt, the table-driven block routine.
  # The generated function XORs the state in $s0..$s3 with the round key at
  # 0..12($key), loads the round count from 240($key), decrements it once
  # and runs .Lenc_loop until the counter reaches zero; the final round
  # follows the loop.
  318 $code.=<<___;
  319 .type   _x86_64_AES_encrypt,\@abi-omnipotent
  320 .align  16
  321 _x86_64_AES_encrypt:
  322     xor 0($key),$s0         # xor with key
  323     xor 4($key),$s1
  324     xor 8($key),$s2
  325     xor 12($key),$s3
  326 
  327     mov 240($key),$rnds         # load key->rounds
  328     sub \$1,$rnds
  329     jmp .Lenc_loop
  330 .align  16
  331 .Lenc_loop:
  332 ___
          # Middle round body: the interleaved generator, or four per-column
          # steps with the state registers rotated one position per call.
  333     if ($verticalspin) { &encvert(); }
  334     else {  &encstep(0,$s0,$s1,$s2,$s3);
  335         &encstep(1,$s1,$s2,$s3,$s0);
  336         &encstep(2,$s2,$s3,$s0,$s1);
  337         &encstep(3,$s3,$s0,$s1,$s2);
  338     }
  339 $code.=<<___;
  340     sub \$1,$rnds
  341     jnz .Lenc_loop
  342 ___
          # Final round.  enclastvert() applies the last round key itself;
          # enclast() does not, so the horizontal path adds the XOR with
          # 16+0..16+12($key) explicitly.
  343     if ($verticalspin) { &enclastvert(); }
  344     else {  &enclast(0,$s0,$s1,$s2,$s3);
  345         &enclast(1,$s1,$s2,$s3,$s0);
  346         &enclast(2,$s2,$s3,$s0,$s1);
  347         &enclast(3,$s3,$s0,$s1,$s2);
  348         $code.=<<___;
  349         xor 16+0($key),$s0      # xor with key
  350         xor 16+4($key),$s1
  351         xor 16+8($key),$s2
  352         xor 16+12($key),$s3
  353 ___
  354     }
  355 $code.=<<___;
  356     .byte   0xf3,0xc3           # rep ret
  357 .size   _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
  358 ___
  359 
  360 # it's possible to implement this by shifting tN by 8, filling least
  361 # significant byte with byte load and finally bswap-ing at the end,
  362 # but such partial register load kills Core 2...
  # enccompactvert() -- append one round's byte substitution and byte
  # rearrangement to $code, using byte-wide table lookups only.
  # Takes no arguments.
  #
  # Each of the 16 state bytes is looked up with movzb ($sbox,idx,1); the
  # results are shifted left by 0, 8, 16 or 24 bits and XORed together so
  # that new word n is built from byte 0 of s[n], byte 1 of s[n+1], byte 2
  # of s[n+2] and byte 3 of s[n+3] (indices mod 4).  The result is left in
  # $s0..$s3.  No round key is applied and no column mixing is done here.
  #
  # Besides $t0..$t2 and $acc0..$acc2 the code uses %r8d, %r9d and %r13d as
  # temporaries, i.e. the registers otherwise named $inp, $out and $rnds.
  363 sub enccompactvert()
  364 { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
  365 
  366 $code.=<<___;
  367     movzb   `&lo("$s0")`,$t0
  368     movzb   `&lo("$s1")`,$t1
  369     movzb   `&lo("$s2")`,$t2
  370     movzb   `&lo("$s3")`,$t3
  371     movzb   `&hi("$s1")`,$acc0
  372     movzb   `&hi("$s2")`,$acc1
  373     shr \$16,$s2
  374     movzb   `&hi("$s3")`,$acc2
  375     movzb   ($sbox,$t0,1),$t0
  376     movzb   ($sbox,$t1,1),$t1
  377     movzb   ($sbox,$t2,1),$t2
  378     movzb   ($sbox,$t3,1),$t3
  379 
  380     movzb   ($sbox,$acc0,1),$t4 #$t0
  381     movzb   `&hi("$s0")`,$acc0
  382     movzb   ($sbox,$acc1,1),$t5 #$t1
  383     movzb   `&lo("$s2")`,$acc1
  384     movzb   ($sbox,$acc2,1),$acc2   #$t2
  385     movzb   ($sbox,$acc0,1),$acc0   #$t3
  386 
  387     shl \$8,$t4
  388     shr \$16,$s3
  389     shl \$8,$t5
  390     xor $t4,$t0
  391     shr \$16,$s0
  392     movzb   `&lo("$s3")`,$t4
  393     shr \$16,$s1
  394     xor $t5,$t1
  395     shl \$8,$acc2
  396     movzb   `&lo("$s0")`,$t5
  397     movzb   ($sbox,$acc1,1),$acc1   #$t0
  398     xor $acc2,$t2
  399 
  400     shl \$8,$acc0
  401     movzb   `&lo("$s1")`,$acc2
  402     shl \$16,$acc1
  403     xor $acc0,$t3
  404     movzb   ($sbox,$t4,1),$t4   #$t1
  405     movzb   `&hi("$s3")`,$acc0
  406     movzb   ($sbox,$t5,1),$t5   #$t2
  407     xor $acc1,$t0
  408 
  409     shr \$8,$s2
  410     movzb   `&hi("$s0")`,$acc1
  411     shl \$16,$t4
  412     shr \$8,$s1
  413     shl \$16,$t5
  414     xor $t4,$t1
  415     movzb   ($sbox,$acc2,1),$acc2   #$t3
  416     movzb   ($sbox,$acc0,1),$acc0   #$t0
  417     movzb   ($sbox,$acc1,1),$acc1   #$t1
  418     movzb   ($sbox,$s2,1),$s3   #$t3
  419     movzb   ($sbox,$s1,1),$s2   #$t2
  420 
  421     shl \$16,$acc2
  422     xor $t5,$t2
  423     shl \$24,$acc0
  424     xor $acc2,$t3
  425     shl \$24,$acc1
  426     xor $acc0,$t0
  427     shl \$24,$s3
  428     xor $acc1,$t1
  429     shl \$24,$s2
  430     mov $t0,$s0
  431     mov $t1,$s1
  432     xor $t2,$s2
  433     xor $t3,$s3
  434 ___
  435 }
  436 
  # enctransform_ref($sn) -- append the column-mixing transform for the
  # single 32-bit state register $sn to $code.
  #
  #   $sn  name of the register to transform in place
  #
  # The emitted code first forms $r2 from $sn: the top bit of every byte is
  # isolated with 0x80808080 and turned into a 0x1b byte (shift right by 7,
  # subtract, AND with 0x1b1b1b1b), and that is XORed into ($sn+$sn) masked
  # with 0xfefefefe.  $sn is then combined by XOR with $r2 and with copies
  # of its original value rotated by 16 and 24 bits, with one rol 24 of the
  # intermediate in between.  Scratch registers: %r8d, %r9d, %r13d.
  #
  # NOTE(review): no call to this sub is visible in this part of the file;
  # it looks like the straightforward reference for enctransform() --
  # confirm before removing.
  437 sub enctransform_ref()
  438 { my $sn = shift;
  439   my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
  440 
  441 $code.=<<___;
  442     mov $sn,$acc
  443     and \$0x80808080,$acc
  444     mov $acc,$tmp
  445     shr \$7,$tmp
  446     lea ($sn,$sn),$r2
  447     sub $tmp,$acc
  448     and \$0xfefefefe,$r2
  449     and \$0x1b1b1b1b,$acc
  450     mov $sn,$tmp
  451     xor $acc,$r2
  452 
  453     xor $r2,$sn
  454     rol \$24,$sn
  455     xor $r2,$sn
  456     ror \$16,$tmp
  457     xor $tmp,$sn
  458     ror \$8,$tmp
  459     xor $tmp,$sn
  460 ___
  461 }
  462 
  463 # unlike decrypt case it does not pay off to parallelize enctransform
  # enctransform() -- append the column-mixing transform for all four state
  # words $s0..$s3 to $code.  Takes no arguments.
  #
  # The per-word arithmetic is the same as in enctransform_ref(), but it is
  # hand-scheduled: $s0 and $s1 are processed as a pair, and the first
  # instructions for the $s2/$s3 pair (the lines indented by one extra
  # space in the template) are interleaved with the tail of the first pair.
  # $t0..$t2, $acc0..$acc2 ($acc2 doubles as $t3), %r8d and %r9d are used as
  # scratch.  The four loads from 0, 64, 128 and 192($sbox) near the end are
  # marked "prefetch Te4"; their results are not used by this code.
  464 sub enctransform()
  465 { my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
  466 
  467 $code.=<<___;
  468     mov \$0x80808080,$t0
  469     mov \$0x80808080,$t1
  470     and $s0,$t0
  471     and $s1,$t1
  472     mov $t0,$acc0
  473     mov $t1,$acc1
  474     shr \$7,$t0
  475     lea ($s0,$s0),$r20
  476     shr \$7,$t1
  477     lea ($s1,$s1),$r21
  478     sub $t0,$acc0
  479     sub $t1,$acc1
  480     and \$0xfefefefe,$r20
  481     and \$0xfefefefe,$r21
  482     and \$0x1b1b1b1b,$acc0
  483     and \$0x1b1b1b1b,$acc1
  484     mov $s0,$t0
  485     mov $s1,$t1
  486     xor $acc0,$r20
  487     xor $acc1,$r21
  488 
  489     xor $r20,$s0
  490     xor $r21,$s1
  491      mov    \$0x80808080,$t2
  492     rol \$24,$s0
  493      mov    \$0x80808080,$t3
  494     rol \$24,$s1
  495      and    $s2,$t2
  496      and    $s3,$t3
  497     xor $r20,$s0
  498     xor $r21,$s1
  499      mov    $t2,$acc0
  500     ror \$16,$t0
  501      mov    $t3,$acc1
  502     ror \$16,$t1
  503      lea    ($s2,$s2),$r20
  504      shr    \$7,$t2
  505     xor $t0,$s0
  506      shr    \$7,$t3
  507     xor $t1,$s1
  508     ror \$8,$t0
  509      lea    ($s3,$s3),$r21
  510     ror \$8,$t1
  511      sub    $t2,$acc0
  512      sub    $t3,$acc1
  513     xor $t0,$s0
  514     xor $t1,$s1
  515 
  516     and \$0xfefefefe,$r20
  517     and \$0xfefefefe,$r21
  518     and \$0x1b1b1b1b,$acc0
  519     and \$0x1b1b1b1b,$acc1
  520     mov $s2,$t2
  521     mov $s3,$t3
  522     xor $acc0,$r20
  523     xor $acc1,$r21
  524 
  525     ror \$16,$t2
  526     xor $r20,$s2
  527     ror \$16,$t3
  528     xor $r21,$s3
  529     rol \$24,$s2
  530     mov 0($sbox),$acc0          # prefetch Te4
  531     rol \$24,$s3
  532     xor $r20,$s2
  533     mov 64($sbox),$acc1
  534     xor $r21,$s3
  535     mov 128($sbox),$r20
  536     xor $t2,$s2
  537     ror \$8,$t2
  538     xor $t3,$s3
  539     ror \$8,$t3
  540     xor $t2,$s2
  541     mov 192($sbox),$r21
  542     xor $t3,$s3
  543 ___
  544 }
  545 
  # Emit _x86_64_AES_encrypt_compact, the variant built on byte-wide table
  # lookups.  The generated function first reads the table at $sbox at
  # 32-byte intervals (marked "prefetch Te4"; this overwrites $inp), then
  # loops: XOR the round key at 0..12($key) into $s0..$s3, advance $key by
  # 16, run the enccompactvert() code, and leave the loop when $key equals
  # the pointer stored at 16(%rsp); otherwise run the enctransform() code
  # and repeat.  The exit path XORs in the final round key.
  #
  # The 16(%rsp) slot is the one AES_encrypt fills as 8(%rsp) ("end of key
  # schedule") before its call instruction pushes the return address.
  546 $code.=<<___;
  547 .type   _x86_64_AES_encrypt_compact,\@abi-omnipotent
  548 .align  16
  549 _x86_64_AES_encrypt_compact:
  550     lea 128($sbox),$inp         # size optimization
  551     mov 0-128($inp),$acc1       # prefetch Te4
  552     mov 32-128($inp),$acc2
  553     mov 64-128($inp),$t0
  554     mov 96-128($inp),$t1
  555     mov 128-128($inp),$acc1
  556     mov 160-128($inp),$acc2
  557     mov 192-128($inp),$t0
  558     mov 224-128($inp),$t1
  559     jmp .Lenc_loop_compact
  560 .align  16
  561 .Lenc_loop_compact:
  562         xor 0($key),$s0     # xor with key
  563         xor 4($key),$s1
  564         xor 8($key),$s2
  565         xor 12($key),$s3
  566         lea 16($key),$key
  567 ___
              # Byte substitution and rearrangement for this round.
  568         &enccompactvert();
  569 $code.=<<___;
  570         cmp 16(%rsp),$key
  571         je  .Lenc_compact_done
  572 ___
              # Column mixing; skipped for the last round by the je above.
  573         &enctransform();
  574 $code.=<<___;
  575     jmp .Lenc_loop_compact
  576 .align  16
  577 .Lenc_compact_done:
  578     xor 0($key),$s0
  579     xor 4($key),$s1
  580     xor 8($key),$s2
  581     xor 12($key),$s3
  582     .byte   0xf3,0xc3           # rep ret
  583 .size   _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
  584 ___
  585 
  586 # void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
  # Emit the public entry point AES_encrypt together with the hidden alias
  # asm_AES_encrypt.  In the generated code the input pointer is read from
  # %rdi, the output pointer from %rsi and the key pointer from %rdx.
  #
  # The generated function
  #   - pushes %rbx, %rbp and %r12..%r15, aligns %rsp down to 64 bytes and
  #     moves the frame by a multiple of 64 bytes (at most 0x3c0) computed
  #     from the distance to the key address, then reserves 32 bytes;
  #   - uses the frame as: 0(%rsp) key pointer, 8(%rsp) end of key schedule
  #     (key + 16*rounds), 16(%rsp) out pointer, 24(%rsp) original %rsp;
  #   - loads the 16 input bytes into $s0..$s3;
  #   - points $sbox at .LAES_Te+2048 plus 0, 0x100, 0x200 or 0x300,
  #     selected from the frame address, and calls
  #     _x86_64_AES_encrypt_compact;
  #   - stores $s0..$s3 to the output buffer and restores the saved
  #     registers and stack pointer.
  587 $code.=<<___;
  588 .globl  AES_encrypt
  589 .type   AES_encrypt,\@function,3
  590 .align  16
  591 .globl  asm_AES_encrypt
  592 .hidden asm_AES_encrypt
  593 asm_AES_encrypt:
  594 AES_encrypt:
  595     push    %rbx
  596     push    %rbp
  597     push    %r12
  598     push    %r13
  599     push    %r14
  600     push    %r15
  601 
  602     # allocate frame "above" key schedule
  603     mov %rsp,%r10
  604     lea -63(%rdx),%rcx  # %rdx is key argument
  605     and \$-64,%rsp
  606     sub %rsp,%rcx
  607     neg %rcx
  608     and \$0x3c0,%rcx
  609     sub %rcx,%rsp
  610     sub \$32,%rsp
  611 
  612     mov %rsi,16(%rsp)   # save out
  613     mov %r10,24(%rsp)   # save real stack pointer
  614 .Lenc_prologue:
  615 
  616     mov %rdx,$key
  617     mov 240($key),$rnds # load rounds
  618 
  619     mov 0(%rdi),$s0 # load input vector
  620     mov 4(%rdi),$s1
  621     mov 8(%rdi),$s2
  622     mov 12(%rdi),$s3
  623 
  624     shl \$4,$rnds
  625     lea ($key,$rnds),%rbp
  626     mov $key,(%rsp) # key schedule
  627     mov %rbp,8(%rsp)    # end of key schedule
  628 
  629     # pick Te4 copy which can't "overlap" with stack frame or key schedule
  630     lea .LAES_Te+2048(%rip),$sbox
  631     lea 768(%rsp),%rbp
  632     sub $sbox,%rbp
  633     and \$0x300,%rbp
  634     lea ($sbox,%rbp),$sbox
  635 
  636     call    _x86_64_AES_encrypt_compact
  637 
  638     mov 16(%rsp),$out   # restore out
  639     mov 24(%rsp),%rsi   # restore saved stack pointer
  640     mov $s0,0($out) # write output vector
  641     mov $s1,4($out)
  642     mov $s2,8($out)
  643     mov $s3,12($out)
  644 
  645     mov (%rsi),%r15
  646     mov 8(%rsi),%r14
  647     mov 16(%rsi),%r13
  648     mov 24(%rsi),%r12
  649     mov 32(%rsi),%rbp
  650     mov 40(%rsi),%rbx
  651     lea 48(%rsi),%rsp
  652 .Lenc_epilogue:
  653     ret
  654 .size   AES_encrypt,.-AES_encrypt
  655 ___
  656 
  657 #------------------------------------------------------------------#
  658 
  # decvert() -- append the code for one complete middle decryption round
  # to $code (vertical scheduling).  Takes no arguments.
  #
  # Structure and register usage mirror encvert(): every state byte indexes
  # the table at $sbox with an 8-byte stride and the 32-bit loads at byte
  # offsets 0, 3, 2 and 1 are XORed into $t0..$t3.  The byte selection runs
  # in the opposite direction: accumulator n takes byte 0 of s[n], byte 1 of
  # s[n-1], byte 2 of s[n-2] and byte 3 of s[n-3] (indices mod 4).  The code
  # advances $key by 16, loads 0..12($key) into $s0..$s3 and XORs the
  # accumulators in.  $t3 is %r8d and therefore overwrites $inp.
  659 sub decvert()
  660 { my $t3="%r8d";    # zaps $inp!
  661 
  662 $code.=<<___;
  663     # favor 3-way issue Opteron pipeline...
  664     movzb   `&lo("$s0")`,$acc0
  665     movzb   `&lo("$s1")`,$acc1
  666     movzb   `&lo("$s2")`,$acc2
  667     mov 0($sbox,$acc0,8),$t0
  668     mov 0($sbox,$acc1,8),$t1
  669     mov 0($sbox,$acc2,8),$t2
  670 
  671     movzb   `&hi("$s3")`,$acc0
  672     movzb   `&hi("$s0")`,$acc1
  673     movzb   `&lo("$s3")`,$acc2
  674     xor 3($sbox,$acc0,8),$t0
  675     xor 3($sbox,$acc1,8),$t1
  676     mov 0($sbox,$acc2,8),$t3
  677 
  678     movzb   `&hi("$s1")`,$acc0
  679     shr \$16,$s0
  680     movzb   `&hi("$s2")`,$acc2
  681     xor 3($sbox,$acc0,8),$t2
  682     shr \$16,$s3
  683     xor 3($sbox,$acc2,8),$t3
  684 
  685     shr \$16,$s1
  686     lea 16($key),$key
  687     shr \$16,$s2
  688 
  689     movzb   `&lo("$s2")`,$acc0
  690     movzb   `&lo("$s3")`,$acc1
  691     movzb   `&lo("$s0")`,$acc2
  692     xor 2($sbox,$acc0,8),$t0
  693     xor 2($sbox,$acc1,8),$t1
  694     xor 2($sbox,$acc2,8),$t2
  695 
  696     movzb   `&hi("$s1")`,$acc0
  697     movzb   `&hi("$s2")`,$acc1
  698     movzb   `&lo("$s1")`,$acc2
  699     xor 1($sbox,$acc0,8),$t0
  700     xor 1($sbox,$acc1,8),$t1
  701     xor 2($sbox,$acc2,8),$t3
  702 
  703     movzb   `&hi("$s3")`,$acc0
  704     mov 12($key),$s3
  705     movzb   `&hi("$s0")`,$acc2
  706     xor 1($sbox,$acc0,8),$t2
  707     mov 0($key),$s0
  708     xor 1($sbox,$acc2,8),$t3
  709 
  710     xor $t0,$s0
  711     mov 4($key),$s1
  712     mov 8($key),$s2
  713     xor $t2,$s2
  714     xor $t1,$s1
  715     xor $t3,$s3
  716 ___
  717 }
  718 
  # declastvert() -- append the code for the final decryption round to
  # $code (vertical scheduling).  Takes no arguments.
  #
  # The emitted code temporarily adds 2048 to $sbox and undoes that before
  # it finishes, so that all 16 lookups can use the short form
  # movzb ($sbox,idx,1).  The looked-up bytes are shifted left by 0, 8, 16
  # or 24 bits and XORed into $t0..$t3 with the same byte selection as
  # decvert().  $key is not advanced; the round key is read from
  # 16+0..16+12($key) into $s0..$s3 and the accumulators are XORed in.
  # $t3 is %r8d and therefore overwrites $inp.
  719 sub declastvert()
  720 { my $t3="%r8d";    # zaps $inp!
  721 
  722 $code.=<<___;
  723     lea 2048($sbox),$sbox   # size optimization
  724     movzb   `&lo("$s0")`,$acc0
  725     movzb   `&lo("$s1")`,$acc1
  726     movzb   `&lo("$s2")`,$acc2
  727     movzb   ($sbox,$acc0,1),$t0
  728     movzb   ($sbox,$acc1,1),$t1
  729     movzb   ($sbox,$acc2,1),$t2
  730 
  731     movzb   `&lo("$s3")`,$acc0
  732     movzb   `&hi("$s3")`,$acc1
  733     movzb   `&hi("$s0")`,$acc2
  734     movzb   ($sbox,$acc0,1),$t3
  735     movzb   ($sbox,$acc1,1),$acc1   #$t0
  736     movzb   ($sbox,$acc2,1),$acc2   #$t1
  737 
  738     shl \$8,$acc1
  739     shl \$8,$acc2
  740 
  741     xor $acc1,$t0
  742     xor $acc2,$t1
  743     shr \$16,$s3
  744 
  745     movzb   `&hi("$s1")`,$acc0
  746     movzb   `&hi("$s2")`,$acc1
  747     shr \$16,$s0
  748     movzb   ($sbox,$acc0,1),$acc0   #$t2
  749     movzb   ($sbox,$acc1,1),$acc1   #$t3
  750 
  751     shl \$8,$acc0
  752     shl \$8,$acc1
  753     shr \$16,$s1
  754     xor $acc0,$t2
  755     xor $acc1,$t3
  756     shr \$16,$s2
  757 
  758     movzb   `&lo("$s2")`,$acc0
  759     movzb   `&lo("$s3")`,$acc1
  760     movzb   `&lo("$s0")`,$acc2
  761     movzb   ($sbox,$acc0,1),$acc0   #$t0
  762     movzb   ($sbox,$acc1,1),$acc1   #$t1
  763     movzb   ($sbox,$acc2,1),$acc2   #$t2
  764 
  765     shl \$16,$acc0
  766     shl \$16,$acc1
  767     shl \$16,$acc2
  768 
  769     xor $acc0,$t0
  770     xor $acc1,$t1
  771     xor $acc2,$t2
  772 
  773     movzb   `&lo("$s1")`,$acc0
  774     movzb   `&hi("$s1")`,$acc1
  775     movzb   `&hi("$s2")`,$acc2
  776     movzb   ($sbox,$acc0,1),$acc0   #$t3
  777     movzb   ($sbox,$acc1,1),$acc1   #$t0
  778     movzb   ($sbox,$acc2,1),$acc2   #$t1
  779 
  780     shl \$16,$acc0
  781     shl \$24,$acc1
  782     shl \$24,$acc2
  783 
  784     xor $acc0,$t3
  785     xor $acc1,$t0
  786     xor $acc2,$t1
  787 
  788     movzb   `&hi("$s3")`,$acc0
  789     movzb   `&hi("$s0")`,$acc1
  790     mov 16+12($key),$s3
  791     movzb   ($sbox,$acc0,1),$acc0   #$t2
  792     movzb   ($sbox,$acc1,1),$acc1   #$t3
  793     mov 16+0($key),$s0
  794 
  795     shl \$24,$acc0
  796     shl \$24,$acc1
  797 
  798     xor $acc0,$t2
  799     xor $acc1,$t3
  800 
  801     mov 16+4($key),$s1
  802     mov 16+8($key),$s2
  803     lea -2048($sbox),$sbox
  804     xor $t0,$s0
  805     xor $t1,$s1
  806     xor $t2,$s2
  807     xor $t3,$s3
  808 ___
  809 }
  810 
  # decstep($i, @s) -- append the code for output column $i of one middle
  # decryption round to $code (horizontal variant, selected when
  # $verticalspin is false).
  #
  #   $i  column index 0..3; selects the destination register
  #   @s  the four state registers in the order the caller wants them
  #       consumed: $s[0] supplies byte 0, $s[1] byte 1, $s[2] byte 2 and
  #       $s[3] byte 3
  #
  # Columns 0..2 are accumulated in $t0..$t2 with $acc0..$acc2 as scratch;
  # column 3 is computed in place in $s[0] with $s[1..3] as scratch.  Unlike
  # encstep(), no round key is applied and $key is not advanced; the caller
  # emits both after the fourth call.
  811 sub decstep()
  812 { my ($i,@s) = @_;
  813   my $tmp0=$acc0;
  814   my $tmp1=$acc1;
  815   my $tmp2=$acc2;
        # Lexical $out shadows the global $out register name inside this sub.
  816   my $out=($t0,$t1,$t2,$s[0])[$i];
  817 
          # For $i==3 the destination already is $s[0] and the scratch
          # registers are the state registers themselves, so the copying
          # "mov" instructions are only emitted for columns 0..2.
  818     $code.="    mov $s[0],$out\n"       if ($i!=3);
  819             $tmp1=$s[2]         if ($i==3);
  820     $code.="    mov $s[2],$tmp1\n"      if ($i!=3);
  821     $code.="    and \$0xFF,$out\n";
  822 
  823     $code.="    mov 0($sbox,$out,8),$out\n";
  824     $code.="    shr \$16,$tmp1\n";
  825             $tmp2=$s[3]         if ($i==3);
  826     $code.="    mov $s[3],$tmp2\n"      if ($i!=3);
  827 
  828             $tmp0=$s[1]         if ($i==3);
  829     $code.="    movzb   ".&hi($s[1]).",$tmp0\n";
  830     $code.="    and \$0xFF,$tmp1\n";
  831     $code.="    shr \$24,$tmp2\n";
  832 
  833     $code.="    xor 3($sbox,$tmp0,8),$out\n";
  834     $code.="    xor 2($sbox,$tmp1,8),$out\n";
  835     $code.="    xor 1($sbox,$tmp2,8),$out\n";
  836 
          # Copy the saved columns back.  The callers pass the registers in
          # descending order for column 3 ($s3,$s2,$s1,$s0), hence
          # $t2 -> $s[1], $t1 -> $s[2], $t0 -> $s[3].
  837     $code.="    mov $t2,$s[1]\n"        if ($i==3);
  838     $code.="    mov $t1,$s[2]\n"        if ($i==3);
  839     $code.="    mov $t0,$s[3]\n"        if ($i==3);
  840     $code.="\n";
  841 }
  842 
  # declast($i, @s) -- append the code for output column $i of the final
  # decryption round to $code (horizontal variant, selected when
  # $verticalspin is false).
  #
  #   $i  column index 0..3; selects the destination register
  #   @s  the four state registers, ordered by the caller as for decstep()
  #
  # All four lookups are byte loads from the table at 2048($sbox) with a
  # 1-byte stride; the results are shifted left by 0, 8, 16 and 24 bits and
  # XORed into the destination.  No round key is applied here; the caller
  # emits the key XOR after the fourth call.  Register usage and the final
  # copy-back follow decstep().
  843 sub declast()
  844 { my ($i,@s)=@_;
  845   my $tmp0=$acc0;
  846   my $tmp1=$acc1;
  847   my $tmp2=$acc2;
        # Lexical $out shadows the global $out register name inside this sub.
  848   my $out=($t0,$t1,$t2,$s[0])[$i];
  849 
          # As in decstep(): for $i==3 the state registers themselves are
          # the scratch registers, so no copies are emitted.
  850     $code.="    mov $s[0],$out\n"       if ($i!=3);
  851             $tmp1=$s[2]         if ($i==3);
  852     $code.="    mov $s[2],$tmp1\n"      if ($i!=3);
  853     $code.="    and \$0xFF,$out\n";
  854 
  855     $code.="    movzb   2048($sbox,$out,1),$out\n";
  856     $code.="    shr \$16,$tmp1\n";
  857             $tmp2=$s[3]         if ($i==3);
  858     $code.="    mov $s[3],$tmp2\n"      if ($i!=3);
  859 
  860             $tmp0=$s[1]         if ($i==3);
  861     $code.="    movzb   ".&hi($s[1]).",$tmp0\n";
  862     $code.="    and \$0xFF,$tmp1\n";
  863     $code.="    shr \$24,$tmp2\n";
  864 
  865     $code.="    movzb   2048($sbox,$tmp0,1),$tmp0\n";
  866     $code.="    movzb   2048($sbox,$tmp1,1),$tmp1\n";
  867     $code.="    movzb   2048($sbox,$tmp2,1),$tmp2\n";
  868 
  869     $code.="    shl \$8,$tmp0\n";
  870     $code.="    shl \$16,$tmp1\n";
  871     $code.="    shl \$24,$tmp2\n";
  872 
          # The copy-back of $t0..$t2 is interleaved with the final XORs.
  873     $code.="    xor $tmp0,$out\n";
  874     $code.="    mov $t2,$s[1]\n"        if ($i==3);
  875     $code.="    xor $tmp1,$out\n";
  876     $code.="    mov $t1,$s[2]\n"        if ($i==3);
  877     $code.="    xor $tmp2,$out\n";
  878     $code.="    mov $t0,$s[3]\n"        if ($i==3);
  879     $code.="\n";
  880 }
  881 
  # Emit _x86_64_AES_decrypt, the table-driven block routine for
  # decryption.  The generated function XORs the state in $s0..$s3 with the
  # round key at 0..12($key), loads the round count from 240($key),
  # decrements it once and runs .Ldec_loop until the counter reaches zero;
  # the final round follows the loop.
  882 $code.=<<___;
  883 .type   _x86_64_AES_decrypt,\@abi-omnipotent
  884 .align  16
  885 _x86_64_AES_decrypt:
  886     xor 0($key),$s0         # xor with key
  887     xor 4($key),$s1
  888     xor 8($key),$s2
  889     xor 12($key),$s3
  890 
  891     mov 240($key),$rnds         # load key->rounds
  892     sub \$1,$rnds
  893     jmp .Ldec_loop
  894 .align  16
  895 .Ldec_loop:
  896 ___
          # Middle round body.  decvert() advances $key and applies the
          # round key itself; decstep() does neither, so the horizontal path
          # emits the key advance and XOR after its four steps.
  897     if ($verticalspin) { &decvert(); }
  898     else {  &decstep(0,$s0,$s3,$s2,$s1);
  899         &decstep(1,$s1,$s0,$s3,$s2);
  900         &decstep(2,$s2,$s1,$s0,$s3);
  901         &decstep(3,$s3,$s2,$s1,$s0);
  902         $code.=<<___;
  903         lea 16($key),$key
  904         xor 0($key),$s0         # xor with key
  905         xor 4($key),$s1
  906         xor 8($key),$s2
  907         xor 12($key),$s3
  908 ___
  909     }
  910 $code.=<<___;
  911     sub \$1,$rnds
  912     jnz .Ldec_loop
  913 ___
          # Final round; again only the horizontal path needs the explicit
          # XOR with the last round key at 16+0..16+12($key).
  914     if ($verticalspin) { &declastvert(); }
  915     else {  &declast(0,$s0,$s3,$s2,$s1);
  916         &declast(1,$s1,$s0,$s3,$s2);
  917         &declast(2,$s2,$s1,$s0,$s3);
  918         &declast(3,$s3,$s2,$s1,$s0);
  919         $code.=<<___;
  920         xor 16+0($key),$s0          # xor with key
  921         xor 16+4($key),$s1
  922         xor 16+8($key),$s2
  923         xor 16+12($key),$s3
  924 ___
  925     }
  926 $code.=<<___;
  927     .byte   0xf3,0xc3           # rep ret
  928 .size   _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
  929 ___
  930 
  # deccompactvert() -- append one decryption round's byte substitution and
  # byte rearrangement to $code, using byte-wide table lookups only.
  # Takes no arguments.
  #
  # Each of the 16 state bytes is looked up with movzb ($sbox,idx,1); the
  # results are shifted left by 0, 8, 16 or 24 bits and XORed together so
  # that new word n is built from byte 0 of s[n], byte 1 of s[n-1], byte 2
  # of s[n-2] and byte 3 of s[n-3] (indices mod 4).  The result is left in
  # $s0..$s3.  No round key is applied here.
  #
  # Besides $t0..$t2 and $acc0..$acc2 the code uses %r8d, %r9d and %r13d as
  # temporaries, i.e. the registers otherwise named $inp, $out and $rnds.
  931 sub deccompactvert()
  932 { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
  933 
  934 $code.=<<___;
  935     movzb   `&lo("$s0")`,$t0
  936     movzb   `&lo("$s1")`,$t1
  937     movzb   `&lo("$s2")`,$t2
  938     movzb   `&lo("$s3")`,$t3
  939     movzb   `&hi("$s3")`,$acc0
  940     movzb   `&hi("$s0")`,$acc1
  941     shr \$16,$s3
  942     movzb   `&hi("$s1")`,$acc2
  943     movzb   ($sbox,$t0,1),$t0
  944     movzb   ($sbox,$t1,1),$t1
  945     movzb   ($sbox,$t2,1),$t2
  946     movzb   ($sbox,$t3,1),$t3
  947 
  948     movzb   ($sbox,$acc0,1),$t4 #$t0
  949     movzb   `&hi("$s2")`,$acc0
  950     movzb   ($sbox,$acc1,1),$t5 #$t1
  951     movzb   ($sbox,$acc2,1),$acc2   #$t2
  952     movzb   ($sbox,$acc0,1),$acc0   #$t3
  953 
  954     shr \$16,$s2
  955     shl \$8,$t5
  956     shl \$8,$t4
  957     movzb   `&lo("$s2")`,$acc1
  958     shr \$16,$s0
  959     xor $t4,$t0
  960     shr \$16,$s1
  961     movzb   `&lo("$s3")`,$t4
  962 
  963     shl \$8,$acc2
  964     xor $t5,$t1
  965     shl \$8,$acc0
  966     movzb   `&lo("$s0")`,$t5
  967     movzb   ($sbox,$acc1,1),$acc1   #$t0
  968     xor $acc2,$t2
  969     movzb   `&lo("$s1")`,$acc2
  970 
  971     shl \$16,$acc1
  972     xor $acc0,$t3
  973     movzb   ($sbox,$t4,1),$t4   #$t1
  974     movzb   `&hi("$s1")`,$acc0
  975     movzb   ($sbox,$acc2,1),$acc2   #$t3
  976     xor $acc1,$t0
  977     movzb   ($sbox,$t5,1),$t5   #$t2
  978     movzb   `&hi("$s2")`,$acc1
  979 
  980     shl \$16,$acc2
  981     shl \$16,$t4
  982     shl \$16,$t5
  983     xor $acc2,$t3
  984     movzb   `&hi("$s3")`,$acc2
  985     xor $t4,$t1
  986     shr \$8,$s0
  987     xor $t5,$t2
  988 
  989     movzb   ($sbox,$acc0,1),$acc0   #$t0
  990     movzb   ($sbox,$acc1,1),$s1 #$t1
  991     movzb   ($sbox,$acc2,1),$s2 #$t2
  992     movzb   ($sbox,$s0,1),$s3   #$t3
  993 
  994     mov $t0,$s0
  995     shl \$24,$acc0
  996     shl \$24,$s1
  997     shl \$24,$s2
  998     xor $acc0,$s0
  999     shl \$24,$s3
 1000     xor $t1,$s1
 1001     xor $t2,$s2
 1002     xor $t3,$s3
 1003 ___
 1004 }
 1005 
 1006 # parallelized version! input is pair of 64-bit values: %rax=s1.s0
 1007 # and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
 1008 # %ecx=s2 and %edx=s3.
      #
      # Appends the transform's instruction sequence to $code; nothing is
      # returned.  The two my-lists below name the registers of two
      # identical dependency chains (…0 for %rax, …8 for %rcx) that are
      # interleaved instruction by instruction.
      #
      # Argument: $prefetch (optional).  When true, five extra loads from
      # 0/64/128/192/256($sbox) are woven into the last rotate/xor stage.
      # They overwrite $mask80, $maskfe, $mask1b and the tp80/tp88
      # temporaries, so a caller that passes a true value has to reload the
      # three masks before the next expansion.
      #
      # Each of the three "doubling" stages computes, per 64-bit lane,
      #   ((x+x) & maskfe) ^ (((x & mask80) - ((x & mask80)>>7)) & mask1b)
      # giving tp2 from tp1, tp4 from tp2 and tp8 from tp4.
      # NOTE(review): presumably a packed GF(2^8) multiply-by-2 with masks
      # 0x80.., 0xfe.., 0x1b.. replicated per byte -- the mask values are
      # loaded by the callers and are not visible in this sub; confirm.
      #
      # NOTE(review): the empty prototype "()" does not match a sub that
      # reads an argument with shift; the calls visible in this file use the
      # &name(...) form, which bypasses prototype checking.
 1009 sub dectransform()
 1010 { my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
 1011   my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
 1012   my $prefetch = shift;
      # $prefetch is interpolated into the backquoted `"mov ..." if (...)`
      # fragments at the end of the heredoc below; those fragments are not
      # evaluated here (the evaluation step is outside this sub).
 1013 
 1014 $code.=<<___;
 1015     mov $mask80,$tp40
 1016     mov $mask80,$tp48
 1017     and $tp10,$tp40
 1018     and $tp18,$tp48
 1019     mov $tp40,$acc0
 1020     mov $tp48,$acc8
 1021     shr \$7,$tp40
 1022     lea ($tp10,$tp10),$tp20
 1023     shr \$7,$tp48
 1024     lea ($tp18,$tp18),$tp28
 1025     sub $tp40,$acc0
 1026     sub $tp48,$acc8
 1027     and $maskfe,$tp20
 1028     and $maskfe,$tp28
 1029     and $mask1b,$acc0
 1030     and $mask1b,$acc8
 1031     xor $acc0,$tp20
 1032     xor $acc8,$tp28
 1033     mov $mask80,$tp80
 1034     mov $mask80,$tp88
 1035 
 1036     and $tp20,$tp80
 1037     and $tp28,$tp88
 1038     mov $tp80,$acc0
 1039     mov $tp88,$acc8
 1040     shr \$7,$tp80
 1041     lea ($tp20,$tp20),$tp40
 1042     shr \$7,$tp88
 1043     lea ($tp28,$tp28),$tp48
 1044     sub $tp80,$acc0
 1045     sub $tp88,$acc8
 1046     and $maskfe,$tp40
 1047     and $maskfe,$tp48
 1048     and $mask1b,$acc0
 1049     and $mask1b,$acc8
 1050     xor $acc0,$tp40
 1051     xor $acc8,$tp48
 1052     mov $mask80,$tp80
 1053     mov $mask80,$tp88
 1054 
 1055     and $tp40,$tp80
 1056     and $tp48,$tp88
 1057     mov $tp80,$acc0
 1058     mov $tp88,$acc8
 1059     shr \$7,$tp80
 1060      xor    $tp10,$tp20     # tp2^=tp1
 1061     shr \$7,$tp88
 1062      xor    $tp18,$tp28     # tp2^=tp1
 1063     sub $tp80,$acc0
 1064     sub $tp88,$acc8
 1065     lea ($tp40,$tp40),$tp80
 1066     lea ($tp48,$tp48),$tp88
 1067      xor    $tp10,$tp40     # tp4^=tp1
 1068      xor    $tp18,$tp48     # tp4^=tp1
 1069     and $maskfe,$tp80
 1070     and $maskfe,$tp88
 1071     and $mask1b,$acc0
 1072     and $mask1b,$acc8
 1073     xor $acc0,$tp80
 1074     xor $acc8,$tp88
 1075 
 1076     xor $tp80,$tp10     # tp1^=tp8
 1077     xor $tp88,$tp18     # tp1^=tp8
 1078     xor $tp80,$tp20     # tp2^tp1^=tp8
 1079     xor $tp88,$tp28     # tp2^tp1^=tp8
 1080     mov $tp10,$acc0
 1081     mov $tp18,$acc8
 1082     xor $tp80,$tp40     # tp4^tp1^=tp8
 1083     shr \$32,$acc0
 1084     xor $tp88,$tp48     # tp4^tp1^=tp8
 1085     shr \$32,$acc8
 1086     xor $tp20,$tp80     # tp8^=tp8^tp2^tp1=tp2^tp1
 1087     rol \$8,`&LO("$tp10")`  # ROTATE(tp1^tp8,8)
 1088     xor $tp28,$tp88     # tp8^=tp8^tp2^tp1=tp2^tp1
 1089     rol \$8,`&LO("$tp18")`  # ROTATE(tp1^tp8,8)
 1090     xor $tp40,$tp80     # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
 1091     rol \$8,`&LO("$acc0")`  # ROTATE(tp1^tp8,8)
 1092     xor $tp48,$tp88     # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
 1093 
 1094     rol \$8,`&LO("$acc8")`  # ROTATE(tp1^tp8,8)
 1095     xor `&LO("$tp80")`,`&LO("$tp10")`
 1096     shr \$32,$tp80
 1097     xor `&LO("$tp88")`,`&LO("$tp18")`
 1098     shr \$32,$tp88
 1099     xor `&LO("$tp80")`,`&LO("$acc0")`
 1100     xor `&LO("$tp88")`,`&LO("$acc8")`
 1101 
 1102     mov $tp20,$tp80
 1103     rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
 1104     mov $tp28,$tp88
 1105     rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
 1106     shr \$32,$tp80
 1107     xor `&LO("$tp20")`,`&LO("$tp10")`
 1108     shr \$32,$tp88
 1109     xor `&LO("$tp28")`,`&LO("$tp18")`
 1110     rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
 1111     mov $tp40,$tp20
 1112     rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
 1113     mov $tp48,$tp28
 1114     shr \$32,$tp20
 1115     xor `&LO("$tp80")`,`&LO("$acc0")`
 1116     shr \$32,$tp28
 1117     xor `&LO("$tp88")`,`&LO("$acc8")`
 1118 
 1119     `"mov   0($sbox),$mask80"   if ($prefetch)`
 1120     rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
 1121     `"mov   64($sbox),$maskfe"  if ($prefetch)`
 1122     rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
 1123     `"mov   128($sbox),$mask1b" if ($prefetch)`
 1124     rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
 1125     `"mov   192($sbox),$tp80"   if ($prefetch)`
 1126     xor `&LO("$tp40")`,`&LO("$tp10")`
 1127     rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
 1128     xor `&LO("$tp48")`,`&LO("$tp18")`
 1129     `"mov   256($sbox),$tp88"   if ($prefetch)`
 1130     xor `&LO("$tp20")`,`&LO("$acc0")`
 1131     xor `&LO("$tp28")`,`&LO("$acc8")`
 1132 ___
 1133 }
 1134 
 1135 $code.=<<___;
 1136 .type   _x86_64_AES_decrypt_compact,\@abi-omnipotent
 1137 .align  16
 1138 _x86_64_AES_decrypt_compact:
 1139     lea 128($sbox),$inp         # size optimization
 1140     mov 0-128($inp),$acc1       # prefetch Td4
 1141     mov 32-128($inp),$acc2
 1142     mov 64-128($inp),$t0
 1143     mov 96-128($inp),$t1
 1144     mov 128-128($inp),$acc1
 1145     mov 160-128($inp),$acc2
 1146     mov 192-128($inp),$t0
 1147     mov 224-128($inp),$t1
 1148     jmp .Ldec_loop_compact
 1149 
 1150 .align  16
 1151 .Ldec_loop_compact:
 1152         xor 0($key),$s0     # xor with key
 1153         xor 4($key),$s1
 1154         xor 8($key),$s2
 1155         xor 12($key),$s3
 1156         lea 16($key),$key
 1157 ___
      # Emitted so far: the entry of the internal routine
      # _x86_64_AES_decrypt_compact (eight loads at 32-byte steps covering
      # 256 bytes at $sbox, with $inp used only as a biased base pointer) and
      # the top of the round loop, which xors the state words $s0..$s3 with
      # 16 bytes at $key and advances $key by 16.
      # &deccompactvert (defined earlier in the file) appends the per-round
      # body that follows the key xor.
 1158         &deccompactvert();
 1159 $code.=<<___;
 1160         cmp 16(%rsp),$key
 1161         je  .Ldec_compact_done
 1162 
 1163         mov 256+0($sbox),$mask80
 1164         shl \$32,%rbx
 1165         shl \$32,%rdx
 1166         mov 256+8($sbox),$maskfe
 1167         or  %rbx,%rax
 1168         or  %rdx,%rcx
 1169         mov 256+16($sbox),$mask1b
 1170 ___
      # The loop ends when $key equals the pointer at 16(%rsp); callers store
      # the end-of-key-schedule pointer at 8(%rsp) before the call, and the
      # pushed return address accounts for the extra 8.
      # Otherwise %rbx/%rdx are packed into the upper halves of %rax/%rcx
      # (the s1.s0 / s3.s2 input layout dectransform expects) and the three
      # masks are reloaded from 256+0/8/16($sbox).  The reload is needed on
      # every pass because dectransform(1) enables the prefetch loads that
      # overwrite the mask registers.
 1171         &dectransform(1);
 1172 $code.=<<___;
 1173     jmp .Ldec_loop_compact
 1174 .align  16
 1175 .Ldec_compact_done:
 1176     xor 0($key),$s0
 1177     xor 4($key),$s1
 1178     xor 8($key),$s2
 1179     xor 12($key),$s3
 1180     .byte   0xf3,0xc3           # rep ret
 1181 .size   _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
 1182 ___
 1183 
 1184 # void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
      #
      # Emits the public single-block entry point AES_decrypt, also exported
      # under the hidden alias asm_AES_decrypt.  Arguments arrive as
      # %rdi=inp, %rsi=out, %rdx=key.
      #
      # Sequence of the emitted code:
      #  - push %rbx, %rbp, %r12-%r15 and remember that %rsp in %r10;
      #  - align %rsp down to 64 bytes, then lower it further by a multiple
      #    of 64 (mask 0x3c0) derived from the key address, plus 32 bytes of
      #    frame (the in-line comment calls this allocating the frame
      #    "above" the key schedule);
      #  - frame slots: 0(%rsp)=key, 8(%rsp)=key+16*rounds (rounds is read
      #    from 240(key)), 16(%rsp)=out, 24(%rsp)=original %rsp;
      #  - choose one of four table copies after .LAES_Td+2048: the selector
      #    is ((%rsp+768-table) & 0x300), added once as-is and once shifted
      #    right by 3, i.e. copies are 0x100+0x20 bytes apart;
      #  - call _x86_64_AES_decrypt_compact, store the four state words to
      #    out, reload the six saved registers from the original stack
      #    position and return.
      # The labels .Ldec_prologue/.Ldec_epilogue are not referenced in this
      # part of the file.
 1185 $code.=<<___;
 1186 .globl  AES_decrypt
 1187 .type   AES_decrypt,\@function,3
 1188 .align  16
 1189 .globl  asm_AES_decrypt
 1190 .hidden asm_AES_decrypt
 1191 asm_AES_decrypt:
 1192 AES_decrypt:
 1193     push    %rbx
 1194     push    %rbp
 1195     push    %r12
 1196     push    %r13
 1197     push    %r14
 1198     push    %r15
 1199 
 1200     # allocate frame "above" key schedule
 1201     mov %rsp,%r10
 1202     lea -63(%rdx),%rcx  # %rdx is key argument
 1203     and \$-64,%rsp
 1204     sub %rsp,%rcx
 1205     neg %rcx
 1206     and \$0x3c0,%rcx
 1207     sub %rcx,%rsp
 1208     sub \$32,%rsp
 1209 
 1210     mov %rsi,16(%rsp)   # save out
 1211     mov %r10,24(%rsp)   # save real stack pointer
 1212 .Ldec_prologue:
 1213 
 1214     mov %rdx,$key
 1215     mov 240($key),$rnds # load rounds
 1216 
 1217     mov 0(%rdi),$s0 # load input vector
 1218     mov 4(%rdi),$s1
 1219     mov 8(%rdi),$s2
 1220     mov 12(%rdi),$s3
 1221 
 1222     shl \$4,$rnds
 1223     lea ($key,$rnds),%rbp
 1224     mov $key,(%rsp) # key schedule
 1225     mov %rbp,8(%rsp)    # end of key schedule
 1226 
 1227     # pick Td4 copy which can't "overlap" with stack frame or key schedule
 1228     lea .LAES_Td+2048(%rip),$sbox
 1229     lea 768(%rsp),%rbp
 1230     sub $sbox,%rbp
 1231     and \$0x300,%rbp
 1232     lea ($sbox,%rbp),$sbox
 1233     shr \$3,%rbp    # recall "magic" constants!
 1234     add %rbp,$sbox
 1235 
 1236     call    _x86_64_AES_decrypt_compact
 1237 
 1238     mov 16(%rsp),$out   # restore out
 1239     mov 24(%rsp),%rsi   # restore saved stack pointer
 1240     mov $s0,0($out) # write output vector
 1241     mov $s1,4($out)
 1242     mov $s2,8($out)
 1243     mov $s3,12($out)
 1244 
 1245     mov (%rsi),%r15
 1246     mov 8(%rsi),%r14
 1247     mov 16(%rsi),%r13
 1248     mov 24(%rsi),%r12
 1249     mov 32(%rsi),%rbp
 1250     mov 40(%rsi),%rbx
 1251     lea 48(%rsi),%rsp
 1252 .Ldec_epilogue:
 1253     ret
 1254 .size   AES_decrypt,.-AES_decrypt
 1255 ___
 1256 #------------------------------------------------------------------#
 1257 
 1258 sub enckey()
 1259 {
      # Appends one key-expansion core step to $code; takes no arguments and
      # uses fixed registers.
      #
      # Register contract of the emitted code:
      #   in:  %edx = source word, %eax = word to fold the result into,
      #        %rbp = byte-table base + 128, %rcx = index of the rcon entry
      #   out: %eax updated; %ebx and %esi clobbered; %edx shifted right 16
      #
      # Every byte of %edx is replaced through the table at -128(%rbp) and
      # xored into %eax at a position rotated by one byte: source byte 0 goes
      # to bits 24-31, byte 1 to bits 0-7, byte 2 to bits 8-15 and byte 3 to
      # bits 16-23.  Finally the dword at 1024-128(%rbp,%rcx,4) -- labelled
      # "rcon" in the emitted text -- is xored in.
 1260 $code.=<<___;
 1261     movz    %dl,%esi        # rk[i]>>0
 1262     movzb   -128(%rbp,%rsi),%ebx
 1263     movz    %dh,%esi        # rk[i]>>8
 1264     shl \$24,%ebx
 1265     xor %ebx,%eax
 1266 
 1267     movzb   -128(%rbp,%rsi),%ebx
 1268     shr \$16,%edx
 1269     movz    %dl,%esi        # rk[i]>>16
 1270     xor %ebx,%eax
 1271 
 1272     movzb   -128(%rbp,%rsi),%ebx
 1273     movz    %dh,%esi        # rk[i]>>24
 1274     shl \$8,%ebx
 1275     xor %ebx,%eax
 1276 
 1277     movzb   -128(%rbp,%rsi),%ebx
 1278     shl \$16,%ebx
 1279     xor %ebx,%eax
 1280 
 1281     xor 1024-128(%rbp,%rcx,4),%eax      # rcon
 1282 ___
 1283 }
 1284 
 1285 # int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 1286 #                        AES_KEY *key)
      #
      # Emits two routines.
      #
      # private_AES_set_encrypt_key is the exported wrapper: it pushes %rbx,
      # %rbp, %r12-%r15, reserves 8 more bytes, calls the worker, then
      # reloads only %rbp and %rbx before dropping the 56-byte frame (the
      # in-line comment notes the other pushes are redundant and kept so an
      # exception handler can be shared).
      #
      # _x86_64_AES_set_encrypt_key is the worker (%rdi=userKey, %esi=bits,
      # %rdx=key on entry).  Return value in %rax:
      #    0  schedule written, round count (10/12/14) stored
      #   -1  userKey or key is NULL
      #   -2  bits is not 128, 192 or 256
      # In each key-size branch %rdi has been advanced so that the final
      # movl lands at byte offset 240 from the start of key
      # (10*16+80, 7*24+72, 6*32+48).
      # %rbp is pointed at .LAES_Te+2048+128 and eight dwords spanning 256
      # bytes are read first ("prefetch Te4").
 1287 $code.=<<___;
 1288 .globl  private_AES_set_encrypt_key
 1289 .type   private_AES_set_encrypt_key,\@function,3
 1290 .align  16
 1291 private_AES_set_encrypt_key:
 1292     push    %rbx
 1293     push    %rbp
 1294     push    %r12            # redundant, but allows to share 
 1295     push    %r13            # exception handler...
 1296     push    %r14
 1297     push    %r15
 1298     sub \$8,%rsp
 1299 .Lenc_key_prologue:
 1300 
 1301     call    _x86_64_AES_set_encrypt_key
 1302 
 1303     mov 40(%rsp),%rbp
 1304     mov 48(%rsp),%rbx
 1305     add \$56,%rsp
 1306 .Lenc_key_epilogue:
 1307     ret
 1308 .size   private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
 1309 
 1310 .type   _x86_64_AES_set_encrypt_key,\@abi-omnipotent
 1311 .align  16
 1312 _x86_64_AES_set_encrypt_key:
 1313     mov %esi,%ecx           # %ecx=bits
 1314     mov %rdi,%rsi           # %rsi=userKey
 1315     mov %rdx,%rdi           # %rdi=key
 1316 
 1317     test    \$-1,%rsi
 1318     jz  .Lbadpointer
 1319     test    \$-1,%rdi
 1320     jz  .Lbadpointer
 1321 
 1322     lea .LAES_Te(%rip),%rbp
 1323     lea 2048+128(%rbp),%rbp
 1324 
 1325     # prefetch Te4
 1326     mov 0-128(%rbp),%eax
 1327     mov 32-128(%rbp),%ebx
 1328     mov 64-128(%rbp),%r8d
 1329     mov 96-128(%rbp),%edx
 1330     mov 128-128(%rbp),%eax
 1331     mov 160-128(%rbp),%ebx
 1332     mov 192-128(%rbp),%r8d
 1333     mov 224-128(%rbp),%edx
 1334 
 1335     cmp \$128,%ecx
 1336     je  .L10rounds
 1337     cmp \$192,%ecx
 1338     je  .L12rounds
 1339     cmp \$256,%ecx
 1340     je  .L14rounds
 1341     mov \$-2,%rax           # invalid number of bits
 1342     jmp .Lexit
 1343 
 1344 .L10rounds:
 1345     mov 0(%rsi),%rax            # copy first 4 dwords
 1346     mov 8(%rsi),%rdx
 1347     mov %rax,0(%rdi)
 1348     mov %rdx,8(%rdi)
 1349 
 1350     shr \$32,%rdx
 1351     xor %ecx,%ecx
 1352     jmp .L10shortcut
 1353 .align  4
 1354 .L10loop:
 1355         mov 0(%rdi),%eax            # rk[0]
 1356         mov 12(%rdi),%edx           # rk[3]
 1357 .L10shortcut:
 1358 ___
      # 128-bit branch: at .L10shortcut %eax holds rk[0], %edx holds rk[3]
      # and %ecx is the iteration counter, reused by enckey as the rcon
      # index.  The first pass enters through the shortcut with the words
      # still in registers from the copy.  After enckey the emitted code
      # chains xors to produce rk[4..7] and steps %rdi by 16; the loop runs
      # while %ecx < 10.
 1359         &enckey ();
 1360 $code.=<<___;
 1361         mov %eax,16(%rdi)           # rk[4]
 1362         xor 4(%rdi),%eax
 1363         mov %eax,20(%rdi)           # rk[5]
 1364         xor 8(%rdi),%eax
 1365         mov %eax,24(%rdi)           # rk[6]
 1366         xor 12(%rdi),%eax
 1367         mov %eax,28(%rdi)           # rk[7]
 1368         add \$1,%ecx
 1369         lea 16(%rdi),%rdi
 1370         cmp \$10,%ecx
 1371     jl  .L10loop
 1372 
 1373     movl    \$10,80(%rdi)           # setup number of rounds
 1374     xor %rax,%rax
 1375     jmp .Lexit
 1376 
 1377 .L12rounds:
 1378     mov 0(%rsi),%rax            # copy first 6 dwords
 1379     mov 8(%rsi),%rbx
 1380     mov 16(%rsi),%rdx
 1381     mov %rax,0(%rdi)
 1382     mov %rbx,8(%rdi)
 1383     mov %rdx,16(%rdi)
 1384 
 1385     shr \$32,%rdx
 1386     xor %ecx,%ecx
 1387     jmp .L12shortcut
 1388 .align  4
 1389 .L12loop:
 1390         mov 0(%rdi),%eax            # rk[0]
 1391         mov 20(%rdi),%edx           # rk[5]
 1392 .L12shortcut:
 1393 ___
      # 192-bit branch: same scheme with %edx = rk[5].  Each pass writes
      # rk[6..9], and every pass except the one with %ecx == 7 also writes
      # rk[10..11] before stepping %rdi by 24.
 1394         &enckey ();
 1395 $code.=<<___;
 1396         mov %eax,24(%rdi)           # rk[6]
 1397         xor 4(%rdi),%eax
 1398         mov %eax,28(%rdi)           # rk[7]
 1399         xor 8(%rdi),%eax
 1400         mov %eax,32(%rdi)           # rk[8]
 1401         xor 12(%rdi),%eax
 1402         mov %eax,36(%rdi)           # rk[9]
 1403 
 1404         cmp \$7,%ecx
 1405         je  .L12break
 1406         add \$1,%ecx
 1407 
 1408         xor 16(%rdi),%eax
 1409         mov %eax,40(%rdi)           # rk[10]
 1410         xor 20(%rdi),%eax
 1411         mov %eax,44(%rdi)           # rk[11]
 1412 
 1413         lea 24(%rdi),%rdi
 1414     jmp .L12loop
 1415 .L12break:
 1416     movl    \$12,72(%rdi)       # setup number of rounds
 1417     xor %rax,%rax
 1418     jmp .Lexit
 1419 
 1420 .L14rounds:     
 1421     mov 0(%rsi),%rax            # copy first 8 dwords
 1422     mov 8(%rsi),%rbx
 1423     mov 16(%rsi),%rcx
 1424     mov 24(%rsi),%rdx
 1425     mov %rax,0(%rdi)
 1426     mov %rbx,8(%rdi)
 1427     mov %rcx,16(%rdi)
 1428     mov %rdx,24(%rdi)
 1429 
 1430     shr \$32,%rdx
 1431     xor %ecx,%ecx
 1432     jmp .L14shortcut
 1433 .align  4
 1434 .L14loop:
 1435         mov 0(%rdi),%eax            # rk[0]
 1436         mov 28(%rdi),%edx           # rk[4]
 1437 .L14shortcut:
 1438 ___
      # 256-bit branch: %edx is loaded from byte offset 28.
      # NOTE(review): the emitted annotation on that load says "rk[4]", but
      # offset 28 is dword index 7; the annotation looks like a typo and is
      # left as-is because it is part of the generated text.
      # Each pass writes rk[8..11]; every pass except the one with %ecx == 6
      # then performs a second, inline table substitution of rk[11] (bytes
      # kept in place: shifts 0/8/16/24, no rcon) xored into rk[4] to write
      # rk[12..15], and steps %rdi by 32.
 1439         &enckey ();
 1440 $code.=<<___;
 1441         mov %eax,32(%rdi)           # rk[8]
 1442         xor 4(%rdi),%eax
 1443         mov %eax,36(%rdi)           # rk[9]
 1444         xor 8(%rdi),%eax
 1445         mov %eax,40(%rdi)           # rk[10]
 1446         xor 12(%rdi),%eax
 1447         mov %eax,44(%rdi)           # rk[11]
 1448 
 1449         cmp \$6,%ecx
 1450         je  .L14break
 1451         add \$1,%ecx
 1452 
 1453         mov %eax,%edx
 1454         mov 16(%rdi),%eax           # rk[4]
 1455         movz    %dl,%esi            # rk[11]>>0
 1456         movzb   -128(%rbp,%rsi),%ebx
 1457         movz    %dh,%esi            # rk[11]>>8
 1458         xor %ebx,%eax
 1459 
 1460         movzb   -128(%rbp,%rsi),%ebx
 1461         shr \$16,%edx
 1462         shl \$8,%ebx
 1463         movz    %dl,%esi            # rk[11]>>16
 1464         xor %ebx,%eax
 1465 
 1466         movzb   -128(%rbp,%rsi),%ebx
 1467         movz    %dh,%esi            # rk[11]>>24
 1468         shl \$16,%ebx
 1469         xor %ebx,%eax
 1470 
 1471         movzb   -128(%rbp,%rsi),%ebx
 1472         shl \$24,%ebx
 1473         xor %ebx,%eax
 1474 
 1475         mov %eax,48(%rdi)           # rk[12]
 1476         xor 20(%rdi),%eax
 1477         mov %eax,52(%rdi)           # rk[13]
 1478         xor 24(%rdi),%eax
 1479         mov %eax,56(%rdi)           # rk[14]
 1480         xor 28(%rdi),%eax
 1481         mov %eax,60(%rdi)           # rk[15]
 1482 
 1483         lea 32(%rdi),%rdi
 1484     jmp .L14loop
 1485 .L14break:
 1486     movl    \$14,48(%rdi)       # setup number of rounds
 1487     xor %rax,%rax
 1488     jmp .Lexit
 1489 
 1490 .Lbadpointer:
 1491     mov \$-1,%rax
 1492 .Lexit:
 1493     .byte   0xf3,0xc3           # rep ret
 1494 .size   _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
 1495 ___
 1496 
 1497 sub deckey_ref()
 1498 { my ($i,$ptr,$te,$td) = @_;
 1499   my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
      # Appends a 32-bit, one-word-at-a-time transform of the dword at
      # $i($ptr) to $code: the word is loaded, transformed in registers and
      # stored back to the same location.
      #
      # Parameters:
      #   $i    displacement used in the load/store operand
      #   $ptr  base register used in the load/store operand
      #   $te, $td  accepted but not referenced in the emitted code
      #
      # Three doubling stages with immediate masks 0x80808080, 0xfefefefe
      # and 0x1b1b1b1b derive tp2, tp4 and tp8 from tp1; the stored result is
      #   ROTATE(tp1^tp8,8) ^ (tp8^tp4^tp2)
      #     ^ ROTATE(tp2^tp1^tp8,24) ^ ROTATE(tp4^tp1^tp8,16).
      # Clobbers %eax, %ebx, %edi, %edx and %r8d.
      #
      # NOTE(review): no call to this sub is visible in this part of the
      # file (the decrypt-key code here uses dectransform); it may be kept
      # only as a reference implementation -- confirm before removing.
      # NOTE(review): the empty prototype "()" does not match a sub that
      # unpacks four arguments from @_.
 1500 $code.=<<___;
 1501     mov $i($ptr),$tp1
 1502     mov $tp1,$acc
 1503     and \$0x80808080,$acc
 1504     mov $acc,$tp4
 1505     shr \$7,$tp4
 1506     lea 0($tp1,$tp1),$tp2
 1507     sub $tp4,$acc
 1508     and \$0xfefefefe,$tp2
 1509     and \$0x1b1b1b1b,$acc
 1510     xor $tp2,$acc
 1511     mov $acc,$tp2
 1512 
 1513     and \$0x80808080,$acc
 1514     mov $acc,$tp8
 1515     shr \$7,$tp8
 1516     lea 0($tp2,$tp2),$tp4
 1517     sub $tp8,$acc
 1518     and \$0xfefefefe,$tp4
 1519     and \$0x1b1b1b1b,$acc
 1520      xor    $tp1,$tp2       # tp2^tp1
 1521     xor $tp4,$acc
 1522     mov $acc,$tp4
 1523 
 1524     and \$0x80808080,$acc
 1525     mov $acc,$tp8
 1526     shr \$7,$tp8
 1527     sub $tp8,$acc
 1528     lea 0($tp4,$tp4),$tp8
 1529      xor    $tp1,$tp4       # tp4^tp1
 1530     and \$0xfefefefe,$tp8
 1531     and \$0x1b1b1b1b,$acc
 1532     xor $acc,$tp8
 1533 
 1534     xor $tp8,$tp1       # tp1^tp8
 1535     rol \$8,$tp1        # ROTATE(tp1^tp8,8)
 1536     xor $tp8,$tp2       # tp2^tp1^tp8
 1537     xor $tp8,$tp4       # tp4^tp1^tp8
 1538     xor $tp2,$tp8
 1539     xor $tp4,$tp8       # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
 1540 
 1541     xor $tp8,$tp1
 1542     rol \$24,$tp2       # ROTATE(tp2^tp1^tp8,24)
 1543     xor $tp2,$tp1
 1544     rol \$16,$tp4       # ROTATE(tp4^tp1^tp8,16)
 1545     xor $tp4,$tp1
 1546 
 1547     mov $tp1,$i($ptr)
 1548 ___
 1549 }
 1550 
 1551 # int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 1552 #                        AES_KEY *key)
      #
      # Emits private_AES_set_decrypt_key.  Steps of the emitted code:
      #  1. push %rbx, %rbp, %r12-%r15 and the key pointer (%rdx), then call
      #     _x86_64_AES_set_encrypt_key with the caller's arguments.
      #  2. If it returns non-zero in %eax, jump to .Labort and return that
      #     value unchanged.
      #  3. .Linvert: read the round count from 240(key) and swap 16-byte
      #     round keys from both ends of the schedule towards the middle
      #     (%rsi walks up from key, %rdi walks down from key+16*rounds)
      #     until the two pointers meet.
      #  4. Load three 8-byte masks from offsets 40/48/56 past
      #     .LAES_Te+2048+1024 into $mask80/$maskfe/$mask1b.
      #  5. .Lpermute: for rounds-1 round keys starting at key+16, run
      #     dectransform on the 16 bytes and store the four result words
      #     back; the first and last round keys are not transformed.
      #  6. Return 0.
      # The epilogue reloads %r15..%rbx from 8..48(%rsp) and drops 56 bytes
      # (six registers plus the saved key pointer).
 1553 $code.=<<___;
 1554 .globl  private_AES_set_decrypt_key
 1555 .type   private_AES_set_decrypt_key,\@function,3
 1556 .align  16
 1557 private_AES_set_decrypt_key:
 1558     push    %rbx
 1559     push    %rbp
 1560     push    %r12
 1561     push    %r13
 1562     push    %r14
 1563     push    %r15
 1564     push    %rdx            # save key schedule
 1565 .Ldec_key_prologue:
 1566 
 1567     call    _x86_64_AES_set_encrypt_key
 1568     mov (%rsp),%r8      # restore key schedule
 1569     cmp \$0,%eax
 1570     jne .Labort
 1571 
 1572     mov 240(%r8),%r14d      # pull number of rounds
 1573     xor %rdi,%rdi
 1574     lea (%rdi,%r14d,4),%rcx
 1575     mov %r8,%rsi
 1576     lea (%r8,%rcx,4),%rdi   # pointer to last chunk
 1577 .align  4
 1578 .Linvert:
 1579         mov 0(%rsi),%rax
 1580         mov 8(%rsi),%rbx
 1581         mov 0(%rdi),%rcx
 1582         mov 8(%rdi),%rdx
 1583         mov %rax,0(%rdi)
 1584         mov %rbx,8(%rdi)
 1585         mov %rcx,0(%rsi)
 1586         mov %rdx,8(%rsi)
 1587         lea 16(%rsi),%rsi
 1588         lea -16(%rdi),%rdi
 1589         cmp %rsi,%rdi
 1590     jne .Linvert
 1591 
 1592     lea .LAES_Te+2048+1024(%rip),%rax   # rcon
 1593 
 1594     mov 40(%rax),$mask80
 1595     mov 48(%rax),$maskfe
 1596     mov 56(%rax),$mask1b
 1597 
 1598     mov %r8,$key
 1599     sub \$1,%r14d
 1600 .align  4
 1601 .Lpermute:
 1602         lea 16($key),$key
 1603         mov 0($key),%rax
 1604         mov 8($key),%rcx
 1605 ___
      # %rax/%rcx now hold the 16-byte round key in the s1.s0 / s3.s2 layout
      # dectransform takes.  It is called without an argument, so its
      # prefetch loads are omitted and the three mask registers keep their
      # values across iterations of .Lpermute.
 1606         &dectransform ();
 1607 $code.=<<___;
 1608         mov %eax,0($key)
 1609         mov %ebx,4($key)
 1610         mov %ecx,8($key)
 1611         mov %edx,12($key)
 1612         sub \$1,%r14d
 1613     jnz .Lpermute
 1614 
 1615     xor %rax,%rax
 1616 .Labort:
 1617     mov 8(%rsp),%r15
 1618     mov 16(%rsp),%r14
 1619     mov 24(%rsp),%r13
 1620     mov 32(%rsp),%r12
 1621     mov 40(%rsp),%rbp
 1622     mov 48(%rsp),%rbx
 1623     add \$56,%rsp
 1624 .Ldec_key_epilogue:
 1625     ret
 1626 .size   private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
 1627 ___
 1628 
 1629 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
 1630 #           size_t length, const AES_KEY *key,
 1631 #           unsigned char *ivp,const int enc);
 1632 {
      # Emits AES_cbc_encrypt (also exported under the hidden alias
      # asm_AES_cbc_encrypt).  Arguments arrive as %rdi=inp, %rsi=out,
      # %rdx=length, %rcx=key, %r8=ivp, %r9=enc (non-zero selects encrypt
      # and the .LAES_Te tables, zero selects decrypt and .LAES_Td).
      #
      # Path selection in the emitted code:
      #  - length == 0: return immediately, before anything is pushed.
      #  - slow path (.Lcbc_slow_prologue) when length < $speed_limit, or
      #    length is not a multiple of 16, or bit 28 of the first dword of
      #    OPENSSL_ia32cap_P is set.  It uses the *_compact block routines.
      #  - fast path otherwise, using _x86_64_AES_encrypt/_decrypt.
      #
      # Fast path: the frame is 64-byte aligned and then moved so that it
      # does not share page offsets (modulo 4096) with the 2304-byte table
      # range starting at $sbox.  The key schedule is copied to $aes_key on
      # the stack unless its offset from $sbox modulo 4096 lies in
      # [2304, 4096-248).  The copy stores key->rounds at $aes_key+240, which
      # is $mark; $mark is zeroed beforehand, so a non-zero $mark at
      # .Lcbc_fast_cleanup means "copy was made" and triggers zeroing of
      # the 240-byte copy.  Decryption has separate loops for out != inp
      # and for in-place operation (which keeps the previous ciphertext
      # block in $ivec on the stack).
      #
      # Slow path: encryption pads a trailing partial block by copying the
      # remaining bytes to out, zero-filling up to 16 and encrypting that
      # block in place from out.  Decryption of a trailing partial block
      # writes only the remaining (16 + negative remainder) bytes of the
      # last decrypted block to out.
      #
      # All paths leave through .Lcbc_exit, which reloads the six saved
      # registers from the stack pointer saved in $_rsp, then popfq.
      #
      # NOTE(review): a few annotations inside the emitted text look
      # inaccurate and are left untouched because they are part of the
      # generated output: "e = ($sbox+2048)&0xfff" sits beside
      # "lea 2304($sbox)"; "if (p=>e) %rsp =- (p-e)" presumably means
      # ">=" and "-="; "key scdedule" is a typo for "schedule".
 1633 # stack frame layout
 1634 # -8(%rsp)      return address
 1635 my $keyp="0(%rsp)";     # one to pass as $key
 1636 my $keyend="8(%rsp)";       # &(keyp->rd_key[4*keyp->rounds])
 1637 my $_rsp="16(%rsp)";        # saved %rsp
 1638 my $_inp="24(%rsp)";        # copy of 1st parameter, inp
 1639 my $_out="32(%rsp)";        # copy of 2nd parameter, out
 1640 my $_len="40(%rsp)";        # copy of 3rd parameter, length
 1641 my $_key="48(%rsp)";        # copy of 4th parameter, key
 1642 my $_ivp="56(%rsp)";        # copy of 5th parameter, ivp
 1643 my $ivec="64(%rsp)";        # ivec[16]
 1644 my $aes_key="80(%rsp)";     # copy of aes_key
 1645 my $mark="80+240(%rsp)";    # copy of aes_key->rounds
 1646 
 1647 $code.=<<___;
 1648 .globl  AES_cbc_encrypt
 1649 .type   AES_cbc_encrypt,\@function,6
 1650 .align  16
 1651 .extern OPENSSL_ia32cap_P
 1652 .globl  asm_AES_cbc_encrypt
 1653 .hidden asm_AES_cbc_encrypt
 1654 asm_AES_cbc_encrypt:
 1655 AES_cbc_encrypt:
 1656     cmp \$0,%rdx    # check length
 1657     je  .Lcbc_epilogue
 1658     pushfq
 1659     push    %rbx
 1660     push    %rbp
 1661     push    %r12
 1662     push    %r13
 1663     push    %r14
 1664     push    %r15
 1665 .Lcbc_prologue:
 1666 
 1667     cld
 1668     mov %r9d,%r9d   # clear upper half of enc
 1669 
 1670     lea .LAES_Te(%rip),$sbox
 1671     cmp \$0,%r9
 1672     jne .Lcbc_picked_te
 1673     lea .LAES_Td(%rip),$sbox
 1674 .Lcbc_picked_te:
 1675 
 1676     mov OPENSSL_ia32cap_P(%rip),%r10d
 1677     cmp \$$speed_limit,%rdx
 1678     jb  .Lcbc_slow_prologue
 1679     test    \$15,%rdx
 1680     jnz .Lcbc_slow_prologue
 1681     bt  \$28,%r10d
 1682     jc  .Lcbc_slow_prologue
 1683 
 1684     # allocate aligned stack frame...
 1685     lea -88-248(%rsp),$key
 1686     and \$-64,$key
 1687 
 1688     # ... and make sure it doesn't alias with AES_T[ed] modulo 4096
 1689     mov $sbox,%r10
 1690     lea 2304($sbox),%r11
 1691     mov $key,%r12
 1692     and \$0xFFF,%r10    # s = $sbox&0xfff
 1693     and \$0xFFF,%r11    # e = ($sbox+2048)&0xfff
 1694     and \$0xFFF,%r12    # p = %rsp&0xfff
 1695 
 1696     cmp %r11,%r12   # if (p=>e) %rsp =- (p-e);
 1697     jb  .Lcbc_te_break_out
 1698     sub %r11,%r12
 1699     sub %r12,$key
 1700     jmp .Lcbc_te_ok
 1701 .Lcbc_te_break_out:     # else %rsp -= (p-s)&0xfff + framesz
 1702     sub %r10,%r12
 1703     and \$0xFFF,%r12
 1704     add \$320,%r12
 1705     sub %r12,$key
 1706 .align  4
 1707 .Lcbc_te_ok:
 1708 
 1709     xchg    %rsp,$key
 1710     #add    \$8,%rsp    # reserve for return address!
 1711     mov $key,$_rsp  # save %rsp
 1712 .Lcbc_fast_body:
 1713     mov %rdi,$_inp  # save copy of inp
 1714     mov %rsi,$_out  # save copy of out
 1715     mov %rdx,$_len  # save copy of len
 1716     mov %rcx,$_key  # save copy of key
 1717     mov %r8,$_ivp   # save copy of ivp
 1718     movl    \$0,$mark   # copy of aes_key->rounds = 0;
 1719     mov %r8,%rbp    # rearrange input arguments
 1720     mov %r9,%rbx
 1721     mov %rsi,$out
 1722     mov %rdi,$inp
 1723     mov %rcx,$key
 1724 
 1725     mov 240($key),%eax      # key->rounds
 1726     # do we copy key schedule to stack?
 1727     mov $key,%r10
 1728     sub $sbox,%r10
 1729     and \$0xfff,%r10
 1730     cmp \$2304,%r10
 1731     jb  .Lcbc_do_ecopy
 1732     cmp \$4096-248,%r10
 1733     jb  .Lcbc_skip_ecopy
 1734 .align  4
 1735 .Lcbc_do_ecopy:
 1736         mov $key,%rsi
 1737         lea $aes_key,%rdi
 1738         lea $aes_key,$key
 1739         mov \$240/8,%ecx
 1740         .long   0x90A548F3  # rep movsq
 1741         mov %eax,(%rdi) # copy aes_key->rounds
 1742 .Lcbc_skip_ecopy:
 1743     mov $key,$keyp  # save key pointer
 1744 
 1745     mov \$18,%ecx
 1746 .align  4
 1747 .Lcbc_prefetch_te:
 1748         mov 0($sbox),%r10
 1749         mov 32($sbox),%r11
 1750         mov 64($sbox),%r12
 1751         mov 96($sbox),%r13
 1752         lea 128($sbox),$sbox
 1753         sub \$1,%ecx
 1754     jnz .Lcbc_prefetch_te
 1755     lea -2304($sbox),$sbox
 1756 
 1757     cmp \$0,%rbx
 1758     je  .LFAST_DECRYPT
 1759 
 1760 #----------------------------- ENCRYPT -----------------------------#
 1761     mov 0(%rbp),$s0     # load iv
 1762     mov 4(%rbp),$s1
 1763     mov 8(%rbp),$s2
 1764     mov 12(%rbp),$s3
 1765 
 1766 .align  4
 1767 .Lcbc_fast_enc_loop:
 1768         xor 0($inp),$s0
 1769         xor 4($inp),$s1
 1770         xor 8($inp),$s2
 1771         xor 12($inp),$s3
 1772         mov $keyp,$key  # restore key
 1773         mov $inp,$_inp  # if ($verticalspin) save inp
 1774 
 1775         call    _x86_64_AES_encrypt
 1776 
 1777         mov $_inp,$inp  # if ($verticalspin) restore inp
 1778         mov $_len,%r10
 1779         mov $s0,0($out)
 1780         mov $s1,4($out)
 1781         mov $s2,8($out)
 1782         mov $s3,12($out)
 1783 
 1784         lea 16($inp),$inp
 1785         lea 16($out),$out
 1786         sub \$16,%r10
 1787         test    \$-16,%r10
 1788         mov %r10,$_len
 1789     jnz .Lcbc_fast_enc_loop
 1790     mov $_ivp,%rbp  # restore ivp
 1791     mov $s0,0(%rbp) # save ivec
 1792     mov $s1,4(%rbp)
 1793     mov $s2,8(%rbp)
 1794     mov $s3,12(%rbp)
 1795 
 1796     jmp .Lcbc_fast_cleanup
 1797 
 1798 #----------------------------- DECRYPT -----------------------------#
 1799 .align  16
 1800 .LFAST_DECRYPT:
 1801     cmp $inp,$out
 1802     je  .Lcbc_fast_dec_in_place
 1803 
 1804     mov %rbp,$ivec
 1805 .align  4
 1806 .Lcbc_fast_dec_loop:
 1807         mov 0($inp),$s0 # read input
 1808         mov 4($inp),$s1
 1809         mov 8($inp),$s2
 1810         mov 12($inp),$s3
 1811         mov $keyp,$key  # restore key
 1812         mov $inp,$_inp  # if ($verticalspin) save inp
 1813 
 1814         call    _x86_64_AES_decrypt
 1815 
 1816         mov $ivec,%rbp  # load ivp
 1817         mov $_inp,$inp  # if ($verticalspin) restore inp
 1818         mov $_len,%r10  # load len
 1819         xor 0(%rbp),$s0 # xor iv
 1820         xor 4(%rbp),$s1
 1821         xor 8(%rbp),$s2
 1822         xor 12(%rbp),$s3
 1823         mov $inp,%rbp   # current input, next iv
 1824 
 1825         sub \$16,%r10
 1826         mov %r10,$_len  # update len
 1827         mov %rbp,$ivec  # update ivp
 1828 
 1829         mov $s0,0($out) # write output
 1830         mov $s1,4($out)
 1831         mov $s2,8($out)
 1832         mov $s3,12($out)
 1833 
 1834         lea 16($inp),$inp
 1835         lea 16($out),$out
 1836     jnz .Lcbc_fast_dec_loop
 1837     mov $_ivp,%r12      # load user ivp
 1838     mov 0(%rbp),%r10        # load iv
 1839     mov 8(%rbp),%r11
 1840     mov %r10,0(%r12)        # copy back to user
 1841     mov %r11,8(%r12)
 1842     jmp .Lcbc_fast_cleanup
 1843 
 1844 .align  16
 1845 .Lcbc_fast_dec_in_place:
 1846     mov 0(%rbp),%r10        # copy iv to stack
 1847     mov 8(%rbp),%r11
 1848     mov %r10,0+$ivec
 1849     mov %r11,8+$ivec
 1850 .align  4
 1851 .Lcbc_fast_dec_in_place_loop:
 1852         mov 0($inp),$s0 # load input
 1853         mov 4($inp),$s1
 1854         mov 8($inp),$s2
 1855         mov 12($inp),$s3
 1856         mov $keyp,$key  # restore key
 1857         mov $inp,$_inp  # if ($verticalspin) save inp
 1858 
 1859         call    _x86_64_AES_decrypt
 1860 
 1861         mov $_inp,$inp  # if ($verticalspin) restore inp
 1862         mov $_len,%r10
 1863         xor 0+$ivec,$s0
 1864         xor 4+$ivec,$s1
 1865         xor 8+$ivec,$s2
 1866         xor 12+$ivec,$s3
 1867 
 1868         mov 0($inp),%r11    # load input
 1869         mov 8($inp),%r12
 1870         sub \$16,%r10
 1871         jz  .Lcbc_fast_dec_in_place_done
 1872 
 1873         mov %r11,0+$ivec    # copy input to iv
 1874         mov %r12,8+$ivec
 1875 
 1876         mov $s0,0($out) # save output [zaps input]
 1877         mov $s1,4($out)
 1878         mov $s2,8($out)
 1879         mov $s3,12($out)
 1880 
 1881         lea 16($inp),$inp
 1882         lea 16($out),$out
 1883         mov %r10,$_len
 1884     jmp .Lcbc_fast_dec_in_place_loop
 1885 .Lcbc_fast_dec_in_place_done:
 1886     mov $_ivp,%rdi
 1887     mov %r11,0(%rdi)    # copy iv back to user
 1888     mov %r12,8(%rdi)
 1889 
 1890     mov $s0,0($out) # save output [zaps input]
 1891     mov $s1,4($out)
 1892     mov $s2,8($out)
 1893     mov $s3,12($out)
 1894 
 1895 .align  4
 1896 .Lcbc_fast_cleanup:
 1897     cmpl    \$0,$mark   # was the key schedule copied?
 1898     lea $aes_key,%rdi
 1899     je  .Lcbc_exit
 1900         mov \$240/8,%ecx
 1901         xor %rax,%rax
 1902         .long   0x90AB48F3  # rep stosq
 1903 
 1904     jmp .Lcbc_exit
 1905 
 1906 #--------------------------- SLOW ROUTINE ---------------------------#
 1907 .align  16
 1908 .Lcbc_slow_prologue:
 1909     # allocate aligned stack frame...
 1910     lea -88(%rsp),%rbp
 1911     and \$-64,%rbp
 1912     # ... just "above" key schedule
 1913     lea -88-63(%rcx),%r10
 1914     sub %rbp,%r10
 1915     neg %r10
 1916     and \$0x3c0,%r10
 1917     sub %r10,%rbp
 1918 
 1919     xchg    %rsp,%rbp
 1920     #add    \$8,%rsp    # reserve for return address!
 1921     mov %rbp,$_rsp  # save %rsp
 1922 .Lcbc_slow_body:
 1923     #mov    %rdi,$_inp  # save copy of inp
 1924     #mov    %rsi,$_out  # save copy of out
 1925     #mov    %rdx,$_len  # save copy of len
 1926     #mov    %rcx,$_key  # save copy of key
 1927     mov %r8,$_ivp   # save copy of ivp
 1928     mov %r8,%rbp    # rearrange input arguments
 1929     mov %r9,%rbx
 1930     mov %rsi,$out
 1931     mov %rdi,$inp
 1932     mov %rcx,$key
 1933     mov %rdx,%r10
 1934 
 1935     mov 240($key),%eax
 1936     mov $key,$keyp  # save key pointer
 1937     shl \$4,%eax
 1938     lea ($key,%rax),%rax
 1939     mov %rax,$keyend
 1940 
 1941     # pick Te4 copy which can't "overlap" with stack frame or key scdedule
 1942     lea 2048($sbox),$sbox
 1943     lea 768-8(%rsp),%rax
 1944     sub $sbox,%rax
 1945     and \$0x300,%rax
 1946     lea ($sbox,%rax),$sbox
 1947 
 1948     cmp \$0,%rbx
 1949     je  .LSLOW_DECRYPT
 1950 
 1951 #--------------------------- SLOW ENCRYPT ---------------------------#
 1952     test    \$-16,%r10      # check upon length
 1953     mov 0(%rbp),$s0     # load iv
 1954     mov 4(%rbp),$s1
 1955     mov 8(%rbp),$s2
 1956     mov 12(%rbp),$s3
 1957     jz  .Lcbc_slow_enc_tail # short input...
 1958 
 1959 .align  4
 1960 .Lcbc_slow_enc_loop:
 1961         xor 0($inp),$s0
 1962         xor 4($inp),$s1
 1963         xor 8($inp),$s2
 1964         xor 12($inp),$s3
 1965         mov $keyp,$key  # restore key
 1966         mov $inp,$_inp  # save inp
 1967         mov $out,$_out  # save out
 1968         mov %r10,$_len  # save len
 1969 
 1970         call    _x86_64_AES_encrypt_compact
 1971 
 1972         mov $_inp,$inp  # restore inp
 1973         mov $_out,$out  # restore out
 1974         mov $_len,%r10  # restore len
 1975         mov $s0,0($out)
 1976         mov $s1,4($out)
 1977         mov $s2,8($out)
 1978         mov $s3,12($out)
 1979 
 1980         lea 16($inp),$inp
 1981         lea 16($out),$out
 1982         sub \$16,%r10
 1983         test    \$-16,%r10
 1984     jnz .Lcbc_slow_enc_loop
 1985     test    \$15,%r10
 1986     jnz .Lcbc_slow_enc_tail
 1987     mov $_ivp,%rbp  # restore ivp
 1988     mov $s0,0(%rbp) # save ivec
 1989     mov $s1,4(%rbp)
 1990     mov $s2,8(%rbp)
 1991     mov $s3,12(%rbp)
 1992 
 1993     jmp .Lcbc_exit
 1994 
 1995 .align  4
 1996 .Lcbc_slow_enc_tail:
 1997     mov %rax,%r11
 1998     mov %rcx,%r12
 1999     mov %r10,%rcx
 2000     mov $inp,%rsi
 2001     mov $out,%rdi
 2002     .long   0x9066A4F3      # rep movsb
 2003     mov \$16,%rcx       # zero tail
 2004     sub %r10,%rcx
 2005     xor %rax,%rax
 2006     .long   0x9066AAF3      # rep stosb
 2007     mov $out,$inp       # this is not a mistake!
 2008     mov \$16,%r10       # len=16
 2009     mov %r11,%rax
 2010     mov %r12,%rcx
 2011     jmp .Lcbc_slow_enc_loop # one more spin...
 2012 #--------------------------- SLOW DECRYPT ---------------------------#
 2013 .align  16
 2014 .LSLOW_DECRYPT:
 2015     shr \$3,%rax
 2016     add %rax,$sbox      # recall "magic" constants!
 2017 
 2018     mov 0(%rbp),%r11        # copy iv to stack
 2019     mov 8(%rbp),%r12
 2020     mov %r11,0+$ivec
 2021     mov %r12,8+$ivec
 2022 
 2023 .align  4
 2024 .Lcbc_slow_dec_loop:
 2025         mov 0($inp),$s0 # load input
 2026         mov 4($inp),$s1
 2027         mov 8($inp),$s2
 2028         mov 12($inp),$s3
 2029         mov $keyp,$key  # restore key
 2030         mov $inp,$_inp  # save inp
 2031         mov $out,$_out  # save out
 2032         mov %r10,$_len  # save len
 2033 
 2034         call    _x86_64_AES_decrypt_compact
 2035 
 2036         mov $_inp,$inp  # restore inp
 2037         mov $_out,$out  # restore out
 2038         mov $_len,%r10
 2039         xor 0+$ivec,$s0
 2040         xor 4+$ivec,$s1
 2041         xor 8+$ivec,$s2
 2042         xor 12+$ivec,$s3
 2043 
 2044         mov 0($inp),%r11    # load input
 2045         mov 8($inp),%r12
 2046         sub \$16,%r10
 2047         jc  .Lcbc_slow_dec_partial
 2048         jz  .Lcbc_slow_dec_done
 2049 
 2050         mov %r11,0+$ivec    # copy input to iv
 2051         mov %r12,8+$ivec
 2052 
 2053         mov $s0,0($out) # save output [can zap input]
 2054         mov $s1,4($out)
 2055         mov $s2,8($out)
 2056         mov $s3,12($out)
 2057 
 2058         lea 16($inp),$inp
 2059         lea 16($out),$out
 2060     jmp .Lcbc_slow_dec_loop
 2061 .Lcbc_slow_dec_done:
 2062     mov $_ivp,%rdi
 2063     mov %r11,0(%rdi)        # copy iv back to user
 2064     mov %r12,8(%rdi)
 2065 
 2066     mov $s0,0($out)     # save output [can zap input]
 2067     mov $s1,4($out)
 2068     mov $s2,8($out)
 2069     mov $s3,12($out)
 2070 
 2071     jmp .Lcbc_exit
 2072 
 2073 .align  4
 2074 .Lcbc_slow_dec_partial:
 2075     mov $_ivp,%rdi
 2076     mov %r11,0(%rdi)        # copy iv back to user
 2077     mov %r12,8(%rdi)
 2078 
 2079     mov $s0,0+$ivec     # save output to stack
 2080     mov $s1,4+$ivec
 2081     mov $s2,8+$ivec
 2082     mov $s3,12+$ivec
 2083 
 2084     mov $out,%rdi
 2085     lea $ivec,%rsi
 2086     lea 16(%r10),%rcx
 2087     .long   0x9066A4F3  # rep movsb
 2088     jmp .Lcbc_exit
 2089 
 2090 .align  16
 2091 .Lcbc_exit:
 2092     mov $_rsp,%rsi
 2093     mov (%rsi),%r15
 2094     mov 8(%rsi),%r14
 2095     mov 16(%rsi),%r13
 2096     mov 24(%rsi),%r12
 2097     mov 32(%rsi),%rbp
 2098     mov 40(%rsi),%rbx
 2099     lea 48(%rsi),%rsp
 2100 .Lcbc_popfq:
 2101     popfq
 2102 .Lcbc_epilogue:
 2103     ret
 2104 .size   AES_cbc_encrypt,.-AES_cbc_encrypt
 2105 ___
 2106 }
 2107 
 2108 $code.=<<___;  # start the .LAES_Te table on a 64-byte boundary; the 64 &_data_word rows that follow pass 256 words to _data_word (defined outside this section)
 2109 .align  64
 2110 .LAES_Te:
 2111 ___
 2112     &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
 2113     &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
 2114     &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
 2115     &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
 2116     &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
 2117     &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
 2118     &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
 2119     &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
 2120     &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
 2121     &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
 2122     &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
 2123     &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
 2124     &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
 2125     &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
 2126     &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
 2127     &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
 2128     &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
 2129     &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
 2130     &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
 2131     &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
 2132     &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
 2133     &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
 2134     &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
 2135     &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
 2136     &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
 2137     &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
 2138     &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
 2139     &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
 2140     &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
 2141     &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
 2142     &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
 2143     &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
 2144     &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
 2145     &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
 2146     &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
 2147     &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
 2148     &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
 2149     &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
 2150     &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
 2151     &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
 2152     &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
 2153     &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
 2154     &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
 2155     &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
 2156     &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
 2157     &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
 2158     &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
 2159     &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
 2160     &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
 2161     &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
 2162     &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
 2163     &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
 2164     &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
 2165     &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
 2166     &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
 2167     &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
 2168     &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
 2169     &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
 2170     &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
 2171     &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
 2172     &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
 2173     &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
 2174     &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
 2175     &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
 2176 
 2177 #Te4:   # four copies of Te4 (32 rows of 8 bytes = 256 bytes each) to choose from to avoid L1 aliasing
 2178     &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
 2179     &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
 2180     &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
 2181     &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
 2182     &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
 2183     &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
 2184     &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
 2185     &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
 2186     &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
 2187     &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
 2188     &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
 2189     &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
 2190     &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
 2191     &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
 2192     &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
 2193     &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
 2194     &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
 2195     &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
 2196     &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
 2197     &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
 2198     &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
 2199     &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
 2200     &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
 2201     &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
 2202     &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
 2203     &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
 2204     &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
 2205     &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
 2206     &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
 2207     &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
 2208     &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
 2209     &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
 2210 
 2211     &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
 2212     &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
 2213     &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
 2214     &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
 2215     &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
 2216     &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
 2217     &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
 2218     &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
 2219     &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
 2220     &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
 2221     &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
 2222     &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
 2223     &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
 2224     &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
 2225     &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
 2226     &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
 2227     &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
 2228     &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
 2229     &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
 2230     &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
 2231     &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
 2232     &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
 2233     &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
 2234     &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
 2235     &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
 2236     &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
 2237     &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
 2238     &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
 2239     &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
 2240     &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
 2241     &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
 2242     &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
 2243 
 2244     &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
 2245     &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
 2246     &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
 2247     &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
 2248     &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
 2249     &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
 2250     &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
 2251     &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
 2252     &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
 2253     &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
 2254     &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
 2255     &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
 2256     &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
 2257     &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
 2258     &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
 2259     &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
 2260     &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
 2261     &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
 2262     &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
 2263     &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
 2264     &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
 2265     &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
 2266     &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
 2267     &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
 2268     &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
 2269     &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
 2270     &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
 2271     &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
 2272     &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
 2273     &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
 2274     &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
 2275     &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
 2276 
 2277     &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
 2278     &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
 2279     &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
 2280     &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
 2281     &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
 2282     &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
 2283     &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
 2284     &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
 2285     &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
 2286     &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
 2287     &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
 2288     &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
 2289     &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
 2290     &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
 2291     &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
 2292     &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
 2293     &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
 2294     &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
 2295     &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
 2296     &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
 2297     &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
 2298     &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
 2299     &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
 2300     &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
 2301     &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
 2302     &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
 2303     &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
 2304     &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
 2305     &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
 2306     &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
 2307     &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
 2308     &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
 2309 #rcon:  ten round-constant words (0x01 .. 0x36), then 0x80808080, 0xfefefefe and 0x1b1b1b1b, each stored twice
 2310 $code.=<<___;  # NOTE(review): the three trailing mask pairs are presumably consumed by the compact round code - confirm against its users
 2311     .long   0x00000001, 0x00000002, 0x00000004, 0x00000008
 2312     .long   0x00000010, 0x00000020, 0x00000040, 0x00000080
 2313     .long   0x0000001b, 0x00000036, 0x80808080, 0x80808080
 2314     .long   0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
 2315 ___
 2316 $code.=<<___;  # start the .LAES_Td table on a 64-byte boundary; the 64 &_data_word rows that follow pass 256 words to _data_word (defined outside this section)
 2317 .align  64
 2318 .LAES_Td:
 2319 ___
 2320     &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
 2321     &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
 2322     &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
 2323     &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
 2324     &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
 2325     &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
 2326     &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
 2327     &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
 2328     &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
 2329     &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
 2330     &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
 2331     &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
 2332     &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
 2333     &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
 2334     &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
 2335     &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
 2336     &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
 2337     &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
 2338     &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
 2339     &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
 2340     &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
 2341     &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
 2342     &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
 2343     &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
 2344     &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
 2345     &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
 2346     &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
 2347     &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
 2348     &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
 2349     &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
 2350     &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
 2351     &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
 2352     &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
 2353     &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
 2354     &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
 2355     &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
 2356     &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
 2357     &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
 2358     &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
 2359     &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
 2360     &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
 2361     &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
 2362     &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
 2363     &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
 2364     &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
 2365     &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
 2366     &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
 2367     &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
 2368     &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
 2369     &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
 2370     &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
 2371     &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
 2372     &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
 2373     &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
 2374     &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
 2375     &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
 2376     &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
 2377     &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
 2378     &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
 2379     &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
 2380     &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
 2381     &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
 2382     &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
 2383     &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
 2384 
 2385 #Td4:   # four copies of Td4 to choose from to avoid L1 aliasing; each 256-byte copy is followed by 32 bytes of mask constants
 2386     &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
 2387     &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
 2388     &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
 2389     &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
 2390     &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
 2391     &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
 2392     &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
 2393     &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
 2394     &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
 2395     &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
 2396     &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
 2397     &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
 2398     &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
 2399     &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
 2400     &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
 2401     &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
 2402     &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
 2403     &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
 2404     &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
 2405     &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
 2406     &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
 2407     &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
 2408     &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
 2409     &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
 2410     &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
 2411     &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
 2412     &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
 2413     &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
 2414     &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
 2415     &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
 2416     &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
 2417     &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
 2418 $code.=<<___;  # 32 bytes appended to the first Td4 copy: 0x80808080, 0xfefefefe, 0x1b1b1b1b (each twice) and two zero words
 2419     .long   0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
 2420     .long   0x1b1b1b1b, 0x1b1b1b1b, 0, 0
 2421 ___
 2422     &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
 2423     &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
 2424     &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
 2425     &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
 2426     &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
 2427     &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
 2428     &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
 2429     &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
 2430     &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
 2431     &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
 2432     &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
 2433     &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
 2434     &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
 2435     &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
 2436     &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
 2437     &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
 2438     &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
 2439     &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
 2440     &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
 2441     &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
 2442     &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
 2443     &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
 2444     &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
 2445     &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
 2446     &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
 2447     &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
 2448     &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
 2449     &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
 2450     &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
 2451     &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
 2452     &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
 2453     &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
 2454 $code.=<<___;  # 32 bytes appended to the second Td4 copy: 0x80808080, 0xfefefefe, 0x1b1b1b1b (each twice) and two zero words
 2455     .long   0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
 2456     .long   0x1b1b1b1b, 0x1b1b1b1b, 0, 0
 2457 ___
 2458     &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
 2459     &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
 2460     &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
 2461     &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
 2462     &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
 2463     &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
 2464     &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
 2465     &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
 2466     &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
 2467     &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
 2468     &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
 2469     &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
 2470     &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
 2471     &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
 2472     &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
 2473     &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
 2474     &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
 2475     &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
 2476     &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
 2477     &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
 2478     &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
 2479     &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
 2480     &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
 2481     &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
 2482     &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
 2483     &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
 2484     &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
 2485     &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
 2486     &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
 2487     &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
 2488     &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
 2489     &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
 2490 $code.=<<___;  # 32 bytes appended to the third Td4 copy: 0x80808080, 0xfefefefe, 0x1b1b1b1b (each twice) and two zero words
 2491     .long   0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
 2492     .long   0x1b1b1b1b, 0x1b1b1b1b, 0, 0
 2493 ___
 2494     &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
 2495     &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
 2496     &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
 2497     &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
 2498     &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
 2499     &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
 2500     &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
 2501     &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
 2502     &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
 2503     &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
 2504     &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
 2505     &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
 2506     &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
 2507     &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
 2508     &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
 2509     &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
 2510     &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
 2511     &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
 2512     &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
 2513     &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
 2514     &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
 2515     &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
 2516     &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
 2517     &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
 2518     &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
 2519     &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
 2520     &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
 2521     &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
 2522     &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
 2523     &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
 2524     &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
 2525     &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
 2526 $code.=<<___;  # 32 bytes of masks appended to the fourth Td4 copy, then the NUL-terminated module banner and padding to the next 64-byte boundary
 2527     .long   0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
 2528     .long   0x1b1b1b1b, 0x1b1b1b1b, 0, 0
 2529 .asciz  "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 2530 .align  64
 2531 ___
 2532 
 2533 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
 2534 #       CONTEXT *context,DISPATCHER_CONTEXT *disp)
 2535 if ($win64) {
 2536 $rec="%rcx";
 2537 $frame="%rdx";
 2538 $context="%r8";
 2539 $disp="%r9";
 2540 
 2541 $code.=<<___;
 2542 .extern __imp_RtlVirtualUnwind
 2543 .type   block_se_handler,\@abi-omnipotent
 2544 .align  16
 2545 block_se_handler:  # SEH handler: patches context->Rsp/Rsi/Rdi and, when Rip lies between the two HandlerData labels, also Rbx/Rbp/R12-R15, then jumps to the shared exit
 2546     push    %rsi  # preserve the registers this handler overwrites below
 2547     push    %rdi
 2548     push    %rbx
 2549     push    %rbp
 2550     push    %r12
 2551     push    %r13
 2552     push    %r14
 2553     push    %r15
 2554     pushfq  # preserve flags as well
 2555     sub \$64,%rsp  # reserve 64 bytes of stack; the consumer is outside this section
 2556 
 2557     mov 120($context),%rax  # pull context->Rax
 2558     mov 248($context),%rbx  # pull context->Rip
 2559 
 2560     mov 8($disp),%rsi       # disp->ImageBase
 2561     mov 56($disp),%r11      # disp->HandlerData
 2562 
 2563     mov 0(%r11),%r10d       # HandlerData[0]
 2564     lea (%rsi,%r10),%r10    # prologue label
 2565     cmp %r10,%rbx       # context->Rip<prologue label
 2566     jb  .Lin_block_prologue  # faulted before the prologue label: continue with %rax = context->Rax
 2567 
 2568     mov 152($context),%rax  # pull context->Rsp
 2569 
 2570     mov 4(%r11),%r10d       # HandlerData[1]
 2571     lea (%rsi,%r10),%r10    # epilogue label
 2572     cmp %r10,%rbx       # context->Rip>=epilogue label
 2573     jae .Lin_block_prologue  # faulted at or after the epilogue label: continue with %rax = context->Rsp, no register recovery
 2574 
 2575     mov 24(%rax),%rax       # pull saved real stack pointer
 2576     lea 48(%rax),%rax       # adjust...
 2577 
 2578     mov -8(%rax),%rbx  # fetch the six saved registers from the slots just below the adjusted pointer
 2579     mov -16(%rax),%rbp
 2580     mov -24(%rax),%r12
 2581     mov -32(%rax),%r13
 2582     mov -40(%rax),%r14
 2583     mov -48(%rax),%r15
 2584     mov %rbx,144($context)  # restore context->Rbx
 2585     mov %rbp,160($context)  # restore context->Rbp
 2586     mov %r12,216($context)  # restore context->R12
 2587     mov %r13,224($context)  # restore context->R13
 2588     mov %r14,232($context)  # restore context->R14
 2589     mov %r15,240($context)  # restore context->R15
 2590 
 2591 .Lin_block_prologue:  # common tail for all three cases; NOTE(review): 8(%rax)/16(%rax) are presumably where the Win64 prologue stashed %rdi/%rsi - confirm against the perlasm translator
 2592     mov 8(%rax),%rdi  # value for context->Rdi
 2593     mov 16(%rax),%rsi  # value for context->Rsi
 2594     mov %rax,152($context)  # restore context->Rsp
 2595     mov %rsi,168($context)  # restore context->Rsi
 2596     mov %rdi,176($context)  # restore context->Rdi
 2597 
 2598     jmp .Lcommon_seh_exit  # shared handler tail, defined outside this section
 2599 .size   block_se_handler,.-block_se_handler
 2600 
 2601 .type   key_se_handler,\@abi-omnipotent
 2602 .align  16
 2603 key_se_handler:  # SEH handler with the same shape as block_se_handler, except that the saved registers are located at a fixed 56-byte offset from context->Rsp
 2604     push    %rsi  # preserve the registers this handler overwrites below
 2605     push    %rdi
 2606     push    %rbx
 2607     push    %rbp
 2608     push    %r12
 2609     push    %r13
 2610     push    %r14
 2611     push    %r15
 2612     pushfq  # preserve flags as well
 2613     sub \$64,%rsp  # reserve 64 bytes of stack; the consumer is outside this section
 2614 
 2615     mov 120($context),%rax  # pull context->Rax
 2616     mov 248($context),%rbx  # pull context->Rip
 2617 
 2618     mov 8($disp),%rsi       # disp->ImageBase
 2619     mov 56($disp),%r11      # disp->HandlerData
 2620 
 2621     mov 0(%r11),%r10d       # HandlerData[0]
 2622     lea (%rsi,%r10),%r10    # prologue label
 2623     cmp %r10,%rbx       # context->Rip<prologue label
 2624     jb  .Lin_key_prologue  # faulted before the prologue label: continue with %rax = context->Rax
 2625 
 2626     mov 152($context),%rax  # pull context->Rsp
 2627 
 2628     mov 4(%r11),%r10d       # HandlerData[1]
 2629     lea (%rsi,%r10),%r10    # epilogue label
 2630     cmp %r10,%rbx       # context->Rip>=epilogue label
 2631     jae .Lin_key_prologue  # faulted at or after the epilogue label: continue with %rax = context->Rsp, no register recovery
 2632 
 2633     lea 56(%rax),%rax  # advance 56 bytes past context->Rsp; the saved registers are read from -8..-48 below this point
 2634 
 2635     mov -8(%rax),%rbx  # fetch the six saved registers
 2636     mov -16(%rax),%rbp
 2637     mov -24(%rax),%r12
 2638     mov -32(%rax),%r13
 2639     mov -40(%rax),%r14
 2640     mov -48(%rax),%r15
 2641     mov %rbx,144($context)  # restore context->Rbx
 2642     mov %rbp,160($context)  # restore context->Rbp
 2643     mov %r12,216($context)  # restore context->R12
 2644     mov %r13,224($context)  # restore context->R13
 2645     mov %r14,232($context)  # restore context->R14
 2646     mov %r15,240($context)  # restore context->R15
 2647 
 2648 .Lin_key_prologue:  # common tail for all three cases; NOTE(review): 8(%rax)/16(%rax) are presumably where the Win64 prologue stashed %rdi/%rsi - confirm against the perlasm translator
 2649     mov 8(%rax),%rdi  # value for context->Rdi
 2650     mov 16(%rax),%rsi  # value for context->Rsi
 2651     mov %rax,152($context)  # restore context->Rsp
 2652     mov %rsi,168($context)  # restore context->Rsi
 2653     mov %rdi,176($context)  # restore context->Rdi
 2654 
 2655     jmp .Lcommon_seh_exit  # shared handler tail, defined outside this section
 2656 .size   key_se_handler,.-key_se_handler
 2657 # cbc_se_handler: Win64 exception handler referenced by .LSEH_info_AES_cbc_encrypt; it classifies Rip against labels of the guarded routine directly, so its .xdata record carries no HandlerData.
 2658 .type   cbc_se_handler,\@abi-omnipotent
 2659 .align  16
 2660 cbc_se_handler:
 2661     push    %rsi            # save every register the handler itself overwrites
 2662     push    %rdi
 2663     push    %rbx
 2664     push    %rbp
 2665     push    %r12
 2666     push    %r13
 2667     push    %r14
 2668     push    %r15
 2669     pushfq
 2670     sub \$64,%rsp           # scratch area; 32..56(%rsp) later carry the stack arguments of RtlVirtualUnwind
 2671 # Start from the interrupted routine's Rax and Rip as recorded in CONTEXT.
 2672     mov 120($context),%rax  # pull context->Rax
 2673     mov 248($context),%rbx  # pull context->Rip
 2674 # Rip below .Lcbc_prologue: only rdi/rsi are recovered, with context->Rax as the frame address.
 2675     lea .Lcbc_prologue(%rip),%r10
 2676     cmp %r10,%rbx       # context->Rip<.Lcbc_prologue
 2677     jb  .Lin_cbc_prologue
 2678 # Rip below .Lcbc_fast_body: recover registers relative to context->Rax (NOTE(review): presumably rax still holds the entry-time stack pointer in that range - confirm against AES_cbc_encrypt).
 2679     lea .Lcbc_fast_body(%rip),%r10
 2680     cmp %r10,%rbx       # context->Rip<.Lcbc_fast_body
 2681     jb  .Lin_cbc_frame_setup
 2682 # Rip below .Lcbc_slow_prologue: handled as a fault inside the body.
 2683     lea .Lcbc_slow_prologue(%rip),%r10
 2684     cmp %r10,%rbx       # context->Rip<.Lcbc_slow_prologue
 2685     jb  .Lin_cbc_body
 2686 # Rip below .Lcbc_slow_body: same treatment as the range before .Lcbc_fast_body.
 2687     lea .Lcbc_slow_body(%rip),%r10
 2688     cmp %r10,%rbx       # context->Rip<.Lcbc_slow_body
 2689     jb  .Lin_cbc_frame_setup
 2690 # Anything at or beyond .Lcbc_slow_body falls through into the body case.
 2691 .Lin_cbc_body:
 2692     mov 152($context),%rax  # pull context->Rsp
 2693 # Rip at or past .Lcbc_epilogue: no register recovery, context->Rsp is used unchanged.
 2694     lea .Lcbc_epilogue(%rip),%r10
 2695     cmp %r10,%rbx       # context->Rip>=.Lcbc_epilogue
 2696     jae .Lin_cbc_prologue
 2697 # Skip one 8-byte stack slot (NOTE(review): presumably the flags word that is popped at .Lcbc_popfq - confirm).
 2698     lea 8(%rax),%rax
 2699 # Rip at or past .Lcbc_popfq: again only rdi/rsi are recovered, from the adjusted pointer.
 2700     lea .Lcbc_popfq(%rip),%r10
 2701     cmp %r10,%rbx       # context->Rip>=.Lcbc_popfq
 2702     jae .Lin_cbc_prologue
 2703 # Inside the body: load the stack pointer saved in the frame (the 16-8 offset presumably compensates for the 8 added above), then step 56 bytes past it.
 2704     mov `16-8`(%rax),%rax   # biased $_rsp
 2705     lea 56(%rax),%rax
 2706 # Register recovery: the six callee-saved registers are read from -16..-56 relative to rax, one slot lower than in key_se_handler.
 2707 .Lin_cbc_frame_setup:
 2708     mov -16(%rax),%rbx
 2709     mov -24(%rax),%rbp
 2710     mov -32(%rax),%r12
 2711     mov -40(%rax),%r13
 2712     mov -48(%rax),%r14
 2713     mov -56(%rax),%r15
 2714     mov %rbx,144($context)  # restore context->Rbx
 2715     mov %rbp,160($context)  # restore context->Rbp
 2716     mov %r12,216($context)  # restore context->R12
 2717     mov %r13,224($context)  # restore context->R13
 2718     mov %r14,232($context)  # restore context->R14
 2719     mov %r15,240($context)  # restore context->R15
 2720 # Recover rdi/rsi from 8 and 16 bytes above rax and record rax as the caller-side Rsp.
 2721 .Lin_cbc_prologue:
 2722     mov 8(%rax),%rdi
 2723     mov 16(%rax),%rsi
 2724     mov %rax,152($context)  # restore context->Rsp
 2725     mov %rsi,168($context)  # restore context->Rsi
 2726     mov %rdi,176($context)  # restore context->Rdi
 2727 # Exit path shared by block_se_handler, key_se_handler and cbc_se_handler.
 2728 .Lcommon_seh_exit:
 2729 # Copy the patched CONTEXT over disp->ContextRecord, 1232/8 quadwords at a time.
 2730     mov 40($disp),%rdi      # disp->ContextRecord
 2731     mov $context,%rsi       # context
 2732     mov \$`1232/8`,%ecx     # sizeof(CONTEXT) in 8-byte words
 2733     .long   0xa548f3fc      # cld; rep movsq
 2734 # Set up the eight arguments of RtlVirtualUnwind: four in registers, four on the stack starting at 32(%rsp).
 2735     mov $disp,%rsi
 2736     xor %rcx,%rcx       # arg1, UNW_FLAG_NHANDLER
 2737     mov 8(%rsi),%rdx        # arg2, disp->ImageBase
 2738     mov 0(%rsi),%r8     # arg3, disp->ControlPc
 2739     mov 16(%rsi),%r9        # arg4, disp->FunctionEntry
 2740     mov 40(%rsi),%r10       # disp->ContextRecord
 2741     lea 56(%rsi),%r11       # &disp->HandlerData
 2742     lea 24(%rsi),%r12       # &disp->EstablisherFrame
 2743     mov %r10,32(%rsp)       # arg5
 2744     mov %r11,40(%rsp)       # arg6
 2745     mov %r12,48(%rsp)       # arg7
 2746     mov %rcx,56(%rsp)       # arg8, (NULL)
 2747     call    *__imp_RtlVirtualUnwind(%rip)
 2748 # Return ExceptionContinueSearch, then undo the handler prologue in reverse order.
 2749     mov \$1,%eax        # ExceptionContinueSearch
 2750     add \$64,%rsp
 2751     popfq
 2752     pop %r15
 2753     pop %r14
 2754     pop %r13
 2755     pop %r12
 2756     pop %rbp
 2757     pop %rbx
 2758     pop %rdi
 2759     pop %rsi
 2760     ret
 2761 .size   cbc_se_handler,.-cbc_se_handler
 2762 # .pdata: one three-word record per guarded routine, each word an image-relative address (.rva).
 2763 .section    .pdata
 2764 .align  4
 2765     .rva    .LSEH_begin_AES_encrypt     # first address covered by the record
 2766     .rva    .LSEH_end_AES_encrypt       # end of the covered range
 2767     .rva    .LSEH_info_AES_encrypt      # matching record in .xdata
 2768 
 2769     .rva    .LSEH_begin_AES_decrypt
 2770     .rva    .LSEH_end_AES_decrypt
 2771     .rva    .LSEH_info_AES_decrypt
 2772 
 2773     .rva    .LSEH_begin_private_AES_set_encrypt_key
 2774     .rva    .LSEH_end_private_AES_set_encrypt_key
 2775     .rva    .LSEH_info_private_AES_set_encrypt_key
 2776 
 2777     .rva    .LSEH_begin_private_AES_set_decrypt_key
 2778     .rva    .LSEH_end_private_AES_set_decrypt_key
 2779     .rva    .LSEH_info_private_AES_set_decrypt_key
 2780 
 2781     .rva    .LSEH_begin_AES_cbc_encrypt
 2782     .rva    .LSEH_end_AES_cbc_encrypt
 2783     .rva    .LSEH_info_AES_cbc_encrypt
 2784 # .xdata: the records named by the third word of each .pdata entry; each gives a 4-byte header, the handler address and, where the handler needs it, a pair of label addresses as HandlerData.
 2785 .section    .xdata
 2786 .align  8
 2787 .LSEH_info_AES_encrypt:
 2788     .byte   9,0,0,0     # NOTE(review): per the Win64 UNWIND_INFO layout this reads as version 1 with the exception-handler flag and no unwind codes - confirm
 2789     .rva    block_se_handler
 2790     .rva    .Lenc_prologue,.Lenc_epilogue   # HandlerData[]
 2791 .LSEH_info_AES_decrypt:
 2792     .byte   9,0,0,0
 2793     .rva    block_se_handler
 2794     .rva    .Ldec_prologue,.Ldec_epilogue   # HandlerData[]
 2795 .LSEH_info_private_AES_set_encrypt_key:
 2796     .byte   9,0,0,0
 2797     .rva    key_se_handler
 2798     .rva    .Lenc_key_prologue,.Lenc_key_epilogue   # HandlerData[]
 2799 .LSEH_info_private_AES_set_decrypt_key:
 2800     .byte   9,0,0,0
 2801     .rva    key_se_handler
 2802     .rva    .Ldec_key_prologue,.Ldec_key_epilogue   # HandlerData[]
 2803 .LSEH_info_AES_cbc_encrypt:
 2804     .byte   9,0,0,0
 2805     .rva    cbc_se_handler      # no HandlerData follows: this handler compares Rip against labels directly
 2806 ___
 2807 }
 2808 
 2809 $code =~ s/\`([^\`]*)\`/eval($1)/gem;   # replace each backtick-quoted expression with its evaluated value
 2810 
 2811 print $code;
 2812 # Checking close is what surfaces buffered write errors (and, if STDOUT is a pipe, a failing consumer); unchecked, truncated output would be reported as success.
 2813 close STDOUT or die "error closing STDOUT: $!";