"Fossies" - the Fresh Open Source Software Archive

Member "openssl-1.0.2q/crypto/aes/asm/aest4-sparcv9.pl" (20 Nov 2018, 22891 Bytes) of package /linux/misc/openssl-1.0.2q.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "aest4-sparcv9.pl" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 1.1.0g_vs_1.1.1-pre2.

    1 #!/usr/bin/env perl
    2 
    3 # ====================================================================
    4 # Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
    5 # <appro@openssl.org>. The module is licensed under 2-clause BSD
    6 # license. October 2012. All rights reserved.
    7 # ====================================================================
    8 
    9 ######################################################################
   10 # AES for SPARC T4.
   11 #
   12 # AES round instructions complete in 3 cycles and can be issued every
   13 # cycle. It means that round calculations should take 4*rounds cycles,
   14 # because any given round instruction depends on result of *both*
   15 # previous instructions:
   16 #
   17 #   |0 |1 |2 |3 |4
   18 #   |01|01|01|
   19 #      |23|23|23|
   20 #               |01|01|...
   21 #                  |23|...
   22 #
   23 # Provided that fxor [with IV] takes 3 cycles to complete, critical
   24 # path length for CBC encrypt would be 3+4*rounds, or in other words
   25 # it should process one byte in at least (3+4*rounds)/16 cycles. This
   26 # estimate doesn't account for "collateral" instructions, such as
   27 # fetching input from memory, xor-ing it with zero-round key and
   28 # storing the result. Yet, *measured* performance [for data aligned
   29 # at 64-bit boundary!] deviates from this equation by less than 0.5%:
   30 #
   31 #       128-bit key 192-        256-
   32 # CBC encrypt   2.70/2.90(*)    3.20/3.40   3.70/3.90
   33 #            (*) numbers after slash are for
   34 #                misaligned data.
   35 #
   36 # Out-of-order execution logic managed to fully overlap "collateral"
   37 # instructions with those on critical path. Amazing!
   38 #
   39 # As with Intel AES-NI, question is if it's possible to improve
   40 # performance of parallelizable modes by interleaving round
   41 # instructions. Provided round instruction latency and throughput
   42 # optimal interleave factor is 2. But can we expect 2x performance
   43 # improvement? Well, as round instructions can be issued one per
   44 # cycle, they don't saturate the 2-way issue pipeline and therefore
   45 # there is room for "collateral" calculations... Yet, 2x speed-up
   46 # over CBC encrypt remains unattainable:
   47 #
   48 #       128-bit key 192-        256-
   49 # CBC decrypt   1.64/2.11   1.89/2.37   2.23/2.61
   50 # CTR       1.64/2.08(*)    1.89/2.33   2.23/2.61
   51 #            (*) numbers after slash are for
   52 #                misaligned data.
   53 #
   54 # Estimates based on amount of instructions under assumption that
   55 # round instructions are not pairable with any other instruction
   56 # suggest that latter is the actual case and pipeline runs
   57 # underutilized. It should be noted that T4 out-of-order execution
   58 # logic is so capable that performance gain from 2x interleave is
   59 # not even impressive, ~7-13% over non-interleaved code, largest
   60 # for 256-bit keys.
   61 
   62 # To anchor to something else, software implementation processes
   63 # one byte in 29 cycles with 128-bit key on same processor. Intel
   64 # Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
   65 # in 0.93, naturally with AES-NI.
   66 
   67 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
   68 push(@INC,"${dir}","${dir}../../perlasm");
   69 require "sparcv9_modes.pl";
   70 
   71 &asm_init(@ARGV);
   72 
   73 $::evp=1;   # if $evp is set to 0, script generates module with
   74 # AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
   75 # points. These however are not fully compatible with openssl/aes.h,
   76 # because they expect AES_KEY to be aligned at 64-bit boundary. When
   77 # used through EVP, alignment is arranged at EVP layer. Second thing
   78 # that is arranged by EVP is at least 32-bit alignment of IV.
   79 
   80 ######################################################################
   81 # single-round subroutines
   82 #
   83 {
   84 my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
   85 
   86 $code.=<<___ if ($::abibits==64);
   87 .register   %g2,#scratch
   88 .register   %g3,#scratch
   89 
   90 ___
   91 $code.=<<___;
   92 .text
   93 
   94 .globl  aes_t4_encrypt
   95 .align  32
   96 aes_t4_encrypt:
   97     andcc       $inp, 7, %g1        ! is input aligned?
   98     andn        $inp, 7, $inp
   99 
  100     ldx     [$key + 0], %g4
  101     ldx     [$key + 8], %g5
  102 
  103     ldx     [$inp + 0], %o4
  104     bz,pt       %icc, 1f
  105     ldx     [$inp + 8], %o5
  106     ldx     [$inp + 16], $inp
  107     sll     %g1, 3, %g1
  108     sub     %g0, %g1, %o3
  109     sllx        %o4, %g1, %o4
  110     sllx        %o5, %g1, %g1
  111     srlx        %o5, %o3, %o5
  112     srlx        $inp, %o3, %o3
  113     or      %o5, %o4, %o4
  114     or      %o3, %g1, %o5
  115 1:
  116     ld      [$key + 240], $rounds
  117     ldd     [$key + 16], %f12
  118     ldd     [$key + 24], %f14
  119     xor     %g4, %o4, %o4
  120     xor     %g5, %o5, %o5
  121     movxtod     %o4, %f0
  122     movxtod     %o5, %f2
  123     srl     $rounds, 1, $rounds
  124     ldd     [$key + 32], %f16
  125     sub     $rounds, 1, $rounds
  126     ldd     [$key + 40], %f18
  127     add     $key, 48, $key
  128 
  129 .Lenc:
  130     aes_eround01    %f12, %f0, %f2, %f4
  131     aes_eround23    %f14, %f0, %f2, %f2
  132     ldd     [$key + 0], %f12
  133     ldd     [$key + 8], %f14
  134     sub     $rounds,1,$rounds
  135     aes_eround01    %f16, %f4, %f2, %f0
  136     aes_eround23    %f18, %f4, %f2, %f2
  137     ldd     [$key + 16], %f16
  138     ldd     [$key + 24], %f18
  139     brnz,pt     $rounds, .Lenc
  140     add     $key, 32, $key
  141 
  142     andcc       $out, 7, $tmp       ! is output aligned?
  143     aes_eround01    %f12, %f0, %f2, %f4
  144     aes_eround23    %f14, %f0, %f2, %f2
  145     aes_eround01_l  %f16, %f4, %f2, %f0
  146     aes_eround23_l  %f18, %f4, %f2, %f2
  147 
  148     bnz,pn      %icc, 2f
  149     nop
  150 
  151     std     %f0, [$out + 0]
  152     retl
  153     std     %f2, [$out + 8]
  154 
  155 2:  alignaddrl  $out, %g0, $out
  156     mov     0xff, $mask
  157     srl     $mask, $tmp, $mask
  158 
  159     faligndata  %f0, %f0, %f4
  160     faligndata  %f0, %f2, %f6
  161     faligndata  %f2, %f2, %f8
  162 
  163     stda        %f4, [$out + $mask]0xc0 ! partial store
  164     std     %f6, [$out + 8]
  165     add     $out, 16, $out
  166     orn     %g0, $mask, $mask
  167     retl
  168     stda        %f8, [$out + $mask]0xc0 ! partial store
  169 .type   aes_t4_encrypt,#function
  170 .size   aes_t4_encrypt,.-aes_t4_encrypt
  171 
  172 .globl  aes_t4_decrypt
  173 .align  32
  174 aes_t4_decrypt:
  175     andcc       $inp, 7, %g1        ! is input aligned?
  176     andn        $inp, 7, $inp
  177 
  178     ldx     [$key + 0], %g4
  179     ldx     [$key + 8], %g5
  180 
  181     ldx     [$inp + 0], %o4
  182     bz,pt       %icc, 1f
  183     ldx     [$inp + 8], %o5
  184     ldx     [$inp + 16], $inp
  185     sll     %g1, 3, %g1
  186     sub     %g0, %g1, %o3
  187     sllx        %o4, %g1, %o4
  188     sllx        %o5, %g1, %g1
  189     srlx        %o5, %o3, %o5
  190     srlx        $inp, %o3, %o3
  191     or      %o5, %o4, %o4
  192     or      %o3, %g1, %o5
  193 1:
  194     ld      [$key + 240], $rounds
  195     ldd     [$key + 16], %f12
  196     ldd     [$key + 24], %f14
  197     xor     %g4, %o4, %o4
  198     xor     %g5, %o5, %o5
  199     movxtod     %o4, %f0
  200     movxtod     %o5, %f2
  201     srl     $rounds, 1, $rounds
  202     ldd     [$key + 32], %f16
  203     sub     $rounds, 1, $rounds
  204     ldd     [$key + 40], %f18
  205     add     $key, 48, $key
  206 
  207 .Ldec:
  208     aes_dround01    %f12, %f0, %f2, %f4
  209     aes_dround23    %f14, %f0, %f2, %f2
  210     ldd     [$key + 0], %f12
  211     ldd     [$key + 8], %f14
  212     sub     $rounds,1,$rounds
  213     aes_dround01    %f16, %f4, %f2, %f0
  214     aes_dround23    %f18, %f4, %f2, %f2
  215     ldd     [$key + 16], %f16
  216     ldd     [$key + 24], %f18
  217     brnz,pt     $rounds, .Ldec
  218     add     $key, 32, $key
  219 
  220     andcc       $out, 7, $tmp       ! is output aligned?
  221     aes_dround01    %f12, %f0, %f2, %f4
  222     aes_dround23    %f14, %f0, %f2, %f2
  223     aes_dround01_l  %f16, %f4, %f2, %f0
  224     aes_dround23_l  %f18, %f4, %f2, %f2
  225 
  226     bnz,pn      %icc, 2f
  227     nop
  228 
  229     std     %f0, [$out + 0]
  230     retl
  231     std     %f2, [$out + 8]
  232 
  233 2:  alignaddrl  $out, %g0, $out
  234     mov     0xff, $mask
  235     srl     $mask, $tmp, $mask
  236 
  237     faligndata  %f0, %f0, %f4
  238     faligndata  %f0, %f2, %f6
  239     faligndata  %f2, %f2, %f8
  240 
  241     stda        %f4, [$out + $mask]0xc0 ! partial store
  242     std     %f6, [$out + 8]
  243     add     $out, 16, $out
  244     orn     %g0, $mask, $mask
  245     retl
  246     stda        %f8, [$out + $mask]0xc0 ! partial store
  247 .type   aes_t4_decrypt,#function
  248 .size   aes_t4_decrypt,.-aes_t4_decrypt
  249 ___
  250 }
  251 
  252 ######################################################################
  253 # key setup subroutines
  254 #
  255 {
  256 my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
  257 $code.=<<___;
  258 .globl  aes_t4_set_encrypt_key
  259 .align  32
  260 aes_t4_set_encrypt_key:
  261 .Lset_encrypt_key:
  262     and     $inp, 7, $tmp
  263     alignaddr   $inp, %g0, $inp
  264     cmp     $bits, 192
  265     ldd     [$inp + 0], %f0
  266     bl,pt       %icc,.L128
  267     ldd     [$inp + 8], %f2
  268 
  269     be,pt       %icc,.L192
  270     ldd     [$inp + 16], %f4
  271     brz,pt      $tmp, .L256aligned
  272     ldd     [$inp + 24], %f6
  273 
  274     ldd     [$inp + 32], %f8
  275     faligndata  %f0, %f2, %f0
  276     faligndata  %f2, %f4, %f2
  277     faligndata  %f4, %f6, %f4
  278     faligndata  %f6, %f8, %f6
  279 .L256aligned:
  280 ___
  281 for ($i=0; $i<6; $i++) {
  282     $code.=<<___;
  283     std     %f0, [$out + `32*$i+0`]
  284     aes_kexpand1    %f0, %f6, $i, %f0
  285     std     %f2, [$out + `32*$i+8`]
  286     aes_kexpand2    %f2, %f0, %f2
  287     std     %f4, [$out + `32*$i+16`]
  288     aes_kexpand0    %f4, %f2, %f4
  289     std     %f6, [$out + `32*$i+24`]
  290     aes_kexpand2    %f6, %f4, %f6
  291 ___
  292 }
  293 $code.=<<___;
  294     std     %f0, [$out + `32*$i+0`]
  295     aes_kexpand1    %f0, %f6, $i, %f0
  296     std     %f2, [$out + `32*$i+8`]
  297     aes_kexpand2    %f2, %f0, %f2
  298     std     %f4, [$out + `32*$i+16`]
  299     std     %f6, [$out + `32*$i+24`]
  300     std     %f0, [$out + `32*$i+32`]
  301     std     %f2, [$out + `32*$i+40`]
  302 
  303     mov     14, $tmp
  304     st      $tmp, [$out + 240]
  305     retl
  306     xor     %o0, %o0, %o0
  307 
  308 .align  16
  309 .L192:
  310     brz,pt      $tmp, .L192aligned
  311     nop
  312 
  313     ldd     [$inp + 24], %f6
  314     faligndata  %f0, %f2, %f0
  315     faligndata  %f2, %f4, %f2
  316     faligndata  %f4, %f6, %f4
  317 .L192aligned:
  318 ___
  319 for ($i=0; $i<7; $i++) {
  320     $code.=<<___;
  321     std     %f0, [$out + `24*$i+0`]
  322     aes_kexpand1    %f0, %f4, $i, %f0
  323     std     %f2, [$out + `24*$i+8`]
  324     aes_kexpand2    %f2, %f0, %f2
  325     std     %f4, [$out + `24*$i+16`]
  326     aes_kexpand2    %f4, %f2, %f4
  327 ___
  328 }
  329 $code.=<<___;
  330     std     %f0, [$out + `24*$i+0`]
  331     aes_kexpand1    %f0, %f4, $i, %f0
  332     std     %f2, [$out + `24*$i+8`]
  333     aes_kexpand2    %f2, %f0, %f2
  334     std     %f4, [$out + `24*$i+16`]
  335     std     %f0, [$out + `24*$i+24`]
  336     std     %f2, [$out + `24*$i+32`]
  337 
  338     mov     12, $tmp
  339     st      $tmp, [$out + 240]
  340     retl
  341     xor     %o0, %o0, %o0
  342 
  343 .align  16
  344 .L128:
  345     brz,pt      $tmp, .L128aligned
  346     nop
  347 
  348     ldd     [$inp + 16], %f4
  349     faligndata  %f0, %f2, %f0
  350     faligndata  %f2, %f4, %f2
  351 .L128aligned:
  352 ___
  353 for ($i=0; $i<10; $i++) {
  354     $code.=<<___;
  355     std     %f0, [$out + `16*$i+0`]
  356     aes_kexpand1    %f0, %f2, $i, %f0
  357     std     %f2, [$out + `16*$i+8`]
  358     aes_kexpand2    %f2, %f0, %f2
  359 ___
  360 }
  361 $code.=<<___;
  362     std     %f0, [$out + `16*$i+0`]
  363     std     %f2, [$out + `16*$i+8`]
  364 
  365     mov     10, $tmp
  366     st      $tmp, [$out + 240]
  367     retl
  368     xor     %o0, %o0, %o0
  369 .type   aes_t4_set_encrypt_key,#function
  370 .size   aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
  371 
  372 .globl  aes_t4_set_decrypt_key
  373 .align  32
  374 aes_t4_set_decrypt_key:
  375     mov     %o7, %o5
  376     call        .Lset_encrypt_key
  377     nop
  378 
  379     mov     %o5, %o7
  380     sll     $tmp, 4, $inp       ! $tmp is number of rounds
  381     add     $tmp, 2, $tmp
  382     add     $out, $inp, $inp    ! $inp=$out+16*rounds
  383     srl     $tmp, 2, $tmp       ! $tmp=(rounds+2)/4
  384 
  385 .Lkey_flip:
  386     ldd     [$out + 0],  %f0
  387     ldd     [$out + 8],  %f2
  388     ldd     [$out + 16], %f4
  389     ldd     [$out + 24], %f6
  390     ldd     [$inp + 0],  %f8
  391     ldd     [$inp + 8],  %f10
  392     ldd     [$inp - 16], %f12
  393     ldd     [$inp - 8],  %f14
  394     sub     $tmp, 1, $tmp
  395     std     %f0, [$inp + 0]
  396     std     %f2, [$inp + 8]
  397     std     %f4, [$inp - 16]
  398     std     %f6, [$inp - 8]
  399     std     %f8, [$out + 0]
  400     std     %f10, [$out + 8]
  401     std     %f12, [$out + 16]
  402     std     %f14, [$out + 24]
  403     add     $out, 32, $out
  404     brnz        $tmp, .Lkey_flip
  405     sub     $inp, 32, $inp
  406 
  407     retl
  408     xor     %o0, %o0, %o0
  409 .type   aes_t4_set_decrypt_key,#function
  410 .size   aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
  411 ___
  412 }
  413 
  414 {{{
  415 my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
  416 my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
  417 
  418 $code.=<<___;
  419 .align  32
  420 _aes128_encrypt_1x:
  421 ___
  422 for ($i=0; $i<4; $i++) {
  423     $code.=<<___;
  424     aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f4
  425     aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
  426     aes_eround01    %f`16+8*$i+4`, %f4, %f2, %f0
  427     aes_eround23    %f`16+8*$i+6`, %f4, %f2, %f2
  428 ___
  429 }
  430 $code.=<<___;
  431     aes_eround01    %f48, %f0, %f2, %f4
  432     aes_eround23    %f50, %f0, %f2, %f2
  433     aes_eround01_l  %f52, %f4, %f2, %f0
  434     retl
  435     aes_eround23_l  %f54, %f4, %f2, %f2
  436 .type   _aes128_encrypt_1x,#function
  437 .size   _aes128_encrypt_1x,.-_aes128_encrypt_1x
  438 
  439 .align  32
  440 _aes128_encrypt_2x:
  441 ___
  442 for ($i=0; $i<4; $i++) {
  443     $code.=<<___;
  444     aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f8
  445     aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
  446     aes_eround01    %f`16+8*$i+0`, %f4, %f6, %f10
  447     aes_eround23    %f`16+8*$i+2`, %f4, %f6, %f6
  448     aes_eround01    %f`16+8*$i+4`, %f8, %f2, %f0
  449     aes_eround23    %f`16+8*$i+6`, %f8, %f2, %f2
  450     aes_eround01    %f`16+8*$i+4`, %f10, %f6, %f4
  451     aes_eround23    %f`16+8*$i+6`, %f10, %f6, %f6
  452 ___
  453 }
  454 $code.=<<___;
  455     aes_eround01    %f48, %f0, %f2, %f8
  456     aes_eround23    %f50, %f0, %f2, %f2
  457     aes_eround01    %f48, %f4, %f6, %f10
  458     aes_eround23    %f50, %f4, %f6, %f6
  459     aes_eround01_l  %f52, %f8, %f2, %f0
  460     aes_eround23_l  %f54, %f8, %f2, %f2
  461     aes_eround01_l  %f52, %f10, %f6, %f4
  462     retl
  463     aes_eround23_l  %f54, %f10, %f6, %f6
  464 .type   _aes128_encrypt_2x,#function
  465 .size   _aes128_encrypt_2x,.-_aes128_encrypt_2x
  466 
  467 .align  32
  468 _aes128_loadkey:
  469     ldx     [$key + 0], %g4
  470     ldx     [$key + 8], %g5
  471 ___
  472 for ($i=2; $i<22;$i++) {            # load key schedule
  473     $code.=<<___;
  474     ldd     [$key + `8*$i`], %f`12+2*$i`
  475 ___
  476 }
  477 $code.=<<___;
  478     retl
  479     nop
  480 .type   _aes128_loadkey,#function
  481 .size   _aes128_loadkey,.-_aes128_loadkey
  482 _aes128_load_enckey=_aes128_loadkey
  483 _aes128_load_deckey=_aes128_loadkey
  484 
  485 ___
  486 
  487 &alg_cbc_encrypt_implement("aes",128);
  488 if ($::evp) {
  489     &alg_ctr32_implement("aes",128);
  490     &alg_xts_implement("aes",128,"en");
  491     &alg_xts_implement("aes",128,"de");
  492 }
  493 &alg_cbc_decrypt_implement("aes",128);
  494 
  495 $code.=<<___;
  496 .align  32
  497 _aes128_decrypt_1x:
  498 ___
  499 for ($i=0; $i<4; $i++) {
  500     $code.=<<___;
  501     aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f4
  502     aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
  503     aes_dround01    %f`16+8*$i+4`, %f4, %f2, %f0
  504     aes_dround23    %f`16+8*$i+6`, %f4, %f2, %f2
  505 ___
  506 }
  507 $code.=<<___;
  508     aes_dround01    %f48, %f0, %f2, %f4
  509     aes_dround23    %f50, %f0, %f2, %f2
  510     aes_dround01_l  %f52, %f4, %f2, %f0
  511     retl
  512     aes_dround23_l  %f54, %f4, %f2, %f2
  513 .type   _aes128_decrypt_1x,#function
  514 .size   _aes128_decrypt_1x,.-_aes128_decrypt_1x
  515 
  516 .align  32
  517 _aes128_decrypt_2x:
  518 ___
  519 for ($i=0; $i<4; $i++) {
  520     $code.=<<___;
  521     aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f8
  522     aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
  523     aes_dround01    %f`16+8*$i+0`, %f4, %f6, %f10
  524     aes_dround23    %f`16+8*$i+2`, %f4, %f6, %f6
  525     aes_dround01    %f`16+8*$i+4`, %f8, %f2, %f0
  526     aes_dround23    %f`16+8*$i+6`, %f8, %f2, %f2
  527     aes_dround01    %f`16+8*$i+4`, %f10, %f6, %f4
  528     aes_dround23    %f`16+8*$i+6`, %f10, %f6, %f6
  529 ___
  530 }
  531 $code.=<<___;
  532     aes_dround01    %f48, %f0, %f2, %f8
  533     aes_dround23    %f50, %f0, %f2, %f2
  534     aes_dround01    %f48, %f4, %f6, %f10
  535     aes_dround23    %f50, %f4, %f6, %f6
  536     aes_dround01_l  %f52, %f8, %f2, %f0
  537     aes_dround23_l  %f54, %f8, %f2, %f2
  538     aes_dround01_l  %f52, %f10, %f6, %f4
  539     retl
  540     aes_dround23_l  %f54, %f10, %f6, %f6
  541 .type   _aes128_decrypt_2x,#function
  542 .size   _aes128_decrypt_2x,.-_aes128_decrypt_2x
  543 ___
  544 
  545 $code.=<<___;
  546 .align  32
  547 _aes192_encrypt_1x:
  548 ___
  549 for ($i=0; $i<5; $i++) {
  550     $code.=<<___;
  551     aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f4
  552     aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
  553     aes_eround01    %f`16+8*$i+4`, %f4, %f2, %f0
  554     aes_eround23    %f`16+8*$i+6`, %f4, %f2, %f2
  555 ___
  556 }
  557 $code.=<<___;
  558     aes_eround01    %f56, %f0, %f2, %f4
  559     aes_eround23    %f58, %f0, %f2, %f2
  560     aes_eround01_l  %f60, %f4, %f2, %f0
  561     retl
  562     aes_eround23_l  %f62, %f4, %f2, %f2
  563 .type   _aes192_encrypt_1x,#function
  564 .size   _aes192_encrypt_1x,.-_aes192_encrypt_1x
  565 
  566 .align  32
  567 _aes192_encrypt_2x:
  568 ___
  569 for ($i=0; $i<5; $i++) {
  570     $code.=<<___;
  571     aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f8
  572     aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
  573     aes_eround01    %f`16+8*$i+0`, %f4, %f6, %f10
  574     aes_eround23    %f`16+8*$i+2`, %f4, %f6, %f6
  575     aes_eround01    %f`16+8*$i+4`, %f8, %f2, %f0
  576     aes_eround23    %f`16+8*$i+6`, %f8, %f2, %f2
  577     aes_eround01    %f`16+8*$i+4`, %f10, %f6, %f4
  578     aes_eround23    %f`16+8*$i+6`, %f10, %f6, %f6
  579 ___
  580 }
  581 $code.=<<___;
  582     aes_eround01    %f56, %f0, %f2, %f8
  583     aes_eround23    %f58, %f0, %f2, %f2
  584     aes_eround01    %f56, %f4, %f6, %f10
  585     aes_eround23    %f58, %f4, %f6, %f6
  586     aes_eround01_l  %f60, %f8, %f2, %f0
  587     aes_eround23_l  %f62, %f8, %f2, %f2
  588     aes_eround01_l  %f60, %f10, %f6, %f4
  589     retl
  590     aes_eround23_l  %f62, %f10, %f6, %f6
  591 .type   _aes192_encrypt_2x,#function
  592 .size   _aes192_encrypt_2x,.-_aes192_encrypt_2x
  593 
  594 .align  32
  595 _aes256_encrypt_1x:
  596     aes_eround01    %f16, %f0, %f2, %f4
  597     aes_eround23    %f18, %f0, %f2, %f2
  598     ldd     [$key + 208], %f16
  599     ldd     [$key + 216], %f18
  600     aes_eround01    %f20, %f4, %f2, %f0
  601     aes_eround23    %f22, %f4, %f2, %f2
  602     ldd     [$key + 224], %f20
  603     ldd     [$key + 232], %f22
  604 ___
  605 for ($i=1; $i<6; $i++) {
  606     $code.=<<___;
  607     aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f4
  608     aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
  609     aes_eround01    %f`16+8*$i+4`, %f4, %f2, %f0
  610     aes_eround23    %f`16+8*$i+6`, %f4, %f2, %f2
  611 ___
  612 }
  613 $code.=<<___;
  614     aes_eround01    %f16, %f0, %f2, %f4
  615     aes_eround23    %f18, %f0, %f2, %f2
  616     ldd     [$key + 16], %f16
  617     ldd     [$key + 24], %f18
  618     aes_eround01_l  %f20, %f4, %f2, %f0
  619     aes_eround23_l  %f22, %f4, %f2, %f2
  620     ldd     [$key + 32], %f20
  621     retl
  622     ldd     [$key + 40], %f22
  623 .type   _aes256_encrypt_1x,#function
  624 .size   _aes256_encrypt_1x,.-_aes256_encrypt_1x
  625 
  626 .align  32
  627 _aes256_encrypt_2x:
  628     aes_eround01    %f16, %f0, %f2, %f8
  629     aes_eround23    %f18, %f0, %f2, %f2
  630     aes_eround01    %f16, %f4, %f6, %f10
  631     aes_eround23    %f18, %f4, %f6, %f6
  632     ldd     [$key + 208], %f16
  633     ldd     [$key + 216], %f18
  634     aes_eround01    %f20, %f8, %f2, %f0
  635     aes_eround23    %f22, %f8, %f2, %f2
  636     aes_eround01    %f20, %f10, %f6, %f4
  637     aes_eround23    %f22, %f10, %f6, %f6
  638     ldd     [$key + 224], %f20
  639     ldd     [$key + 232], %f22
  640 ___
  641 for ($i=1; $i<6; $i++) {
  642     $code.=<<___;
  643     aes_eround01    %f`16+8*$i+0`, %f0, %f2, %f8
  644     aes_eround23    %f`16+8*$i+2`, %f0, %f2, %f2
  645     aes_eround01    %f`16+8*$i+0`, %f4, %f6, %f10
  646     aes_eround23    %f`16+8*$i+2`, %f4, %f6, %f6
  647     aes_eround01    %f`16+8*$i+4`, %f8, %f2, %f0
  648     aes_eround23    %f`16+8*$i+6`, %f8, %f2, %f2
  649     aes_eround01    %f`16+8*$i+4`, %f10, %f6, %f4
  650     aes_eround23    %f`16+8*$i+6`, %f10, %f6, %f6
  651 ___
  652 }
  653 $code.=<<___;
  654     aes_eround01    %f16, %f0, %f2, %f8
  655     aes_eround23    %f18, %f0, %f2, %f2
  656     aes_eround01    %f16, %f4, %f6, %f10
  657     aes_eround23    %f18, %f4, %f6, %f6
  658     ldd     [$key + 16], %f16
  659     ldd     [$key + 24], %f18
  660     aes_eround01_l  %f20, %f8, %f2, %f0
  661     aes_eround23_l  %f22, %f8, %f2, %f2
  662     aes_eround01_l  %f20, %f10, %f6, %f4
  663     aes_eround23_l  %f22, %f10, %f6, %f6
  664     ldd     [$key + 32], %f20
  665     retl
  666     ldd     [$key + 40], %f22
  667 .type   _aes256_encrypt_2x,#function
  668 .size   _aes256_encrypt_2x,.-_aes256_encrypt_2x
  669 
  670 .align  32
  671 _aes192_loadkey:
  672     ldx     [$key + 0], %g4
  673     ldx     [$key + 8], %g5
  674 ___
  675 for ($i=2; $i<26;$i++) {            # load key schedule
  676     $code.=<<___;
  677     ldd     [$key + `8*$i`], %f`12+2*$i`
  678 ___
  679 }
  680 $code.=<<___;
  681     retl
  682     nop
  683 .type   _aes192_loadkey,#function
  684 .size   _aes192_loadkey,.-_aes192_loadkey
  685 _aes256_loadkey=_aes192_loadkey
  686 _aes192_load_enckey=_aes192_loadkey
  687 _aes192_load_deckey=_aes192_loadkey
  688 _aes256_load_enckey=_aes192_loadkey
  689 _aes256_load_deckey=_aes192_loadkey
  690 ___
  691 
  692 &alg_cbc_encrypt_implement("aes",256);
  693 &alg_cbc_encrypt_implement("aes",192);
  694 if ($::evp) {
  695     &alg_ctr32_implement("aes",256);
  696     &alg_xts_implement("aes",256,"en");
  697     &alg_xts_implement("aes",256,"de");
  698     &alg_ctr32_implement("aes",192);
  699 }
  700 &alg_cbc_decrypt_implement("aes",192);
  701 &alg_cbc_decrypt_implement("aes",256);
  702 
  703 $code.=<<___;
  704 .align  32
  705 _aes256_decrypt_1x:
  706     aes_dround01    %f16, %f0, %f2, %f4
  707     aes_dround23    %f18, %f0, %f2, %f2
  708     ldd     [$key + 208], %f16
  709     ldd     [$key + 216], %f18
  710     aes_dround01    %f20, %f4, %f2, %f0
  711     aes_dround23    %f22, %f4, %f2, %f2
  712     ldd     [$key + 224], %f20
  713     ldd     [$key + 232], %f22
  714 ___
  715 for ($i=1; $i<6; $i++) {
  716     $code.=<<___;
  717     aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f4
  718     aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
  719     aes_dround01    %f`16+8*$i+4`, %f4, %f2, %f0
  720     aes_dround23    %f`16+8*$i+6`, %f4, %f2, %f2
  721 ___
  722 }
  723 $code.=<<___;
  724     aes_dround01    %f16, %f0, %f2, %f4
  725     aes_dround23    %f18, %f0, %f2, %f2
  726     ldd     [$key + 16], %f16
  727     ldd     [$key + 24], %f18
  728     aes_dround01_l  %f20, %f4, %f2, %f0
  729     aes_dround23_l  %f22, %f4, %f2, %f2
  730     ldd     [$key + 32], %f20
  731     retl
  732     ldd     [$key + 40], %f22
  733 .type   _aes256_decrypt_1x,#function
  734 .size   _aes256_decrypt_1x,.-_aes256_decrypt_1x
  735 
  736 .align  32
  737 _aes256_decrypt_2x:
  738     aes_dround01    %f16, %f0, %f2, %f8
  739     aes_dround23    %f18, %f0, %f2, %f2
  740     aes_dround01    %f16, %f4, %f6, %f10
  741     aes_dround23    %f18, %f4, %f6, %f6
  742     ldd     [$key + 208], %f16
  743     ldd     [$key + 216], %f18
  744     aes_dround01    %f20, %f8, %f2, %f0
  745     aes_dround23    %f22, %f8, %f2, %f2
  746     aes_dround01    %f20, %f10, %f6, %f4
  747     aes_dround23    %f22, %f10, %f6, %f6
  748     ldd     [$key + 224], %f20
  749     ldd     [$key + 232], %f22
  750 ___
  751 for ($i=1; $i<6; $i++) {
  752     $code.=<<___;
  753     aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f8
  754     aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
  755     aes_dround01    %f`16+8*$i+0`, %f4, %f6, %f10
  756     aes_dround23    %f`16+8*$i+2`, %f4, %f6, %f6
  757     aes_dround01    %f`16+8*$i+4`, %f8, %f2, %f0
  758     aes_dround23    %f`16+8*$i+6`, %f8, %f2, %f2
  759     aes_dround01    %f`16+8*$i+4`, %f10, %f6, %f4
  760     aes_dround23    %f`16+8*$i+6`, %f10, %f6, %f6
  761 ___
  762 }
  763 $code.=<<___;
  764     aes_dround01    %f16, %f0, %f2, %f8
  765     aes_dround23    %f18, %f0, %f2, %f2
  766     aes_dround01    %f16, %f4, %f6, %f10
  767     aes_dround23    %f18, %f4, %f6, %f6
  768     ldd     [$key + 16], %f16
  769     ldd     [$key + 24], %f18
  770     aes_dround01_l  %f20, %f8, %f2, %f0
  771     aes_dround23_l  %f22, %f8, %f2, %f2
  772     aes_dround01_l  %f20, %f10, %f6, %f4
  773     aes_dround23_l  %f22, %f10, %f6, %f6
  774     ldd     [$key + 32], %f20
  775     retl
  776     ldd     [$key + 40], %f22
  777 .type   _aes256_decrypt_2x,#function
  778 .size   _aes256_decrypt_2x,.-_aes256_decrypt_2x
  779 
  780 .align  32
  781 _aes192_decrypt_1x:
  782 ___
  783 for ($i=0; $i<5; $i++) {
  784     $code.=<<___;
  785     aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f4
  786     aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
  787     aes_dround01    %f`16+8*$i+4`, %f4, %f2, %f0
  788     aes_dround23    %f`16+8*$i+6`, %f4, %f2, %f2
  789 ___
  790 }
  791 $code.=<<___;
  792     aes_dround01    %f56, %f0, %f2, %f4
  793     aes_dround23    %f58, %f0, %f2, %f2
  794     aes_dround01_l  %f60, %f4, %f2, %f0
  795     retl
  796     aes_dround23_l  %f62, %f4, %f2, %f2
  797 .type   _aes192_decrypt_1x,#function
  798 .size   _aes192_decrypt_1x,.-_aes192_decrypt_1x
  799 
  800 .align  32
  801 _aes192_decrypt_2x:
  802 ___
  803 for ($i=0; $i<5; $i++) {
  804     $code.=<<___;
  805     aes_dround01    %f`16+8*$i+0`, %f0, %f2, %f8
  806     aes_dround23    %f`16+8*$i+2`, %f0, %f2, %f2
  807     aes_dround01    %f`16+8*$i+0`, %f4, %f6, %f10
  808     aes_dround23    %f`16+8*$i+2`, %f4, %f6, %f6
  809     aes_dround01    %f`16+8*$i+4`, %f8, %f2, %f0
  810     aes_dround23    %f`16+8*$i+6`, %f8, %f2, %f2
  811     aes_dround01    %f`16+8*$i+4`, %f10, %f6, %f4
  812     aes_dround23    %f`16+8*$i+6`, %f10, %f6, %f6
  813 ___
  814 }
  815 $code.=<<___;
  816     aes_dround01    %f56, %f0, %f2, %f8
  817     aes_dround23    %f58, %f0, %f2, %f2
  818     aes_dround01    %f56, %f4, %f6, %f10
  819     aes_dround23    %f58, %f4, %f6, %f6
  820     aes_dround01_l  %f60, %f8, %f2, %f0
  821     aes_dround23_l  %f62, %f8, %f2, %f2
  822     aes_dround01_l  %f60, %f10, %f6, %f4
  823     retl
  824     aes_dround23_l  %f62, %f10, %f6, %f6
  825 .type   _aes192_decrypt_2x,#function
  826 .size   _aes192_decrypt_2x,.-_aes192_decrypt_2x
  827 ___
  828 }}}
  829 
  830 if (!$::evp) {
  831 $code.=<<___;
  832 .global AES_encrypt
  833 AES_encrypt=aes_t4_encrypt
  834 .global AES_decrypt
  835 AES_decrypt=aes_t4_decrypt
  836 .global AES_set_encrypt_key
  837 .align  32
  838 AES_set_encrypt_key:
  839     andcc       %o2, 7, %g0     ! check alignment
  840     bnz,a,pn    %icc, 1f
  841     mov     -1, %o0
  842     brz,a,pn    %o0, 1f
  843     mov     -1, %o0
  844     brz,a,pn    %o2, 1f
  845     mov     -1, %o0
  846     andncc      %o1, 0x1c0, %g0
  847     bnz,a,pn    %icc, 1f
  848     mov     -2, %o0
  849     cmp     %o1, 128
  850     bl,a,pn     %icc, 1f
  851     mov     -2, %o0
  852     b       aes_t4_set_encrypt_key
  853     nop
  854 1:  retl
  855     nop
  856 .type   AES_set_encrypt_key,#function
  857 .size   AES_set_encrypt_key,.-AES_set_encrypt_key
  858 
  859 .global AES_set_decrypt_key
  860 .align  32
  861 AES_set_decrypt_key:
  862     andcc       %o2, 7, %g0     ! check alignment
  863     bnz,a,pn    %icc, 1f
  864     mov     -1, %o0
  865     brz,a,pn    %o0, 1f
  866     mov     -1, %o0
  867     brz,a,pn    %o2, 1f
  868     mov     -1, %o0
  869     andncc      %o1, 0x1c0, %g0
  870     bnz,a,pn    %icc, 1f
  871     mov     -2, %o0
  872     cmp     %o1, 128
  873     bl,a,pn     %icc, 1f
  874     mov     -2, %o0
  875     b       aes_t4_set_decrypt_key
  876     nop
  877 1:  retl
  878     nop
  879 .type   AES_set_decrypt_key,#function
  880 .size   AES_set_decrypt_key,.-AES_set_decrypt_key
  881 ___
  882 
  883 my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
  884 
  885 $code.=<<___;
  886 .globl  AES_cbc_encrypt
  887 .align  32
  888 AES_cbc_encrypt:
  889     ld      [$key + 240], %g1
  890     nop
  891     brz     $enc, .Lcbc_decrypt
  892     cmp     %g1, 12
  893 
  894     bl,pt       %icc, aes128_t4_cbc_encrypt
  895     nop
  896     be,pn       %icc, aes192_t4_cbc_encrypt
  897     nop
  898     ba      aes256_t4_cbc_encrypt
  899     nop
  900 
  901 .Lcbc_decrypt:
  902     bl,pt       %icc, aes128_t4_cbc_decrypt
  903     nop
  904     be,pn       %icc, aes192_t4_cbc_decrypt
  905     nop
  906     ba      aes256_t4_cbc_decrypt
  907     nop
  908 .type   AES_cbc_encrypt,#function
  909 .size   AES_cbc_encrypt,.-AES_cbc_encrypt
  910 ___
  911 }
  912 $code.=<<___;
  913 .asciz  "AES for SPARC T4, David S. Miller, Andy Polyakov"
  914 .align  4
  915 ___
  916 
  917 &emit_assembler();
  918 
  919 close STDOUT;