"Fossies" - the Fresh Open Source Software Archive

Member "openssl-1.0.2q/crypto/aes/asm/aesv8-armx.pl" (20 Nov 2018, 21533 Bytes) of package /linux/misc/openssl-1.0.2q.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "aesv8-armx.pl" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 1.1.0g_vs_1.1.1-pre2.

    1 #!/usr/bin/env perl
    2 #
    3 # ====================================================================
    4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
    5 # project. The module is, however, dual licensed under OpenSSL and
    6 # CRYPTOGAMS licenses depending on where you obtain it. For further
    7 # details see http://www.openssl.org/~appro/cryptogams/.
    8 # ====================================================================
    9 #
   10 # This module implements support for ARMv8 AES instructions. The
   11 # module is endian-agnostic in sense that it supports both big- and
   12 # little-endian cases. As does it support both 32- and 64-bit modes
   13 # of operation. Latter is achieved by limiting amount of utilized
   14 # registers to 16, which implies additional NEON load and integer
   15 # instructions. This has no effect on mighty Apple A7, where results
   16 # are literally equal to the theoretical estimates based on AES
   17 # instruction latencies and issue rates. On Cortex-A53, an in-order
   18 # execution core, this costs up to 10-15%, which is partially
   19 # compensated by implementing dedicated code path for 128-bit
   20 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
   21 # seems to be limited by sheer amount of NEON instructions...
   22 #
   23 # Performance in cycles per byte processed with 128-bit key:
   24 #
   25 #       CBC enc     CBC dec     CTR
   26 # Apple A7  2.39        1.20        1.20
   27 # Cortex-A53    1.32        1.29        1.46
   28 # Cortex-A57(*) 1.95        0.85        0.93
   29 # Denver    1.96        0.86        0.80
   30 #
   31 # (*)   original 3.64/1.34/1.32 results were for r0p0 revision
   32 #   and are still same even for updated module;
   33 
   34 $flavour = shift;   # "64" selects AArch64 output; anything else means AArch32
# All generated assembly is printed to STDOUT, which is redirected to the
# output file named by the second command-line argument. Fail loudly if the
# redirection itself fails, otherwise the build would silently produce nothing.
   35 open STDOUT,">".shift or die "can't open output file: $!";
   36 
   37 $prefix="aes_v8";   # symbol prefix for every generated function
   38 
   39 $code=<<___;
   40 #include "arm_arch.h"
   41 
   42 #if __ARM_MAX_ARCH__>=7
   43 .text
   44 ___
   45 $code.=".arch   armv8-a+crypto\n"           if ($flavour =~ /64/);
   46 $code.=".arch   armv7-a\n.fpu   neon\n.code 32\n"   if ($flavour !~ /64/);
   47         #^^^^^^ this is done to simplify adoption by not depending
   48         #   on latest binutils.
   49 
   50 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
   51 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
   52 # maintain both 32- and 64-bit codes within single module and
   53 # transliterate common code to either flavour with regex voodoo.
   54 #
   55 {{{
# Key-schedule generation: emits ${prefix}_set_encrypt_key and
# ${prefix}_set_decrypt_key. The register map is flavour-dependent: AArch64
# uses q0-q6 freely, while the AArch32 mapping avoids q4-q7 (whose halves
# d8-d15 are callee-saved) by substituting q8-q10.
   56 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
   57 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
   58     $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
   59 
   60 
   61 $code.=<<___;   # round-constant table, entry point and argument validation
   62 .align  5
   63 rcon:
   64 .long   0x01,0x01,0x01,0x01
   65 .long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
   66 .long   0x1b,0x1b,0x1b,0x1b
   67 
   68 .globl  ${prefix}_set_encrypt_key
   69 .type   ${prefix}_set_encrypt_key,%function
   70 .align  5
   71 ${prefix}_set_encrypt_key:
   72 .Lenc_key:
   73 ___
# AArch64-only frame setup; the 32-bit flavour needs no frame here.
   74 $code.=<<___    if ($flavour =~ /64/);
   75     stp x29,x30,[sp,#-16]!
   76     add x29,sp,#0
   77 ___
   78 $code.=<<___;
   79     mov $ptr,#-1
   80     cmp $inp,#0
   81     b.eq    .Lenc_key_abort
   82     cmp $out,#0
   83     b.eq    .Lenc_key_abort
   84     mov $ptr,#-2
   85     cmp $bits,#128
   86     b.lt    .Lenc_key_abort
   87     cmp $bits,#256
   88     b.gt    .Lenc_key_abort
   89     tst $bits,#0x3f
   90     b.ne    .Lenc_key_abort
   91 
   92     adr $ptr,rcon
   93     cmp $bits,#192
   94 
   95     veor    $zero,$zero,$zero
   96     vld1.8  {$in0},[$inp],#16
   97     mov $bits,#8        // reuse $bits
   98     vld1.32 {$rcon,$mask},[$ptr],#32
   99 
  100     b.lt    .Loop128
  101     b.eq    .L192
  102     b   .L256
  103 
  104 .align  4
  105 .Loop128:
  106     vtbl.8  $key,{$in0},$mask
  107     vext.8  $tmp,$zero,$in0,#12
  108     vst1.32 {$in0},[$out],#16
  109     aese    $key,$zero
  110     subs    $bits,$bits,#1
  111 
  112     veor    $in0,$in0,$tmp
  113     vext.8  $tmp,$zero,$tmp,#12
  114     veor    $in0,$in0,$tmp
  115     vext.8  $tmp,$zero,$tmp,#12
  116      veor   $key,$key,$rcon
  117     veor    $in0,$in0,$tmp
  118     vshl.u8 $rcon,$rcon,#1
  119     veor    $in0,$in0,$key
  120     b.ne    .Loop128
  121 
  122     vld1.32 {$rcon},[$ptr]
  123 
  124     vtbl.8  $key,{$in0},$mask
  125     vext.8  $tmp,$zero,$in0,#12
  126     vst1.32 {$in0},[$out],#16
  127     aese    $key,$zero
  128 
  129     veor    $in0,$in0,$tmp
  130     vext.8  $tmp,$zero,$tmp,#12
  131     veor    $in0,$in0,$tmp
  132     vext.8  $tmp,$zero,$tmp,#12
  133      veor   $key,$key,$rcon
  134     veor    $in0,$in0,$tmp
  135     vshl.u8 $rcon,$rcon,#1
  136     veor    $in0,$in0,$key
  137 
  138     vtbl.8  $key,{$in0},$mask
  139     vext.8  $tmp,$zero,$in0,#12
  140     vst1.32 {$in0},[$out],#16
  141     aese    $key,$zero
  142 
  143     veor    $in0,$in0,$tmp
  144     vext.8  $tmp,$zero,$tmp,#12
  145     veor    $in0,$in0,$tmp
  146     vext.8  $tmp,$zero,$tmp,#12
  147      veor   $key,$key,$rcon
  148     veor    $in0,$in0,$tmp
  149     veor    $in0,$in0,$key
  150     vst1.32 {$in0},[$out]
  151     add $out,$out,#0x50
  152 
  153     mov $rounds,#10
  154     b   .Ldone
  155 
  156 .align  4
  157 .L192:
  158     vld1.8  {$in1},[$inp],#8
  159     vmov.i8 $key,#8         // borrow $key
  160     vst1.32 {$in0},[$out],#16
  161     vsub.i8 $mask,$mask,$key    // adjust the mask
  162 
  163 .Loop192:
  164     vtbl.8  $key,{$in1},$mask
  165     vext.8  $tmp,$zero,$in0,#12
  166     vst1.32 {$in1},[$out],#8
  167     aese    $key,$zero
  168     subs    $bits,$bits,#1
  169 
  170     veor    $in0,$in0,$tmp
  171     vext.8  $tmp,$zero,$tmp,#12
  172     veor    $in0,$in0,$tmp
  173     vext.8  $tmp,$zero,$tmp,#12
  174     veor    $in0,$in0,$tmp
  175 
  176     vdup.32 $tmp,${in0}[3]
  177     veor    $tmp,$tmp,$in1
  178      veor   $key,$key,$rcon
  179     vext.8  $in1,$zero,$in1,#12
  180     vshl.u8 $rcon,$rcon,#1
  181     veor    $in1,$in1,$tmp
  182     veor    $in0,$in0,$key
  183     veor    $in1,$in1,$key
  184     vst1.32 {$in0},[$out],#16
  185     b.ne    .Loop192
  186 
  187     mov $rounds,#12
  188     add $out,$out,#0x20
  189     b   .Ldone
  190 
  191 .align  4
  192 .L256:
  193     vld1.8  {$in1},[$inp]
  194     mov $bits,#7
  195     mov $rounds,#14
  196     vst1.32 {$in0},[$out],#16
  197 
  198 .Loop256:
  199     vtbl.8  $key,{$in1},$mask
  200     vext.8  $tmp,$zero,$in0,#12
  201     vst1.32 {$in1},[$out],#16
  202     aese    $key,$zero
  203     subs    $bits,$bits,#1
  204 
  205     veor    $in0,$in0,$tmp
  206     vext.8  $tmp,$zero,$tmp,#12
  207     veor    $in0,$in0,$tmp
  208     vext.8  $tmp,$zero,$tmp,#12
  209      veor   $key,$key,$rcon
  210     veor    $in0,$in0,$tmp
  211     vshl.u8 $rcon,$rcon,#1
  212     veor    $in0,$in0,$key
  213     vst1.32 {$in0},[$out],#16
  214     b.eq    .Ldone
  215 
  216     vdup.32 $key,${in0}[3]      // just splat
  217     vext.8  $tmp,$zero,$in1,#12
  218     aese    $key,$zero
  219 
  220     veor    $in1,$in1,$tmp
  221     vext.8  $tmp,$zero,$tmp,#12
  222     veor    $in1,$in1,$tmp
  223     vext.8  $tmp,$zero,$tmp,#12
  224     veor    $in1,$in1,$tmp
  225 
  226     veor    $in1,$in1,$key
  227     b   .Loop256
  228 
  229 .Ldone:
  230     str $rounds,[$out]
  231     mov $ptr,#0
  232 
  233 .Lenc_key_abort:
  234     mov x0,$ptr         // return value
  235     `"ldr   x29,[sp],#16"       if ($flavour =~ /64/)`
  236     ret
  237 .size   ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
  238 
  239 .globl  ${prefix}_set_decrypt_key
  240 .type   ${prefix}_set_decrypt_key,%function
  241 .align  5
  242 ${prefix}_set_decrypt_key:
  243 ___
# set_decrypt_key: flavour-specific prologue, then the shared body below calls
# .Lenc_key and reverses/inverts the schedule in place with aesimc.
  244 $code.=<<___    if ($flavour =~ /64/);
  245     stp x29,x30,[sp,#-16]!
  246     add x29,sp,#0
  247 ___
  248 $code.=<<___    if ($flavour !~ /64/);
  249     stmdb   sp!,{r4,lr}
  250 ___
  251 $code.=<<___;
  252     bl  .Lenc_key
  253 
  254     cmp x0,#0
  255     b.ne    .Ldec_key_abort
  256 
  257     sub $out,$out,#240      // restore original $out
  258     mov x4,#-16
  259     add $inp,$out,x12,lsl#4 // end of key schedule
  260 
  261     vld1.32 {v0.16b},[$out]
  262     vld1.32 {v1.16b},[$inp]
  263     vst1.32 {v0.16b},[$inp],x4
  264     vst1.32 {v1.16b},[$out],#16
  265 
  266 .Loop_imc:
  267     vld1.32 {v0.16b},[$out]
  268     vld1.32 {v1.16b},[$inp]
  269     aesimc  v0.16b,v0.16b
  270     aesimc  v1.16b,v1.16b
  271     vst1.32 {v0.16b},[$inp],x4
  272     vst1.32 {v1.16b},[$out],#16
  273     cmp $inp,$out
  274     b.hi    .Loop_imc
  275 
  276     vld1.32 {v0.16b},[$out]
  277     aesimc  v0.16b,v0.16b
  278     vst1.32 {v0.16b},[$inp]
  279 
  280     eor x0,x0,x0        // return value
  281 .Ldec_key_abort:
  282 ___
  283 $code.=<<___    if ($flavour !~ /64/);
  284     ldmia   sp!,{r4,pc}
  285 ___
  286 $code.=<<___    if ($flavour =~ /64/);
  287     ldp x29,x30,[sp],#16
  288     ret
  289 ___
  290 $code.=<<___;
  291 .size   ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
  292 ___
  293 }}}
  294 {{{
# Single-block en/decrypt generator: emits ${prefix}_encrypt and
# ${prefix}_decrypt from one template. $dir is "en" or "de"; ($e,$mc) select
# the aese/aesmc vs aesd/aesimc mnemonic pair for the chosen direction.
  295 sub gen_block {
  296 my $dir = shift;
  297 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
  298 my ($inp,$out,$key)=map("x$_",(0..2));
  299 my $rounds="w3";
  300 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
  301 
  302 $code.=<<___;
  303 .globl  ${prefix}_${dir}crypt
  304 .type   ${prefix}_${dir}crypt,%function
  305 .align  5
  306 ${prefix}_${dir}crypt:
  307     ldr $rounds,[$key,#240]
  308     vld1.32 {$rndkey0},[$key],#16
  309     vld1.8  {$inout},[$inp]
  310     sub $rounds,$rounds,#2
  311     vld1.32 {$rndkey1},[$key],#16
  312 
  313 .Loop_${dir}c:
  314     aes$e   $inout,$rndkey0
  315     aes$mc  $inout,$inout
  316     vld1.32 {$rndkey0},[$key],#16
  317     subs    $rounds,$rounds,#2
  318     aes$e   $inout,$rndkey1
  319     aes$mc  $inout,$inout
  320     vld1.32 {$rndkey1},[$key],#16
  321     b.gt    .Loop_${dir}c
  322 
  323     aes$e   $inout,$rndkey0
  324     aes$mc  $inout,$inout
  325     vld1.32 {$rndkey0},[$key]
  326     aes$e   $inout,$rndkey1
  327     veor    $inout,$inout,$rndkey0
  328 
  329     vst1.8  {$inout},[$out]
  330     ret
  331 .size   ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
  332 ___
  333 }
# Plain calls: the old &gen_block(...) form only existed to bypass the
# (now removed) useless empty prototype on the sub.
  334 gen_block("en");
  335 gen_block("de");
  336 }}}
  337 {{{
# CBC mode generator: emits ${prefix}_cbc_encrypt, which covers both CBC
# encryption (serial, with a dedicated 128-bit key fast path) and CBC
# decryption (3x interleaved main loop). q8-q15 hold the preloaded schedule.
  338 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
  339 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
  340 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
  341 
  342 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
  343 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
  344 
  345 ### q8-q15  preloaded key schedule
  346 
  347 $code.=<<___;
  348 .globl  ${prefix}_cbc_encrypt
  349 .type   ${prefix}_cbc_encrypt,%function
  350 .align  5
  351 ${prefix}_cbc_encrypt:
  352 ___
  353 $code.=<<___    if ($flavour =~ /64/);
  354     stp x29,x30,[sp,#-16]!
  355     add x29,sp,#0
  356 ___
  357 $code.=<<___    if ($flavour !~ /64/);
  358     mov ip,sp
  359     stmdb   sp!,{r4-r8,lr}
  360     vstmdb  sp!,{d8-d15}            @ ABI specification says so
  361     ldmia   ip,{r4-r5}      @ load remaining args
  362 ___
  363 $code.=<<___;
  364     subs    $len,$len,#16
  365     mov $step,#16
  366     b.lo    .Lcbc_abort
  367     cclr    $step,eq
  368 
  369     cmp $enc,#0         // en- or decrypting?
  370     ldr $rounds,[$key,#240]
  371     and $len,$len,#-16
  372     vld1.8  {$ivec},[$ivp]
  373     vld1.8  {$dat},[$inp],$step
  374 
  375     vld1.32 {q8-q9},[$key]      // load key schedule...
  376     sub $rounds,$rounds,#6
  377     add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
  378     sub $rounds,$rounds,#2
  379     vld1.32 {q10-q11},[$key_],#32
  380     vld1.32 {q12-q13},[$key_],#32
  381     vld1.32 {q14-q15},[$key_],#32
  382     vld1.32 {$rndlast},[$key_]
  383 
  384     add $key_,$key,#32
  385     mov $cnt,$rounds
  386     b.eq    .Lcbc_dec
  387 
  388     cmp $rounds,#2
  389     veor    $dat,$dat,$ivec
  390     veor    $rndzero_n_last,q8,$rndlast
  391     b.eq    .Lcbc_enc128
  392 
  393     vld1.32 {$in0-$in1},[$key_]
  394     add $key_,$key,#16
  395     add $key4,$key,#16*4
  396     add $key5,$key,#16*5
  397     aese    $dat,q8
  398     aesmc   $dat,$dat
  399     add $key6,$key,#16*6
  400     add $key7,$key,#16*7
  401     b   .Lenter_cbc_enc
  402 
  403 .align  4
  404 .Loop_cbc_enc:
  405     aese    $dat,q8
  406     aesmc   $dat,$dat
  407      vst1.8 {$ivec},[$out],#16
  408 .Lenter_cbc_enc:
  409     aese    $dat,q9
  410     aesmc   $dat,$dat
  411     aese    $dat,$in0
  412     aesmc   $dat,$dat
  413     vld1.32 {q8},[$key4]
  414     cmp $rounds,#4
  415     aese    $dat,$in1
  416     aesmc   $dat,$dat
  417     vld1.32 {q9},[$key5]
  418     b.eq    .Lcbc_enc192
  419 
  420     aese    $dat,q8
  421     aesmc   $dat,$dat
  422     vld1.32 {q8},[$key6]
  423     aese    $dat,q9
  424     aesmc   $dat,$dat
  425     vld1.32 {q9},[$key7]
  426     nop
  427 
  428 .Lcbc_enc192:
  429     aese    $dat,q8
  430     aesmc   $dat,$dat
  431      subs   $len,$len,#16
  432     aese    $dat,q9
  433     aesmc   $dat,$dat
  434      cclr   $step,eq
  435     aese    $dat,q10
  436     aesmc   $dat,$dat
  437     aese    $dat,q11
  438     aesmc   $dat,$dat
  439      vld1.8 {q8},[$inp],$step
  440     aese    $dat,q12
  441     aesmc   $dat,$dat
  442      veor   q8,q8,$rndzero_n_last
  443     aese    $dat,q13
  444     aesmc   $dat,$dat
  445      vld1.32 {q9},[$key_]       // re-pre-load rndkey[1]
  446     aese    $dat,q14
  447     aesmc   $dat,$dat
  448     aese    $dat,q15
  449     veor    $ivec,$dat,$rndlast
  450     b.hs    .Loop_cbc_enc
  451 
  452     vst1.8  {$ivec},[$out],#16
  453     b   .Lcbc_done
  454 
  455 .align  5
  456 .Lcbc_enc128:
  457     vld1.32 {$in0-$in1},[$key_]
  458     aese    $dat,q8
  459     aesmc   $dat,$dat
  460     b   .Lenter_cbc_enc128
  461 .Loop_cbc_enc128:
  462     aese    $dat,q8
  463     aesmc   $dat,$dat
  464      vst1.8 {$ivec},[$out],#16
  465 .Lenter_cbc_enc128:
  466     aese    $dat,q9
  467     aesmc   $dat,$dat
  468      subs   $len,$len,#16
  469     aese    $dat,$in0
  470     aesmc   $dat,$dat
  471      cclr   $step,eq
  472     aese    $dat,$in1
  473     aesmc   $dat,$dat
  474     aese    $dat,q10
  475     aesmc   $dat,$dat
  476     aese    $dat,q11
  477     aesmc   $dat,$dat
  478      vld1.8 {q8},[$inp],$step
  479     aese    $dat,q12
  480     aesmc   $dat,$dat
  481     aese    $dat,q13
  482     aesmc   $dat,$dat
  483     aese    $dat,q14
  484     aesmc   $dat,$dat
  485      veor   q8,q8,$rndzero_n_last
  486     aese    $dat,q15
  487     veor    $ivec,$dat,$rndlast
  488     b.hs    .Loop_cbc_enc128
  489 
  490     vst1.8  {$ivec},[$out],#16
  491     b   .Lcbc_done
  492 ___
# Inner scope: the decrypt path interleaves three blocks, so it remaps three
# additional vector registers on top of the declarations above.
  493 {
  494 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
  495 $code.=<<___;
  496 .align  5
  497 .Lcbc_dec:
  498     vld1.8  {$dat2},[$inp],#16
  499     subs    $len,$len,#32       // bias
  500     add $cnt,$rounds,#2
  501     vorr    $in1,$dat,$dat
  502     vorr    $dat1,$dat,$dat
  503     vorr    $in2,$dat2,$dat2
  504     b.lo    .Lcbc_dec_tail
  505 
  506     vorr    $dat1,$dat2,$dat2
  507     vld1.8  {$dat2},[$inp],#16
  508     vorr    $in0,$dat,$dat
  509     vorr    $in1,$dat1,$dat1
  510     vorr    $in2,$dat2,$dat2
  511 
  512 .Loop3x_cbc_dec:
  513     aesd    $dat0,q8
  514     aesimc  $dat0,$dat0
  515     aesd    $dat1,q8
  516     aesimc  $dat1,$dat1
  517     aesd    $dat2,q8
  518     aesimc  $dat2,$dat2
  519     vld1.32 {q8},[$key_],#16
  520     subs    $cnt,$cnt,#2
  521     aesd    $dat0,q9
  522     aesimc  $dat0,$dat0
  523     aesd    $dat1,q9
  524     aesimc  $dat1,$dat1
  525     aesd    $dat2,q9
  526     aesimc  $dat2,$dat2
  527     vld1.32 {q9},[$key_],#16
  528     b.gt    .Loop3x_cbc_dec
  529 
  530     aesd    $dat0,q8
  531     aesimc  $dat0,$dat0
  532     aesd    $dat1,q8
  533     aesimc  $dat1,$dat1
  534     aesd    $dat2,q8
  535     aesimc  $dat2,$dat2
  536      veor   $tmp0,$ivec,$rndlast
  537      subs   $len,$len,#0x30
  538      veor   $tmp1,$in0,$rndlast
  539      mov.lo x6,$len         // x6, $cnt, is zero at this point
  540     aesd    $dat0,q9
  541     aesimc  $dat0,$dat0
  542     aesd    $dat1,q9
  543     aesimc  $dat1,$dat1
  544     aesd    $dat2,q9
  545     aesimc  $dat2,$dat2
  546      veor   $tmp2,$in1,$rndlast
  547      add    $inp,$inp,x6        // $inp is adjusted in such way that
  548                     // at exit from the loop $dat1-$dat2
  549                     // are loaded with last "words"
  550      vorr   $ivec,$in2,$in2
  551      mov    $key_,$key
  552     aesd    $dat0,q12
  553     aesimc  $dat0,$dat0
  554     aesd    $dat1,q12
  555     aesimc  $dat1,$dat1
  556     aesd    $dat2,q12
  557     aesimc  $dat2,$dat2
  558      vld1.8 {$in0},[$inp],#16
  559     aesd    $dat0,q13
  560     aesimc  $dat0,$dat0
  561     aesd    $dat1,q13
  562     aesimc  $dat1,$dat1
  563     aesd    $dat2,q13
  564     aesimc  $dat2,$dat2
  565      vld1.8 {$in1},[$inp],#16
  566     aesd    $dat0,q14
  567     aesimc  $dat0,$dat0
  568     aesd    $dat1,q14
  569     aesimc  $dat1,$dat1
  570     aesd    $dat2,q14
  571     aesimc  $dat2,$dat2
  572      vld1.8 {$in2},[$inp],#16
  573     aesd    $dat0,q15
  574     aesd    $dat1,q15
  575     aesd    $dat2,q15
  576      vld1.32 {q8},[$key_],#16   // re-pre-load rndkey[0]
  577      add    $cnt,$rounds,#2
  578     veor    $tmp0,$tmp0,$dat0
  579     veor    $tmp1,$tmp1,$dat1
  580     veor    $dat2,$dat2,$tmp2
  581      vld1.32 {q9},[$key_],#16   // re-pre-load rndkey[1]
  582     vst1.8  {$tmp0},[$out],#16
  583      vorr   $dat0,$in0,$in0
  584     vst1.8  {$tmp1},[$out],#16
  585      vorr   $dat1,$in1,$in1
  586     vst1.8  {$dat2},[$out],#16
  587      vorr   $dat2,$in2,$in2
  588     b.hs    .Loop3x_cbc_dec
  589 
  590     cmn $len,#0x30
  591     b.eq    .Lcbc_done
  592     nop
  593 
  594 .Lcbc_dec_tail:
  595     aesd    $dat1,q8
  596     aesimc  $dat1,$dat1
  597     aesd    $dat2,q8
  598     aesimc  $dat2,$dat2
  599     vld1.32 {q8},[$key_],#16
  600     subs    $cnt,$cnt,#2
  601     aesd    $dat1,q9
  602     aesimc  $dat1,$dat1
  603     aesd    $dat2,q9
  604     aesimc  $dat2,$dat2
  605     vld1.32 {q9},[$key_],#16
  606     b.gt    .Lcbc_dec_tail
  607 
  608     aesd    $dat1,q8
  609     aesimc  $dat1,$dat1
  610     aesd    $dat2,q8
  611     aesimc  $dat2,$dat2
  612     aesd    $dat1,q9
  613     aesimc  $dat1,$dat1
  614     aesd    $dat2,q9
  615     aesimc  $dat2,$dat2
  616     aesd    $dat1,q12
  617     aesimc  $dat1,$dat1
  618     aesd    $dat2,q12
  619     aesimc  $dat2,$dat2
  620      cmn    $len,#0x20
  621     aesd    $dat1,q13
  622     aesimc  $dat1,$dat1
  623     aesd    $dat2,q13
  624     aesimc  $dat2,$dat2
  625      veor   $tmp1,$ivec,$rndlast
  626     aesd    $dat1,q14
  627     aesimc  $dat1,$dat1
  628     aesd    $dat2,q14
  629     aesimc  $dat2,$dat2
  630      veor   $tmp2,$in1,$rndlast
  631     aesd    $dat1,q15
  632     aesd    $dat2,q15
  633     b.eq    .Lcbc_dec_one
  634     veor    $tmp1,$tmp1,$dat1
  635     veor    $tmp2,$tmp2,$dat2
  636      vorr   $ivec,$in2,$in2
  637     vst1.8  {$tmp1},[$out],#16
  638     vst1.8  {$tmp2},[$out],#16
  639     b   .Lcbc_done
  640 
  641 .Lcbc_dec_one:
  642     veor    $tmp1,$tmp1,$dat2
  643      vorr   $ivec,$in2,$in2
  644     vst1.8  {$tmp1},[$out],#16
  645 
  646 .Lcbc_done:
  647     vst1.8  {$ivec},[$ivp]
  648 .Lcbc_abort:
  649 ___
  650 }
# Flavour-specific epilogues mirroring the prologues above.
  651 $code.=<<___    if ($flavour !~ /64/);
  652     vldmia  sp!,{d8-d15}
  653     ldmia   sp!,{r4-r8,pc}
  654 ___
  655 $code.=<<___    if ($flavour =~ /64/);
  656     ldr x29,[sp],#16
  657     ret
  658 ___
  659 $code.=<<___;
  660 .size   ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
  661 ___
  662 }}}
  663 {{{
# CTR mode generator: emits ${prefix}_ctr32_encrypt_blocks with a 3x
# interleaved main loop and a 1-2 block tail. Note $step shares x12 with
# $tctr2, as the declaration below records. q8-q15 hold the key schedule.
  664 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
  665 my ($rounds,$cnt,$key_)=("w5","w6","x7");
  666 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
  667 my $step="x12";     # aliases with $tctr2
  668 
  669 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
  670 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
  671 
  672 my ($dat,$tmp)=($dat0,$tmp0);
  673 
  674 ### q8-q15  preloaded key schedule
  675 
  676 $code.=<<___;   # entry point; flavour-specific prologues follow
  677 .globl  ${prefix}_ctr32_encrypt_blocks
  678 .type   ${prefix}_ctr32_encrypt_blocks,%function
  679 .align  5
  680 ${prefix}_ctr32_encrypt_blocks:
  681 ___
  682 $code.=<<___    if ($flavour =~ /64/);
  683     stp     x29,x30,[sp,#-16]!
  684     add     x29,sp,#0
  685 ___
  686 $code.=<<___    if ($flavour !~ /64/);
  687     mov     ip,sp
  688     stmdb       sp!,{r4-r10,lr}
  689     vstmdb      sp!,{d8-d15}            @ ABI specification says so
  690     ldr     r4, [ip]        @ load remaining arg
  691 ___
  692 $code.=<<___;
  693     ldr     $rounds,[$key,#240]
  694 
  695     ldr     $ctr, [$ivp, #12]
  696     vld1.32     {$dat0},[$ivp]
  697 
  698     vld1.32     {q8-q9},[$key]      // load key schedule...
  699     sub     $rounds,$rounds,#4
  700     mov     $step,#16
  701     cmp     $len,#2
  702     add     $key_,$key,x5,lsl#4 // pointer to last 5 round keys
  703     sub     $rounds,$rounds,#2
  704     vld1.32     {q12-q13},[$key_],#32
  705     vld1.32     {q14-q15},[$key_],#32
  706     vld1.32     {$rndlast},[$key_]
  707     add     $key_,$key,#32
  708     mov     $cnt,$rounds
  709     cclr        $step,lo
  710 #ifndef __ARMEB__
  711     rev     $ctr, $ctr
  712 #endif
  713     vorr        $dat1,$dat0,$dat0
  714     add     $tctr1, $ctr, #1
  715     vorr        $dat2,$dat0,$dat0
  716     add     $ctr, $ctr, #2
  717     vorr        $ivec,$dat0,$dat0
  718     rev     $tctr1, $tctr1
  719     vmov.32     ${dat1}[3],$tctr1
  720     b.ls        .Lctr32_tail
  721     rev     $tctr2, $ctr
  722     sub     $len,$len,#3        // bias
  723     vmov.32     ${dat2}[3],$tctr2
  724     b       .Loop3x_ctr32
  725 
  726 .align  4
  727 .Loop3x_ctr32:
  728     aese        $dat0,q8
  729     aesmc       $dat0,$dat0
  730     aese        $dat1,q8
  731     aesmc       $dat1,$dat1
  732     aese        $dat2,q8
  733     aesmc       $dat2,$dat2
  734     vld1.32     {q8},[$key_],#16
  735     subs        $cnt,$cnt,#2
  736     aese        $dat0,q9
  737     aesmc       $dat0,$dat0
  738     aese        $dat1,q9
  739     aesmc       $dat1,$dat1
  740     aese        $dat2,q9
  741     aesmc       $dat2,$dat2
  742     vld1.32     {q9},[$key_],#16
  743     b.gt        .Loop3x_ctr32
  744 
  745     aese        $dat0,q8
  746     aesmc       $tmp0,$dat0
  747     aese        $dat1,q8
  748     aesmc       $tmp1,$dat1
  749      vld1.8     {$in0},[$inp],#16
  750      vorr       $dat0,$ivec,$ivec
  751     aese        $dat2,q8
  752     aesmc       $dat2,$dat2
  753      vld1.8     {$in1},[$inp],#16
  754      vorr       $dat1,$ivec,$ivec
  755     aese        $tmp0,q9
  756     aesmc       $tmp0,$tmp0
  757     aese        $tmp1,q9
  758     aesmc       $tmp1,$tmp1
  759      vld1.8     {$in2},[$inp],#16
  760      mov        $key_,$key
  761     aese        $dat2,q9
  762     aesmc       $tmp2,$dat2
  763      vorr       $dat2,$ivec,$ivec
  764      add        $tctr0,$ctr,#1
  765     aese        $tmp0,q12
  766     aesmc       $tmp0,$tmp0
  767     aese        $tmp1,q12
  768     aesmc       $tmp1,$tmp1
  769      veor       $in0,$in0,$rndlast
  770      add        $tctr1,$ctr,#2
  771     aese        $tmp2,q12
  772     aesmc       $tmp2,$tmp2
  773      veor       $in1,$in1,$rndlast
  774      add        $ctr,$ctr,#3
  775     aese        $tmp0,q13
  776     aesmc       $tmp0,$tmp0
  777     aese        $tmp1,q13
  778     aesmc       $tmp1,$tmp1
  779      veor       $in2,$in2,$rndlast
  780      rev        $tctr0,$tctr0
  781     aese        $tmp2,q13
  782     aesmc       $tmp2,$tmp2
  783      vmov.32    ${dat0}[3], $tctr0
  784      rev        $tctr1,$tctr1
  785     aese        $tmp0,q14
  786     aesmc       $tmp0,$tmp0
  787     aese        $tmp1,q14
  788     aesmc       $tmp1,$tmp1
  789      vmov.32    ${dat1}[3], $tctr1
  790      rev        $tctr2,$ctr
  791     aese        $tmp2,q14
  792     aesmc       $tmp2,$tmp2
  793      vmov.32    ${dat2}[3], $tctr2
  794      subs       $len,$len,#3
  795     aese        $tmp0,q15
  796     aese        $tmp1,q15
  797     aese        $tmp2,q15
  798 
  799     veor        $in0,$in0,$tmp0
  800      vld1.32     {q8},[$key_],#16   // re-pre-load rndkey[0]
  801     vst1.8      {$in0},[$out],#16
  802     veor        $in1,$in1,$tmp1
  803      mov        $cnt,$rounds
  804     vst1.8      {$in1},[$out],#16
  805     veor        $in2,$in2,$tmp2
  806      vld1.32     {q9},[$key_],#16   // re-pre-load rndkey[1]
  807     vst1.8      {$in2},[$out],#16
  808     b.hs        .Loop3x_ctr32
  809 
  810     adds        $len,$len,#3
  811     b.eq        .Lctr32_done
  812     cmp     $len,#1
  813     mov     $step,#16
  814     cclr        $step,eq
  815 
  816 .Lctr32_tail:
  817     aese        $dat0,q8
  818     aesmc       $dat0,$dat0
  819     aese        $dat1,q8
  820     aesmc       $dat1,$dat1
  821     vld1.32     {q8},[$key_],#16
  822     subs        $cnt,$cnt,#2
  823     aese        $dat0,q9
  824     aesmc       $dat0,$dat0
  825     aese        $dat1,q9
  826     aesmc       $dat1,$dat1
  827     vld1.32     {q9},[$key_],#16
  828     b.gt        .Lctr32_tail
  829 
  830     aese        $dat0,q8
  831     aesmc       $dat0,$dat0
  832     aese        $dat1,q8
  833     aesmc       $dat1,$dat1
  834     aese        $dat0,q9
  835     aesmc       $dat0,$dat0
  836     aese        $dat1,q9
  837     aesmc       $dat1,$dat1
  838      vld1.8     {$in0},[$inp],$step
  839     aese        $dat0,q12
  840     aesmc       $dat0,$dat0
  841     aese        $dat1,q12
  842     aesmc       $dat1,$dat1
  843      vld1.8     {$in1},[$inp]
  844     aese        $dat0,q13
  845     aesmc       $dat0,$dat0
  846     aese        $dat1,q13
  847     aesmc       $dat1,$dat1
  848      veor       $in0,$in0,$rndlast
  849     aese        $dat0,q14
  850     aesmc       $dat0,$dat0
  851     aese        $dat1,q14
  852     aesmc       $dat1,$dat1
  853      veor       $in1,$in1,$rndlast
  854     aese        $dat0,q15
  855     aese        $dat1,q15
  856 
  857     cmp     $len,#1
  858     veor        $in0,$in0,$dat0
  859     veor        $in1,$in1,$dat1
  860     vst1.8      {$in0},[$out],#16
  861     b.eq        .Lctr32_done
  862     vst1.8      {$in1},[$out]
  863 
  864 .Lctr32_done:
  865 ___
# Flavour-specific epilogues mirroring the prologues above.
  866 $code.=<<___    if ($flavour !~ /64/);
  867     vldmia      sp!,{d8-d15}
  868     ldmia       sp!,{r4-r10,pc}
  869 ___
  870 $code.=<<___    if ($flavour =~ /64/);
  871     ldr     x29,[sp],#16
  872     ret
  873 ___
  874 $code.=<<___;
  875 .size   ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
  876 ___
  877 }}}
  878 $code.=<<___;
  879 #endif
  880 ___
  881 ########################################
# Post-processing: $code above is written in a mixed 32/64-bit syntax and is
# transliterated here, line by line, into pure AArch64 or pure AArch32
# assembly before being printed to the (redirected) STDOUT.
  882 if ($flavour =~ /64/) {         ######## 64-bit code
  883     my %opcode = (
  884     "aesd"  =>  0x4e285800, "aese"  =>  0x4e284800,
  885     "aesimc"=>  0x4e287800, "aesmc" =>  0x4e286800  );
  886 
  887     local *unaes = sub {
  888     my ($mnemonic,$arg)=@_;
  889 
  890     $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o   &&
  891     sprintf ".inst\t0x%08x\t//%s %s",
  892             $opcode{$mnemonic}|$1|($2<<5),
  893             $mnemonic,$arg;
  894     };
  895 
  896     foreach(split("\n",$code)) {
  897     s/\`([^\`]*)\`/eval($1)/geo;
  898 
  899     s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
  900     s/@\s/\/\//o;           # old->new style commentary
  901 
  902     #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
  903     s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
  904     s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o   or
  905     s/vmov\.i8/movi/o   or  # fix up legacy mnemonics
  906     s/vext\.8/ext/o     or
  907     s/vrev32\.8/rev32/o or
  908     s/vtst\.8/cmtst/o   or
  909     s/vshr/ushr/o       or
  910     s/^(\s+)v/$1/o      or  # strip off v prefix
  911     s/\bbx\s+lr\b/ret/o;
  912 
  913     # fix up remaining legacy suffixes
  914     s/\.[ui]?8//o;
  915     m/\],#8/o and s/\.16b/\.8b/go;
  916     s/\.[ui]?32//o and s/\.16b/\.4s/go;
  917     s/\.[ui]?64//o and s/\.16b/\.2d/go;
  918     s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
  919 
  920     print $_,"\n";
  921     }
  922 } else {                ######## 32-bit code
  923     my %opcode = (
  924     "aesd"  =>  0xf3b00340, "aese"  =>  0xf3b00300,
  925     "aesimc"=>  0xf3b003c0, "aesmc" =>  0xf3b00380  );
  926 
  927     local *unaes = sub {
  928     my ($mnemonic,$arg)=@_;
  929 
  930     if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
  931         my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
  932                      |(($2&7)<<1) |(($2&8)<<2);
  933         # since ARMv7 instructions are always encoded little-endian.
  934         # correct solution is to use .inst directive, but older
  935         # assemblers don't implement it:-(
  936         sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
  937             $word&0xff,($word>>8)&0xff,
  938             ($word>>16)&0xff,($word>>24)&0xff,
  939             $mnemonic,$arg;
  940     }
  941     };
  942 
  943     sub unvtbl {
  944     my $arg=shift;
  945 
  946     $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
  947     sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
  948         "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; 
  949     }
  950 
  951     sub unvdup32 {
  952     my $arg=shift;
  953 
  954     $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
  955     sprintf "vdup.32    q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;  
  956     }
  957 
  958     sub unvmov32 {
  959     my $arg=shift;
  960 
  961     $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
  962     sprintf "vmov.32    d%d[%d],%s",2*$1+($2>>1),$2&1,$3;   
  963     }
  964 
  965     foreach(split("\n",$code)) {
  966     s/\`([^\`]*)\`/eval($1)/geo;
  967 
  968     s/\b[wx]([0-9]+)\b/r$1/go;      # new->old registers
  969     s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
  970     s/\/\/\s?/@ /o;             # new->old style commentary
  971 
  972     # fix up remaining new-style suffixes
  973     s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo    or
  974     s/\],#[0-9]+/]!/o;
  975 
  976     s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo  or
  977     s/cclr\s+([^,]+),\s*([a-z]+)/mov$2  $1,#0/o or
  978     s/vtbl\.8\s+(.*)/unvtbl($1)/geo         or
  979     s/vdup\.32\s+(.*)/unvdup32($1)/geo      or
  980     s/vmov\.32\s+(.*)/unvmov32($1)/geo      or
  981     s/^(\s+)b\./$1b/o               or
  982     s/^(\s+)mov\./$1mov/o               or
  983     s/^(\s+)ret/$1bx\tlr/o;
  984 
  985     print $_,"\n";
  986     }
  987 }
  988 
# STDOUT was redirected to the output file; a failed close means buffered
# output was lost, so treat it as a hard error rather than ignoring it.
  989 close STDOUT or die "error closing STDOUT: $!";