"Fossies" - the Fresh Open Source Software Archive

Member "openssl-1.0.2q/crypto/modes/asm/aesni-gcm-x86_64.pl" (20 Nov 2018, 28457 Bytes) of package /linux/misc/openssl-1.0.2q.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "aesni-gcm-x86_64.pl" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.1.0g_vs_1.1.1-pre2.

    1 #!/usr/bin/env perl
    2 #
    3 # ====================================================================
    4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
    5 # project. The module is, however, dual licensed under OpenSSL and
    6 # CRYPTOGAMS licenses depending on where you obtain it. For further
    7 # details see http://www.openssl.org/~appro/cryptogams/.
    8 # ====================================================================
    9 #
   10 #
   11 # AES-NI-CTR+GHASH stitch.
   12 #
   13 # February 2013
   14 #
   15 # OpenSSL GCM implementation is organized in such way that its
   16 # performance is rather close to the sum of its streamed components,
   17 # in the context parallelized AES-NI CTR and modulo-scheduled
   18 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
   19 # was observed to perform significantly better than the sum of the
   20 # components on contemporary CPUs, the effort was deemed impossible to
   21 # justify. This module is based on combination of Intel submissions,
   22 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
   23 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
   24 # pressure with notable relative improvement, achieving 1.0 cycle per
   25 # byte processed with 128-bit key on Haswell processor, and 0.74 -
   26 # on Broadwell. [Mentioned results are raw profiled measurements for
   27 # favourable packet size, one divisible by 96. Applications using the
   28 # EVP interface will observe a few percent worse performance.]
   29 #
   30 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
   31 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
   32 
   33 $flavour = shift;
   34 $output  = shift;
   35 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
   36 
   37 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
   38 
   39 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
   40 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
   41 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
   42 die "can't locate x86_64-xlate.pl";
   43 
   44 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
   45         =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
   46     $avx = ($1>=2.20) + ($1>=2.22);
   47 }
   48 
   49 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
   50         `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
   51     $avx = ($1>=2.09) + ($1>=2.10);
   52 }
   53 
   54 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
   55         `ml64 2>&1` =~ /Version ([0-9]+)\./) {
   56     $avx = ($1>=10) + ($1>=11);
   57 }
   58 
   59 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
   60     $avx = ($2>=3.0) + ($2>3.0);
   61 }
   62 
   63 open OUT,"| \"$^X\" $xlate $flavour $output";
   64 *STDOUT=*OUT;
   65 
   66 if ($avx>1) {{{
   67 
   68 ($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
   69 
   70 ($Ii,$T1,$T2,$Hkey,
   71  $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
   72 
   73 ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
   74 
   75 ($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
   76 
   77 $code=<<___;
   78 .text
   79 
   80 .type   _aesni_ctr32_ghash_6x,\@abi-omnipotent
   81 .align  32
   82 _aesni_ctr32_ghash_6x:
   83     vmovdqu     0x20($const),$T2    # borrow $T2, .Lone_msb
   84     sub     \$6,$len
   85     vpxor       $Z0,$Z0,$Z0     # $Z0   = 0
   86     vmovdqu     0x00-0x80($key),$rndkey
   87     vpaddb      $T2,$T1,$inout1
   88     vpaddb      $T2,$inout1,$inout2
   89     vpaddb      $T2,$inout2,$inout3
   90     vpaddb      $T2,$inout3,$inout4
   91     vpaddb      $T2,$inout4,$inout5
   92     vpxor       $rndkey,$T1,$inout0
   93     vmovdqu     $Z0,16+8(%rsp)      # "$Z3" = 0
   94     jmp     .Loop6x
   95 
   96 .align  32
   97 .Loop6x:
   98     add     \$`6<<24`,$counter
   99     jc      .Lhandle_ctr32      # discard $inout[1-5]?
  100     vmovdqu     0x00-0x20($Xip),$Hkey   # $Hkey^1
  101       vpaddb    $T2,$inout5,$T1     # next counter value
  102       vpxor     $rndkey,$inout1,$inout1
  103       vpxor     $rndkey,$inout2,$inout2
  104 
  105 .Lresume_ctr32:
  106     vmovdqu     $T1,($ivp)      # save next counter value
  107     vpclmulqdq  \$0x10,$Hkey,$Z3,$Z1
  108       vpxor     $rndkey,$inout3,$inout3
  109       vmovups   0x10-0x80($key),$T2 # borrow $T2 for $rndkey
  110     vpclmulqdq  \$0x01,$Hkey,$Z3,$Z2
  111     xor     %r12,%r12
  112     cmp     $in0,$end0
  113 
  114       vaesenc   $T2,$inout0,$inout0
  115     vmovdqu     0x30+8(%rsp),$Ii    # I[4]
  116       vpxor     $rndkey,$inout4,$inout4
  117     vpclmulqdq  \$0x00,$Hkey,$Z3,$T1
  118       vaesenc   $T2,$inout1,$inout1
  119       vpxor     $rndkey,$inout5,$inout5
  120     setnc       %r12b
  121     vpclmulqdq  \$0x11,$Hkey,$Z3,$Z3
  122       vaesenc   $T2,$inout2,$inout2
  123     vmovdqu     0x10-0x20($Xip),$Hkey   # $Hkey^2
  124     neg     %r12
  125       vaesenc   $T2,$inout3,$inout3
  126      vpxor      $Z1,$Z2,$Z2
  127     vpclmulqdq  \$0x00,$Hkey,$Ii,$Z1
  128      vpxor      $Z0,$Xi,$Xi     # modulo-scheduled
  129       vaesenc   $T2,$inout4,$inout4
  130      vpxor      $Z1,$T1,$Z0
  131     and     \$0x60,%r12
  132       vmovups   0x20-0x80($key),$rndkey
  133     vpclmulqdq  \$0x10,$Hkey,$Ii,$T1
  134       vaesenc   $T2,$inout5,$inout5
  135 
  136     vpclmulqdq  \$0x01,$Hkey,$Ii,$T2
  137     lea     ($in0,%r12),$in0
  138       vaesenc   $rndkey,$inout0,$inout0
  139      vpxor      16+8(%rsp),$Xi,$Xi  # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
  140     vpclmulqdq  \$0x11,$Hkey,$Ii,$Hkey
  141      vmovdqu    0x40+8(%rsp),$Ii    # I[3]
  142       vaesenc   $rndkey,$inout1,$inout1
  143     movbe       0x58($in0),%r13
  144       vaesenc   $rndkey,$inout2,$inout2
  145     movbe       0x50($in0),%r12
  146       vaesenc   $rndkey,$inout3,$inout3
  147     mov     %r13,0x20+8(%rsp)
  148       vaesenc   $rndkey,$inout4,$inout4
  149     mov     %r12,0x28+8(%rsp)
  150     vmovdqu     0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3
  151       vaesenc   $rndkey,$inout5,$inout5
  152 
  153       vmovups   0x30-0x80($key),$rndkey
  154      vpxor      $T1,$Z2,$Z2
  155     vpclmulqdq  \$0x00,$Z1,$Ii,$T1
  156       vaesenc   $rndkey,$inout0,$inout0
  157      vpxor      $T2,$Z2,$Z2
  158     vpclmulqdq  \$0x10,$Z1,$Ii,$T2
  159       vaesenc   $rndkey,$inout1,$inout1
  160      vpxor      $Hkey,$Z3,$Z3
  161     vpclmulqdq  \$0x01,$Z1,$Ii,$Hkey
  162       vaesenc   $rndkey,$inout2,$inout2
  163     vpclmulqdq  \$0x11,$Z1,$Ii,$Z1
  164      vmovdqu    0x50+8(%rsp),$Ii    # I[2]
  165       vaesenc   $rndkey,$inout3,$inout3
  166       vaesenc   $rndkey,$inout4,$inout4
  167      vpxor      $T1,$Z0,$Z0
  168     vmovdqu     0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4
  169       vaesenc   $rndkey,$inout5,$inout5
  170 
  171       vmovups   0x40-0x80($key),$rndkey
  172      vpxor      $T2,$Z2,$Z2
  173     vpclmulqdq  \$0x00,$T1,$Ii,$T2
  174       vaesenc   $rndkey,$inout0,$inout0
  175      vpxor      $Hkey,$Z2,$Z2
  176     vpclmulqdq  \$0x10,$T1,$Ii,$Hkey
  177       vaesenc   $rndkey,$inout1,$inout1
  178     movbe       0x48($in0),%r13
  179      vpxor      $Z1,$Z3,$Z3
  180     vpclmulqdq  \$0x01,$T1,$Ii,$Z1
  181       vaesenc   $rndkey,$inout2,$inout2
  182     movbe       0x40($in0),%r12
  183     vpclmulqdq  \$0x11,$T1,$Ii,$T1
  184      vmovdqu    0x60+8(%rsp),$Ii    # I[1]
  185       vaesenc   $rndkey,$inout3,$inout3
  186     mov     %r13,0x30+8(%rsp)
  187       vaesenc   $rndkey,$inout4,$inout4
  188     mov     %r12,0x38+8(%rsp)
  189      vpxor      $T2,$Z0,$Z0
  190     vmovdqu     0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5
  191       vaesenc   $rndkey,$inout5,$inout5
  192 
  193       vmovups   0x50-0x80($key),$rndkey
  194      vpxor      $Hkey,$Z2,$Z2
  195     vpclmulqdq  \$0x00,$T2,$Ii,$Hkey
  196       vaesenc   $rndkey,$inout0,$inout0
  197      vpxor      $Z1,$Z2,$Z2
  198     vpclmulqdq  \$0x10,$T2,$Ii,$Z1
  199       vaesenc   $rndkey,$inout1,$inout1
  200     movbe       0x38($in0),%r13
  201      vpxor      $T1,$Z3,$Z3
  202     vpclmulqdq  \$0x01,$T2,$Ii,$T1
  203      vpxor      0x70+8(%rsp),$Xi,$Xi    # accumulate I[0]
  204       vaesenc   $rndkey,$inout2,$inout2
  205     movbe       0x30($in0),%r12
  206     vpclmulqdq  \$0x11,$T2,$Ii,$T2
  207       vaesenc   $rndkey,$inout3,$inout3
  208     mov     %r13,0x40+8(%rsp)
  209       vaesenc   $rndkey,$inout4,$inout4
  210     mov     %r12,0x48+8(%rsp)
  211      vpxor      $Hkey,$Z0,$Z0
  212      vmovdqu    0x70-0x20($Xip),$Hkey   # $Hkey^6
  213       vaesenc   $rndkey,$inout5,$inout5
  214 
  215       vmovups   0x60-0x80($key),$rndkey
  216      vpxor      $Z1,$Z2,$Z2
  217     vpclmulqdq  \$0x10,$Hkey,$Xi,$Z1
  218       vaesenc   $rndkey,$inout0,$inout0
  219      vpxor      $T1,$Z2,$Z2
  220     vpclmulqdq  \$0x01,$Hkey,$Xi,$T1
  221       vaesenc   $rndkey,$inout1,$inout1
  222     movbe       0x28($in0),%r13
  223      vpxor      $T2,$Z3,$Z3
  224     vpclmulqdq  \$0x00,$Hkey,$Xi,$T2
  225       vaesenc   $rndkey,$inout2,$inout2
  226     movbe       0x20($in0),%r12
  227     vpclmulqdq  \$0x11,$Hkey,$Xi,$Xi
  228       vaesenc   $rndkey,$inout3,$inout3
  229     mov     %r13,0x50+8(%rsp)
  230       vaesenc   $rndkey,$inout4,$inout4
  231     mov     %r12,0x58+8(%rsp)
  232     vpxor       $Z1,$Z2,$Z2
  233       vaesenc   $rndkey,$inout5,$inout5
  234     vpxor       $T1,$Z2,$Z2
  235 
  236       vmovups   0x70-0x80($key),$rndkey
  237     vpslldq     \$8,$Z2,$Z1
  238     vpxor       $T2,$Z0,$Z0
  239     vmovdqu     0x10($const),$Hkey  # .Lpoly
  240 
  241       vaesenc   $rndkey,$inout0,$inout0
  242     vpxor       $Xi,$Z3,$Z3
  243       vaesenc   $rndkey,$inout1,$inout1
  244     vpxor       $Z1,$Z0,$Z0
  245     movbe       0x18($in0),%r13
  246       vaesenc   $rndkey,$inout2,$inout2
  247     movbe       0x10($in0),%r12
  248     vpalignr    \$8,$Z0,$Z0,$Ii     # 1st phase
  249     vpclmulqdq  \$0x10,$Hkey,$Z0,$Z0
  250     mov     %r13,0x60+8(%rsp)
  251       vaesenc   $rndkey,$inout3,$inout3
  252     mov     %r12,0x68+8(%rsp)
  253       vaesenc   $rndkey,$inout4,$inout4
  254       vmovups   0x80-0x80($key),$T1 # borrow $T1 for $rndkey
  255       vaesenc   $rndkey,$inout5,$inout5
  256 
  257       vaesenc   $T1,$inout0,$inout0
  258       vmovups   0x90-0x80($key),$rndkey
  259       vaesenc   $T1,$inout1,$inout1
  260     vpsrldq     \$8,$Z2,$Z2
  261       vaesenc   $T1,$inout2,$inout2
  262     vpxor       $Z2,$Z3,$Z3
  263       vaesenc   $T1,$inout3,$inout3
  264     vpxor       $Ii,$Z0,$Z0
  265     movbe       0x08($in0),%r13
  266       vaesenc   $T1,$inout4,$inout4
  267     movbe       0x00($in0),%r12
  268       vaesenc   $T1,$inout5,$inout5
  269       vmovups   0xa0-0x80($key),$T1
  270       cmp       \$11,$rounds
  271       jb        .Lenc_tail      # 128-bit key
  272 
  273       vaesenc   $rndkey,$inout0,$inout0
  274       vaesenc   $rndkey,$inout1,$inout1
  275       vaesenc   $rndkey,$inout2,$inout2
  276       vaesenc   $rndkey,$inout3,$inout3
  277       vaesenc   $rndkey,$inout4,$inout4
  278       vaesenc   $rndkey,$inout5,$inout5
  279 
  280       vaesenc   $T1,$inout0,$inout0
  281       vaesenc   $T1,$inout1,$inout1
  282       vaesenc   $T1,$inout2,$inout2
  283       vaesenc   $T1,$inout3,$inout3
  284       vaesenc   $T1,$inout4,$inout4
  285       vmovups   0xb0-0x80($key),$rndkey
  286       vaesenc   $T1,$inout5,$inout5
  287       vmovups   0xc0-0x80($key),$T1
  288       je        .Lenc_tail      # 192-bit key
  289 
  290       vaesenc   $rndkey,$inout0,$inout0
  291       vaesenc   $rndkey,$inout1,$inout1
  292       vaesenc   $rndkey,$inout2,$inout2
  293       vaesenc   $rndkey,$inout3,$inout3
  294       vaesenc   $rndkey,$inout4,$inout4
  295       vaesenc   $rndkey,$inout5,$inout5
  296 
  297       vaesenc   $T1,$inout0,$inout0
  298       vaesenc   $T1,$inout1,$inout1
  299       vaesenc   $T1,$inout2,$inout2
  300       vaesenc   $T1,$inout3,$inout3
  301       vaesenc   $T1,$inout4,$inout4
  302       vmovups   0xd0-0x80($key),$rndkey
  303       vaesenc   $T1,$inout5,$inout5
  304       vmovups   0xe0-0x80($key),$T1
  305       jmp       .Lenc_tail      # 256-bit key
  306 
  307 .align  32
  308 .Lhandle_ctr32:
  309     vmovdqu     ($const),$Ii        # borrow $Ii for .Lbswap_mask
  310       vpshufb   $Ii,$T1,$Z2     # byte-swap counter
  311       vmovdqu   0x30($const),$Z1    # borrow $Z1, .Ltwo_lsb
  312       vpaddd    0x40($const),$Z2,$inout1    # .Lone_lsb
  313       vpaddd    $Z1,$Z2,$inout2
  314     vmovdqu     0x00-0x20($Xip),$Hkey   # $Hkey^1
  315       vpaddd    $Z1,$inout1,$inout3
  316       vpshufb   $Ii,$inout1,$inout1
  317       vpaddd    $Z1,$inout2,$inout4
  318       vpshufb   $Ii,$inout2,$inout2
  319       vpxor     $rndkey,$inout1,$inout1
  320       vpaddd    $Z1,$inout3,$inout5
  321       vpshufb   $Ii,$inout3,$inout3
  322       vpxor     $rndkey,$inout2,$inout2
  323       vpaddd    $Z1,$inout4,$T1     # byte-swapped next counter value
  324       vpshufb   $Ii,$inout4,$inout4
  325       vpshufb   $Ii,$inout5,$inout5
  326       vpshufb   $Ii,$T1,$T1     # next counter value
  327     jmp     .Lresume_ctr32
  328 
  329 .align  32
  330 .Lenc_tail:
  331       vaesenc   $rndkey,$inout0,$inout0
  332     vmovdqu     $Z3,16+8(%rsp)      # postpone vpxor $Z3,$Xi,$Xi
  333     vpalignr    \$8,$Z0,$Z0,$Xi     # 2nd phase
  334       vaesenc   $rndkey,$inout1,$inout1
  335     vpclmulqdq  \$0x10,$Hkey,$Z0,$Z0
  336       vpxor     0x00($inp),$T1,$T2
  337       vaesenc   $rndkey,$inout2,$inout2
  338       vpxor     0x10($inp),$T1,$Ii
  339       vaesenc   $rndkey,$inout3,$inout3
  340       vpxor     0x20($inp),$T1,$Z1
  341       vaesenc   $rndkey,$inout4,$inout4
  342       vpxor     0x30($inp),$T1,$Z2
  343       vaesenc   $rndkey,$inout5,$inout5
  344       vpxor     0x40($inp),$T1,$Z3
  345       vpxor     0x50($inp),$T1,$Hkey
  346       vmovdqu   ($ivp),$T1      # load next counter value
  347 
  348       vaesenclast   $T2,$inout0,$inout0
  349       vmovdqu   0x20($const),$T2    # borrow $T2, .Lone_msb
  350       vaesenclast   $Ii,$inout1,$inout1
  351      vpaddb     $T2,$T1,$Ii
  352     mov     %r13,0x70+8(%rsp)
  353     lea     0x60($inp),$inp
  354       vaesenclast   $Z1,$inout2,$inout2
  355      vpaddb     $T2,$Ii,$Z1
  356     mov     %r12,0x78+8(%rsp)
  357     lea     0x60($out),$out
  358       vmovdqu   0x00-0x80($key),$rndkey
  359       vaesenclast   $Z2,$inout3,$inout3
  360      vpaddb     $T2,$Z1,$Z2
  361       vaesenclast   $Z3, $inout4,$inout4
  362      vpaddb     $T2,$Z2,$Z3
  363       vaesenclast   $Hkey,$inout5,$inout5
  364      vpaddb     $T2,$Z3,$Hkey
  365 
  366     add     \$0x60,$ret
  367     sub     \$0x6,$len
  368     jc      .L6x_done
  369 
  370       vmovups   $inout0,-0x60($out) # save output
  371      vpxor      $rndkey,$T1,$inout0
  372       vmovups   $inout1,-0x50($out)
  373      vmovdqa    $Ii,$inout1     # 0 latency
  374       vmovups   $inout2,-0x40($out)
  375      vmovdqa    $Z1,$inout2     # 0 latency
  376       vmovups   $inout3,-0x30($out)
  377      vmovdqa    $Z2,$inout3     # 0 latency
  378       vmovups   $inout4,-0x20($out)
  379      vmovdqa    $Z3,$inout4     # 0 latency
  380       vmovups   $inout5,-0x10($out)
  381      vmovdqa    $Hkey,$inout5       # 0 latency
  382     vmovdqu     0x20+8(%rsp),$Z3    # I[5]
  383     jmp     .Loop6x
  384 
  385 .L6x_done:
  386     vpxor       16+8(%rsp),$Xi,$Xi  # modulo-scheduled
  387     vpxor       $Z0,$Xi,$Xi     # modulo-scheduled
  388 
  389     ret
  390 .size   _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
  391 ___
  392 ######################################################################
  393 #
  394 # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
  395 #       const AES_KEY *key, unsigned char iv[16],
  396 #       struct { u128 Xi,H,Htbl[9]; } *Xip);
  397 $code.=<<___;
  398 .globl  aesni_gcm_decrypt
  399 .type   aesni_gcm_decrypt,\@function,6
  400 .align  32
  401 aesni_gcm_decrypt:
  402     xor $ret,$ret
  403     cmp \$0x60,$len         # minimal accepted length
  404     jb  .Lgcm_dec_abort
  405 
  406     lea (%rsp),%rax         # save stack pointer
  407     push    %rbx
  408     push    %rbp
  409     push    %r12
  410     push    %r13
  411     push    %r14
  412     push    %r15
  413 ___
  414 $code.=<<___ if ($win64);
  415     lea -0xa8(%rsp),%rsp
  416     movaps  %xmm6,-0xd8(%rax)
  417     movaps  %xmm7,-0xc8(%rax)
  418     movaps  %xmm8,-0xb8(%rax)
  419     movaps  %xmm9,-0xa8(%rax)
  420     movaps  %xmm10,-0x98(%rax)
  421     movaps  %xmm11,-0x88(%rax)
  422     movaps  %xmm12,-0x78(%rax)
  423     movaps  %xmm13,-0x68(%rax)
  424     movaps  %xmm14,-0x58(%rax)
  425     movaps  %xmm15,-0x48(%rax)
  426 .Lgcm_dec_body:
  427 ___
  428 $code.=<<___;
  429     vzeroupper
  430 
  431     vmovdqu     ($ivp),$T1      # input counter value
  432     add     \$-128,%rsp
  433     mov     12($ivp),$counter
  434     lea     .Lbswap_mask(%rip),$const
  435     lea     -0x80($key),$in0    # borrow $in0
  436     mov     \$0xf80,$end0       # borrow $end0
  437     vmovdqu     ($Xip),$Xi      # load Xi
  438     and     \$-128,%rsp     # ensure stack alignment
  439     vmovdqu     ($const),$Ii        # borrow $Ii for .Lbswap_mask
  440     lea     0x80($key),$key     # size optimization
  441     lea     0x20+0x20($Xip),$Xip    # size optimization
  442     mov     0xf0-0x80($key),$rounds
  443     vpshufb     $Ii,$Xi,$Xi
  444 
  445     and     $end0,$in0
  446     and     %rsp,$end0
  447     sub     $in0,$end0
  448     jc      .Ldec_no_key_aliasing
  449     cmp     \$768,$end0
  450     jnc     .Ldec_no_key_aliasing
  451     sub     $end0,%rsp      # avoid aliasing with key
  452 .Ldec_no_key_aliasing:
  453 
  454     vmovdqu     0x50($inp),$Z3      # I[5]
  455     lea     ($inp),$in0
  456     vmovdqu     0x40($inp),$Z0
  457     lea     -0xc0($inp,$len),$end0
  458     vmovdqu     0x30($inp),$Z1
  459     shr     \$4,$len
  460     xor     $ret,$ret
  461     vmovdqu     0x20($inp),$Z2
  462      vpshufb    $Ii,$Z3,$Z3     # passed to _aesni_ctr32_ghash_6x
  463     vmovdqu     0x10($inp),$T2
  464      vpshufb    $Ii,$Z0,$Z0
  465     vmovdqu     ($inp),$Hkey
  466      vpshufb    $Ii,$Z1,$Z1
  467     vmovdqu     $Z0,0x30(%rsp)
  468      vpshufb    $Ii,$Z2,$Z2
  469     vmovdqu     $Z1,0x40(%rsp)
  470      vpshufb    $Ii,$T2,$T2
  471     vmovdqu     $Z2,0x50(%rsp)
  472      vpshufb    $Ii,$Hkey,$Hkey
  473     vmovdqu     $T2,0x60(%rsp)
  474     vmovdqu     $Hkey,0x70(%rsp)
  475 
  476     call        _aesni_ctr32_ghash_6x
  477 
  478     vmovups     $inout0,-0x60($out) # save output
  479     vmovups     $inout1,-0x50($out)
  480     vmovups     $inout2,-0x40($out)
  481     vmovups     $inout3,-0x30($out)
  482     vmovups     $inout4,-0x20($out)
  483     vmovups     $inout5,-0x10($out)
  484 
  485     vpshufb     ($const),$Xi,$Xi    # .Lbswap_mask
  486     vmovdqu     $Xi,-0x40($Xip)     # output Xi
  487 
  488     vzeroupper
  489 ___
  490 $code.=<<___ if ($win64);
  491     movaps  -0xd8(%rax),%xmm6
  492     movaps  -0xc8(%rax),%xmm7
  493     movaps  -0xb8(%rax),%xmm8
  494     movaps  -0xa8(%rax),%xmm9
  495     movaps  -0x98(%rax),%xmm10
  496     movaps  -0x88(%rax),%xmm11
  497     movaps  -0x78(%rax),%xmm12
  498     movaps  -0x68(%rax),%xmm13
  499     movaps  -0x58(%rax),%xmm14
  500     movaps  -0x48(%rax),%xmm15
  501 ___
  502 $code.=<<___;
  503     mov -48(%rax),%r15
  504     mov -40(%rax),%r14
  505     mov -32(%rax),%r13
  506     mov -24(%rax),%r12
  507     mov -16(%rax),%rbp
  508     mov -8(%rax),%rbx
  509     lea (%rax),%rsp     # restore %rsp
  510 .Lgcm_dec_abort:
  511     mov $ret,%rax       # return value
  512     ret
  513 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
  514 ___
  515 
  516 $code.=<<___;
  517 .type   _aesni_ctr32_6x,\@abi-omnipotent
  518 .align  32
  519 _aesni_ctr32_6x:
  520     vmovdqu     0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey
  521     vmovdqu     0x20($const),$T2    # borrow $T2, .Lone_msb
  522     lea     -1($rounds),%r13
  523     vmovups     0x10-0x80($key),$rndkey
  524     lea     0x20-0x80($key),%r12
  525     vpxor       $Z0,$T1,$inout0
  526     add     \$`6<<24`,$counter
  527     jc      .Lhandle_ctr32_2
  528     vpaddb      $T2,$T1,$inout1
  529     vpaddb      $T2,$inout1,$inout2
  530     vpxor       $Z0,$inout1,$inout1
  531     vpaddb      $T2,$inout2,$inout3
  532     vpxor       $Z0,$inout2,$inout2
  533     vpaddb      $T2,$inout3,$inout4
  534     vpxor       $Z0,$inout3,$inout3
  535     vpaddb      $T2,$inout4,$inout5
  536     vpxor       $Z0,$inout4,$inout4
  537     vpaddb      $T2,$inout5,$T1
  538     vpxor       $Z0,$inout5,$inout5
  539     jmp     .Loop_ctr32
  540 
  541 .align  16
  542 .Loop_ctr32:
  543     vaesenc     $rndkey,$inout0,$inout0
  544     vaesenc     $rndkey,$inout1,$inout1
  545     vaesenc     $rndkey,$inout2,$inout2
  546     vaesenc     $rndkey,$inout3,$inout3
  547     vaesenc     $rndkey,$inout4,$inout4
  548     vaesenc     $rndkey,$inout5,$inout5
  549     vmovups     (%r12),$rndkey
  550     lea     0x10(%r12),%r12
  551     dec     %r13d
  552     jnz     .Loop_ctr32
  553 
  554     vmovdqu     (%r12),$Hkey        # last round key
  555     vaesenc     $rndkey,$inout0,$inout0
  556     vpxor       0x00($inp),$Hkey,$Z0
  557     vaesenc     $rndkey,$inout1,$inout1
  558     vpxor       0x10($inp),$Hkey,$Z1
  559     vaesenc     $rndkey,$inout2,$inout2
  560     vpxor       0x20($inp),$Hkey,$Z2
  561     vaesenc     $rndkey,$inout3,$inout3
  562     vpxor       0x30($inp),$Hkey,$Xi
  563     vaesenc     $rndkey,$inout4,$inout4
  564     vpxor       0x40($inp),$Hkey,$T2
  565     vaesenc     $rndkey,$inout5,$inout5
  566     vpxor       0x50($inp),$Hkey,$Hkey
  567     lea     0x60($inp),$inp
  568 
  569     vaesenclast $Z0,$inout0,$inout0
  570     vaesenclast $Z1,$inout1,$inout1
  571     vaesenclast $Z2,$inout2,$inout2
  572     vaesenclast $Xi,$inout3,$inout3
  573     vaesenclast $T2,$inout4,$inout4
  574     vaesenclast $Hkey,$inout5,$inout5
  575     vmovups     $inout0,0x00($out)
  576     vmovups     $inout1,0x10($out)
  577     vmovups     $inout2,0x20($out)
  578     vmovups     $inout3,0x30($out)
  579     vmovups     $inout4,0x40($out)
  580     vmovups     $inout5,0x50($out)
  581     lea     0x60($out),$out
  582 
  583     ret
  584 .align  32
  585 .Lhandle_ctr32_2:
  586     vpshufb     $Ii,$T1,$Z2     # byte-swap counter
  587     vmovdqu     0x30($const),$Z1    # borrow $Z1, .Ltwo_lsb
  588     vpaddd      0x40($const),$Z2,$inout1    # .Lone_lsb
  589     vpaddd      $Z1,$Z2,$inout2
  590     vpaddd      $Z1,$inout1,$inout3
  591     vpshufb     $Ii,$inout1,$inout1
  592     vpaddd      $Z1,$inout2,$inout4
  593     vpshufb     $Ii,$inout2,$inout2
  594     vpxor       $Z0,$inout1,$inout1
  595     vpaddd      $Z1,$inout3,$inout5
  596     vpshufb     $Ii,$inout3,$inout3
  597     vpxor       $Z0,$inout2,$inout2
  598     vpaddd      $Z1,$inout4,$T1     # byte-swapped next counter value
  599     vpshufb     $Ii,$inout4,$inout4
  600     vpxor       $Z0,$inout3,$inout3
  601     vpshufb     $Ii,$inout5,$inout5
  602     vpxor       $Z0,$inout4,$inout4
  603     vpshufb     $Ii,$T1,$T1     # next counter value
  604     vpxor       $Z0,$inout5,$inout5
  605     jmp .Loop_ctr32
  606 .size   _aesni_ctr32_6x,.-_aesni_ctr32_6x
  607 
  608 .globl  aesni_gcm_encrypt
  609 .type   aesni_gcm_encrypt,\@function,6
  610 .align  32
  611 aesni_gcm_encrypt:
  612     xor $ret,$ret
  613     cmp \$0x60*3,$len           # minimal accepted length
  614     jb  .Lgcm_enc_abort
  615 
  616     lea (%rsp),%rax         # save stack pointer
  617     push    %rbx
  618     push    %rbp
  619     push    %r12
  620     push    %r13
  621     push    %r14
  622     push    %r15
  623 ___
  624 $code.=<<___ if ($win64);
  625     lea -0xa8(%rsp),%rsp
  626     movaps  %xmm6,-0xd8(%rax)
  627     movaps  %xmm7,-0xc8(%rax)
  628     movaps  %xmm8,-0xb8(%rax)
  629     movaps  %xmm9,-0xa8(%rax)
  630     movaps  %xmm10,-0x98(%rax)
  631     movaps  %xmm11,-0x88(%rax)
  632     movaps  %xmm12,-0x78(%rax)
  633     movaps  %xmm13,-0x68(%rax)
  634     movaps  %xmm14,-0x58(%rax)
  635     movaps  %xmm15,-0x48(%rax)
  636 .Lgcm_enc_body:
  637 ___
  638 $code.=<<___;
  639     vzeroupper
  640 
  641     vmovdqu     ($ivp),$T1      # input counter value
  642     add     \$-128,%rsp
  643     mov     12($ivp),$counter
  644     lea     .Lbswap_mask(%rip),$const
  645     lea     -0x80($key),$in0    # borrow $in0
  646     mov     \$0xf80,$end0       # borrow $end0
  647     lea     0x80($key),$key     # size optimization
  648     vmovdqu     ($const),$Ii        # borrow $Ii for .Lbswap_mask
  649     and     \$-128,%rsp     # ensure stack alignment
  650     mov     0xf0-0x80($key),$rounds
  651 
  652     and     $end0,$in0
  653     and     %rsp,$end0
  654     sub     $in0,$end0
  655     jc      .Lenc_no_key_aliasing
  656     cmp     \$768,$end0
  657     jnc     .Lenc_no_key_aliasing
  658     sub     $end0,%rsp      # avoid aliasing with key
  659 .Lenc_no_key_aliasing:
  660 
  661     lea     ($out),$in0
  662     lea     -0xc0($out,$len),$end0
  663     shr     \$4,$len
  664 
  665     call        _aesni_ctr32_6x
  666     vpshufb     $Ii,$inout0,$Xi     # save bswapped output on stack
  667     vpshufb     $Ii,$inout1,$T2
  668     vmovdqu     $Xi,0x70(%rsp)
  669     vpshufb     $Ii,$inout2,$Z0
  670     vmovdqu     $T2,0x60(%rsp)
  671     vpshufb     $Ii,$inout3,$Z1
  672     vmovdqu     $Z0,0x50(%rsp)
  673     vpshufb     $Ii,$inout4,$Z2
  674     vmovdqu     $Z1,0x40(%rsp)
  675     vpshufb     $Ii,$inout5,$Z3     # passed to _aesni_ctr32_ghash_6x
  676     vmovdqu     $Z2,0x30(%rsp)
  677 
  678     call        _aesni_ctr32_6x
  679 
  680     vmovdqu     ($Xip),$Xi      # load Xi
  681     lea     0x20+0x20($Xip),$Xip    # size optimization
  682     sub     \$12,$len
  683     mov     \$0x60*2,$ret
  684     vpshufb     $Ii,$Xi,$Xi
  685 
  686     call        _aesni_ctr32_ghash_6x
  687     vmovdqu     0x20(%rsp),$Z3      # I[5]
  688      vmovdqu    ($const),$Ii        # borrow $Ii for .Lbswap_mask
  689     vmovdqu     0x00-0x20($Xip),$Hkey   # $Hkey^1
  690     vpunpckhqdq $Z3,$Z3,$T1
  691     vmovdqu     0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
  692      vmovups    $inout0,-0x60($out) # save output
  693      vpshufb    $Ii,$inout0,$inout0 # but keep bswapped copy
  694     vpxor       $Z3,$T1,$T1
  695      vmovups    $inout1,-0x50($out)
  696      vpshufb    $Ii,$inout1,$inout1
  697      vmovups    $inout2,-0x40($out)
  698      vpshufb    $Ii,$inout2,$inout2
  699      vmovups    $inout3,-0x30($out)
  700      vpshufb    $Ii,$inout3,$inout3
  701      vmovups    $inout4,-0x20($out)
  702      vpshufb    $Ii,$inout4,$inout4
  703      vmovups    $inout5,-0x10($out)
  704      vpshufb    $Ii,$inout5,$inout5
  705      vmovdqu    $inout0,0x10(%rsp)  # free $inout0
  706 ___
  707 { my ($HK,$T3)=($rndkey,$inout0);
  708 
  709 $code.=<<___;
  710      vmovdqu    0x30(%rsp),$Z2      # I[4]
  711      vmovdqu    0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
  712      vpunpckhqdq    $Z2,$Z2,$T2
  713     vpclmulqdq  \$0x00,$Hkey,$Z3,$Z1
  714      vpxor      $Z2,$T2,$T2
  715     vpclmulqdq  \$0x11,$Hkey,$Z3,$Z3
  716     vpclmulqdq  \$0x00,$HK,$T1,$T1
  717 
  718      vmovdqu    0x40(%rsp),$T3      # I[3]
  719     vpclmulqdq  \$0x00,$Ii,$Z2,$Z0
  720      vmovdqu    0x30-0x20($Xip),$Hkey   # $Hkey^3
  721     vpxor       $Z1,$Z0,$Z0
  722      vpunpckhqdq    $T3,$T3,$Z1
  723     vpclmulqdq  \$0x11,$Ii,$Z2,$Z2
  724      vpxor      $T3,$Z1,$Z1
  725     vpxor       $Z3,$Z2,$Z2
  726     vpclmulqdq  \$0x10,$HK,$T2,$T2
  727      vmovdqu    0x50-0x20($Xip),$HK
  728     vpxor       $T1,$T2,$T2
  729 
  730      vmovdqu    0x50(%rsp),$T1      # I[2]
  731     vpclmulqdq  \$0x00,$Hkey,$T3,$Z3
  732      vmovdqu    0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
  733     vpxor       $Z0,$Z3,$Z3
  734      vpunpckhqdq    $T1,$T1,$Z0
  735     vpclmulqdq  \$0x11,$Hkey,$T3,$T3
  736      vpxor      $T1,$Z0,$Z0
  737     vpxor       $Z2,$T3,$T3
  738     vpclmulqdq  \$0x00,$HK,$Z1,$Z1
  739     vpxor       $T2,$Z1,$Z1
  740 
  741      vmovdqu    0x60(%rsp),$T2      # I[1]
  742     vpclmulqdq  \$0x00,$Ii,$T1,$Z2
  743      vmovdqu    0x60-0x20($Xip),$Hkey   # $Hkey^5
  744     vpxor       $Z3,$Z2,$Z2
  745      vpunpckhqdq    $T2,$T2,$Z3
  746     vpclmulqdq  \$0x11,$Ii,$T1,$T1
  747      vpxor      $T2,$Z3,$Z3
  748     vpxor       $T3,$T1,$T1
  749     vpclmulqdq  \$0x10,$HK,$Z0,$Z0
  750      vmovdqu    0x80-0x20($Xip),$HK
  751     vpxor       $Z1,$Z0,$Z0
  752 
  753      vpxor      0x70(%rsp),$Xi,$Xi  # accumulate I[0]
  754     vpclmulqdq  \$0x00,$Hkey,$T2,$Z1
  755      vmovdqu    0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
  756      vpunpckhqdq    $Xi,$Xi,$T3
  757     vpxor       $Z2,$Z1,$Z1
  758     vpclmulqdq  \$0x11,$Hkey,$T2,$T2
  759      vpxor      $Xi,$T3,$T3
  760     vpxor       $T1,$T2,$T2
  761     vpclmulqdq  \$0x00,$HK,$Z3,$Z3
  762     vpxor       $Z0,$Z3,$Z0
  763 
  764     vpclmulqdq  \$0x00,$Ii,$Xi,$Z2
  765      vmovdqu    0x00-0x20($Xip),$Hkey   # $Hkey^1
  766      vpunpckhqdq    $inout5,$inout5,$T1
  767     vpclmulqdq  \$0x11,$Ii,$Xi,$Xi
  768      vpxor      $inout5,$T1,$T1
  769     vpxor       $Z1,$Z2,$Z1
  770     vpclmulqdq  \$0x10,$HK,$T3,$T3
  771      vmovdqu    0x20-0x20($Xip),$HK
  772     vpxor       $T2,$Xi,$Z3
  773     vpxor       $Z0,$T3,$Z2
  774 
  775      vmovdqu    0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
  776       vpxor     $Z1,$Z3,$T3     # aggregated Karatsuba post-processing
  777     vpclmulqdq  \$0x00,$Hkey,$inout5,$Z0
  778       vpxor     $T3,$Z2,$Z2
  779      vpunpckhqdq    $inout4,$inout4,$T2
  780     vpclmulqdq  \$0x11,$Hkey,$inout5,$inout5
  781      vpxor      $inout4,$T2,$T2
  782       vpslldq   \$8,$Z2,$T3
  783     vpclmulqdq  \$0x00,$HK,$T1,$T1
  784       vpxor     $T3,$Z1,$Xi
  785       vpsrldq   \$8,$Z2,$Z2
  786       vpxor     $Z2,$Z3,$Z3
  787 
  788     vpclmulqdq  \$0x00,$Ii,$inout4,$Z1
  789      vmovdqu    0x30-0x20($Xip),$Hkey   # $Hkey^3
  790     vpxor       $Z0,$Z1,$Z1
  791      vpunpckhqdq    $inout3,$inout3,$T3
  792     vpclmulqdq  \$0x11,$Ii,$inout4,$inout4
  793      vpxor      $inout3,$T3,$T3
  794     vpxor       $inout5,$inout4,$inout4
  795       vpalignr  \$8,$Xi,$Xi,$inout5 # 1st phase
  796     vpclmulqdq  \$0x10,$HK,$T2,$T2
  797      vmovdqu    0x50-0x20($Xip),$HK
  798     vpxor       $T1,$T2,$T2
  799 
  800     vpclmulqdq  \$0x00,$Hkey,$inout3,$Z0
  801      vmovdqu    0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
  802     vpxor       $Z1,$Z0,$Z0
  803      vpunpckhqdq    $inout2,$inout2,$T1
  804     vpclmulqdq  \$0x11,$Hkey,$inout3,$inout3
  805      vpxor      $inout2,$T1,$T1
  806     vpxor       $inout4,$inout3,$inout3
  807       vxorps    0x10(%rsp),$Z3,$Z3  # accumulate $inout0
  808     vpclmulqdq  \$0x00,$HK,$T3,$T3
  809     vpxor       $T2,$T3,$T3
  810 
  811       vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
  812       vxorps    $inout5,$Xi,$Xi
  813 
  814     vpclmulqdq  \$0x00,$Ii,$inout2,$Z1
  815      vmovdqu    0x60-0x20($Xip),$Hkey   # $Hkey^5
  816     vpxor       $Z0,$Z1,$Z1
  817      vpunpckhqdq    $inout1,$inout1,$T2
  818     vpclmulqdq  \$0x11,$Ii,$inout2,$inout2
  819      vpxor      $inout1,$T2,$T2
  820       vpalignr  \$8,$Xi,$Xi,$inout5 # 2nd phase
  821     vpxor       $inout3,$inout2,$inout2
  822     vpclmulqdq  \$0x10,$HK,$T1,$T1
  823      vmovdqu    0x80-0x20($Xip),$HK
  824     vpxor       $T3,$T1,$T1
  825 
  826       vxorps    $Z3,$inout5,$inout5
  827       vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
  828       vxorps    $inout5,$Xi,$Xi
  829 
  830     vpclmulqdq  \$0x00,$Hkey,$inout1,$Z0
  831      vmovdqu    0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
  832     vpxor       $Z1,$Z0,$Z0
  833      vpunpckhqdq    $Xi,$Xi,$T3
  834     vpclmulqdq  \$0x11,$Hkey,$inout1,$inout1
  835      vpxor      $Xi,$T3,$T3
  836     vpxor       $inout2,$inout1,$inout1
  837     vpclmulqdq  \$0x00,$HK,$T2,$T2
  838     vpxor       $T1,$T2,$T2
  839 
  840     vpclmulqdq  \$0x00,$Ii,$Xi,$Z1
  841     vpclmulqdq  \$0x11,$Ii,$Xi,$Z3
  842     vpxor       $Z0,$Z1,$Z1
  843     vpclmulqdq  \$0x10,$HK,$T3,$Z2
  844     vpxor       $inout1,$Z3,$Z3
  845     vpxor       $T2,$Z2,$Z2
  846 
  847     vpxor       $Z1,$Z3,$Z0     # aggregated Karatsuba post-processing
  848     vpxor       $Z0,$Z2,$Z2
  849     vpslldq     \$8,$Z2,$T1
  850     vmovdqu     0x10($const),$Hkey  # .Lpoly
  851     vpsrldq     \$8,$Z2,$Z2
  852     vpxor       $T1,$Z1,$Xi
  853     vpxor       $Z2,$Z3,$Z3
  854 
  855     vpalignr    \$8,$Xi,$Xi,$T2     # 1st phase
  856     vpclmulqdq  \$0x10,$Hkey,$Xi,$Xi
  857     vpxor       $T2,$Xi,$Xi
  858 
  859     vpalignr    \$8,$Xi,$Xi,$T2     # 2nd phase
  860     vpclmulqdq  \$0x10,$Hkey,$Xi,$Xi
  861     vpxor       $Z3,$T2,$T2
  862     vpxor       $T2,$Xi,$Xi
  863 ___
  864 }
# Swap the accumulated hash back to GCM byte order (via .Lbswap_mask) and
# store it to the caller's Xi block; vzeroupper avoids AVX-to-legacy-SSE
# transition penalties once control returns to compiled code.
$code.=<<___;
	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
# Win64 ABI only: xmm6-xmm15 are callee-saved, so reload them from the
# save area below %rax (%rax holds the pre-prologue stack pointer here).
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Epilogue of aesni_gcm_encrypt: restore the six callee-saved GPRs saved
# just below %rax, rewind %rsp, and return $ret in %rax.  .Lgcm_enc_abort
# is the early-exit target that skips the restore of vector state.
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lgcm_enc_abort:
	mov	$ret,%rax		# return value
	ret
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
___
  896 
# Constant tables referenced via $const: the byte-swap shuffle mask, the
# GHASH reduction polynomial (.Lpoly, 0xc2 in the top byte), and small
# integer vectors (names suggest counter-increment constants in msb/lsb
# byte orders — confirm against the code that loads them).
$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
# Win64 structured-exception-handling support.  gcm_se_handler compares
# context->Rip against the prologue/epilogue labels stored in
# HandlerData[0]/[1]; if the frame is live it copies the saved GPRs and
# xmm6-15 back into the CONTEXT record, then finishes by chaining to
# RtlVirtualUnwind.  The .pdata/.xdata sections register the handler for
# both aesni_gcm_encrypt and aesni_gcm_decrypt.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___
.extern	__imp_RtlVirtualUnwind
.type	gcm_se_handler,\@abi-omnipotent
.align	16
gcm_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	120($context),%rax	# pull context->Rax

	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	mov	%r15,240($context)
	mov	%r14,232($context)
	mov	%r13,224($context)
	mov	%r12,216($context)
	mov	%rbp,160($context)
	mov	%rbx,144($context)

	lea	-0xd8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	gcm_se_handler,.-gcm_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_gcm_decrypt
	.rva	.LSEH_end_aesni_gcm_decrypt
	.rva	.LSEH_gcm_dec_info

	.rva	.LSEH_begin_aesni_gcm_encrypt
	.rva	.LSEH_end_aesni_gcm_encrypt
	.rva	.LSEH_gcm_enc_info
.section	.xdata
.align	8
.LSEH_gcm_dec_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_dec_body,.Lgcm_dec_abort
.LSEH_gcm_enc_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_enc_body,.Lgcm_enc_abort
___
}
 1033 }}} else {{{
# Assembler lacks AVX2/PCLMULQDQ support: replace $code wholesale with
# stub entry points that just return 0 (presumably signalling "0 bytes
# processed" so the C glue falls back — confirm against the caller).
$code=<<___;	# assembler is too old
.text

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@abi-omnipotent
aesni_gcm_encrypt:
	xor	%eax,%eax
	ret
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt

.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@abi-omnipotent
aesni_gcm_decrypt:
	xor	%eax,%eax
	ret
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
 1051 }}}
 1052 
 1053 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
 1054 
 1055 print $code;
 1056 
 1057 close STDOUT;