"Fossies" - the Fresh Open Source Software Archive

Member "openssl-1.0.2q/crypto/aes/asm/aes-s390x.pl" (20 Nov 2018, 53114 Bytes) of package /linux/misc/openssl-1.0.2q.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "aes-s390x.pl" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 1.1.0g_vs_1.1.1-pre2.

    1 #!/usr/bin/env perl
    2 
    3 # ====================================================================
    4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
    5 # project. The module is, however, dual licensed under OpenSSL and
    6 # CRYPTOGAMS licenses depending on where you obtain it. For further
    7 # details see http://www.openssl.org/~appro/cryptogams/.
    8 # ====================================================================
    9 
   10 # AES for s390x.
   11 
   12 # April 2007.
   13 #
   14 # Software performance improvement over gcc-generated code is ~70% and
   15 # in absolute terms is ~73 cycles per byte processed with 128-bit key.
   16 # You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
   17 # *strictly* in-order execution and issued instruction [in this case
   18 # load value from memory is critical] has to complete before execution
   19 # flow proceeds. S-boxes are compressed to 2KB[+256B].
   20 #
   21 # As for hardware acceleration support. It's basically a "teaser," as
   22 # it can and should be improved in several ways. Most notably support
   23 # for CBC is not utilized, nor multiple blocks are ever processed.
   24 # Then software key schedule can be postponed till hardware support
   25 # detection... Performance improvement over assembler is reportedly
   26 # ~2.5x, but can reach >8x [naturally on larger chunks] if proper
   27 # support is implemented.
   28 
   29 # May 2007.
   30 #
   31 # Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
   32 # for 128-bit keys, if hardware support is detected.
   33 
   34 # Januray 2009.
   35 #
   36 # Add support for hardware AES192/256 and reschedule instructions to
   37 # minimize/avoid Address Generation Interlock hazard and to favour
   38 # dual-issue z10 pipeline. This gave ~25% improvement on z10 and
   39 # almost 50% on z9. The gain is smaller on z10, because being dual-
   40 # issue z10 makes it improssible to eliminate the interlock condition:
   41 # critial path is not long enough. Yet it spends ~24 cycles per byte
   42 # processed with 128-bit key.
   43 #
   44 # Unlike previous version hardware support detection takes place only
   45 # at the moment of key schedule setup, which is denoted in key->rounds.
   46 # This is done, because deferred key setup can't be made MT-safe, not
   47 # for keys longer than 128 bits.
   48 #
   49 # Add AES_cbc_encrypt, which gives incredible performance improvement,
   50 # it was measured to be ~6.6x. It's less than previously mentioned 8x,
   51 # because software implementation was optimized.
   52 
   53 # May 2010.
   54 #
   55 # Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
   56 # performance improvement over "generic" counter mode routine relying
   57 # on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
   58 # to the fact that exact throughput value depends on current stack
   59 # frame alignment within 4KB page. In worst case you get ~75% of the
   60 # maximum, but *on average* it would be as much as ~98%. Meaning that
   61 # worst case is unlike, it's like hitting ravine on plateau.
   62 
   63 # November 2010.
   64 #
   65 # Adapt for -m31 build. If kernel supports what's called "highgprs"
   66 # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
   67 # instructions and achieve "64-bit" performance even in 31-bit legacy
   68 # application context. The feature is not specific to any particular
   69 # processor, as long as it's "z-CPU". Latter implies that the code
   70 # remains z/Architecture specific. On z990 it was measured to perform
   71 # 2x better than code generated by gcc 4.3.
   72 
   73 # December 2010.
   74 #
   75 # Add support for z196 "cipher message with counter" instruction.
   76 # Note however that it's disengaged, because it was measured to
   77 # perform ~12% worse than vanilla km-based code...
   78 
   79 # February 2011.
   80 #
   81 # Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
   82 # instructions, which deliver ~70% improvement at 8KB block size over
   83 # vanilla km-based code, 37% - at most like 512-bytes block size.
   84 
   85 $flavour = shift;
   86 
   87 if ($flavour =~ /3[12]/) {
   88     $SIZE_T=4;
   89     $g="";
   90 } else {
   91     $SIZE_T=8;
   92     $g="g";
   93 }
   94 
   95 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
   96 open STDOUT,">$output";
   97 
   98 $softonly=0;    # allow hardware support
   99 
  100 $t0="%r0";  $mask="%r0";
  101 $t1="%r1";
  102 $t2="%r2";  $inp="%r2";
  103 $t3="%r3";  $out="%r3"; $bits="%r3";
  104 $key="%r4";
  105 $i1="%r5";
  106 $i2="%r6";
  107 $i3="%r7";
  108 $s0="%r8";
  109 $s1="%r9";
  110 $s2="%r10";
  111 $s3="%r11";
  112 $tbl="%r12";
  113 $rounds="%r13";
  114 $ra="%r14";
  115 $sp="%r15";
  116 
  117 $stdframe=16*$SIZE_T+4*8;
  118 
  119 sub _data_word()
  120 { my $i;
  121     while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
  122 }
  123 
  124 $code=<<___;
  125 .text
  126 
  127 .type   AES_Te,\@object
  128 .align  256
  129 AES_Te:
  130 ___
  131 &_data_word(
  132     0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
  133     0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
  134     0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
  135     0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
  136     0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
  137     0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
  138     0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
  139     0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
  140     0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
  141     0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
  142     0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
  143     0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
  144     0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
  145     0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
  146     0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
  147     0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
  148     0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
  149     0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
  150     0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
  151     0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
  152     0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
  153     0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
  154     0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
  155     0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
  156     0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
  157     0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
  158     0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
  159     0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
  160     0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
  161     0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
  162     0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
  163     0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
  164     0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
  165     0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
  166     0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
  167     0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
  168     0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
  169     0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
  170     0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
  171     0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
  172     0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
  173     0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
  174     0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
  175     0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
  176     0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
  177     0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
  178     0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
  179     0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
  180     0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
  181     0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
  182     0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
  183     0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
  184     0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
  185     0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
  186     0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
  187     0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
  188     0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
  189     0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
  190     0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
  191     0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
  192     0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
  193     0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
  194     0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
  195     0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
  196 $code.=<<___;
  197 # Te4[256]
  198 .byte   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
  199 .byte   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
  200 .byte   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
  201 .byte   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
  202 .byte   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
  203 .byte   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
  204 .byte   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
  205 .byte   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
  206 .byte   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
  207 .byte   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
  208 .byte   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
  209 .byte   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
  210 .byte   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
  211 .byte   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
  212 .byte   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
  213 .byte   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
  214 .byte   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
  215 .byte   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
  216 .byte   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
  217 .byte   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
  218 .byte   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
  219 .byte   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
  220 .byte   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
  221 .byte   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
  222 .byte   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
  223 .byte   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
  224 .byte   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
  225 .byte   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
  226 .byte   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
  227 .byte   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
  228 .byte   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
  229 .byte   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
  230 # rcon[]
  231 .long   0x01000000, 0x02000000, 0x04000000, 0x08000000
  232 .long   0x10000000, 0x20000000, 0x40000000, 0x80000000
  233 .long   0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
  234 .align  256
  235 .size   AES_Te,.-AES_Te
  236 
  237 # void AES_encrypt(const unsigned char *inp, unsigned char *out,
  238 #        const AES_KEY *key) {
  239 .globl  AES_encrypt
  240 .type   AES_encrypt,\@function
  241 AES_encrypt:
  242 ___
  243 $code.=<<___ if (!$softonly);
  244     l   %r0,240($key)
  245     lhi %r1,16
  246     clr %r0,%r1
  247     jl  .Lesoft
  248 
  249     la  %r1,0($key)
  250     #la %r2,0($inp)
  251     la  %r4,0($out)
  252     lghi    %r3,16      # single block length
  253     .long   0xb92e0042  # km %r4,%r2
  254     brc 1,.-4       # can this happen?
  255     br  %r14
  256 .align  64
  257 .Lesoft:
  258 ___
  259 $code.=<<___;
  260     stm${g} %r3,$ra,3*$SIZE_T($sp)
  261 
  262     llgf    $s0,0($inp)
  263     llgf    $s1,4($inp)
  264     llgf    $s2,8($inp)
  265     llgf    $s3,12($inp)
  266 
  267     larl    $tbl,AES_Te
  268     bras    $ra,_s390x_AES_encrypt
  269 
  270     l${g}   $out,3*$SIZE_T($sp)
  271     st  $s0,0($out)
  272     st  $s1,4($out)
  273     st  $s2,8($out)
  274     st  $s3,12($out)
  275 
  276     lm${g}  %r6,$ra,6*$SIZE_T($sp)
  277     br  $ra
  278 .size   AES_encrypt,.-AES_encrypt
  279 
  280 .type   _s390x_AES_encrypt,\@function
  281 .align  16
  282 _s390x_AES_encrypt:
  283     st${g}  $ra,15*$SIZE_T($sp)
  284     x   $s0,0($key)
  285     x   $s1,4($key)
  286     x   $s2,8($key)
  287     x   $s3,12($key)
  288     l   $rounds,240($key)
  289     llill   $mask,`0xff<<3`
  290     aghi    $rounds,-1
  291     j   .Lenc_loop
  292 .align  16
  293 .Lenc_loop:
  294     sllg    $t1,$s0,`0+3`
  295     srlg    $t2,$s0,`8-3`
  296     srlg    $t3,$s0,`16-3`
  297     srl $s0,`24-3`
  298     nr  $s0,$mask
  299     ngr $t1,$mask
  300     nr  $t2,$mask
  301     nr  $t3,$mask
  302 
  303     srlg    $i1,$s1,`16-3`  # i0
  304     sllg    $i2,$s1,`0+3`
  305     srlg    $i3,$s1,`8-3`
  306     srl $s1,`24-3`
  307     nr  $i1,$mask
  308     nr  $s1,$mask
  309     ngr $i2,$mask
  310     nr  $i3,$mask
  311 
  312     l   $s0,0($s0,$tbl) # Te0[s0>>24]
  313     l   $t1,1($t1,$tbl) # Te3[s0>>0]
  314     l   $t2,2($t2,$tbl) # Te2[s0>>8]
  315     l   $t3,3($t3,$tbl) # Te1[s0>>16]
  316 
  317     x   $s0,3($i1,$tbl) # Te1[s1>>16]
  318     l   $s1,0($s1,$tbl) # Te0[s1>>24]
  319     x   $t2,1($i2,$tbl) # Te3[s1>>0]
  320     x   $t3,2($i3,$tbl) # Te2[s1>>8]
  321 
  322     srlg    $i1,$s2,`8-3`   # i0
  323     srlg    $i2,$s2,`16-3`  # i1
  324     nr  $i1,$mask
  325     nr  $i2,$mask
  326     sllg    $i3,$s2,`0+3`
  327     srl $s2,`24-3`
  328     nr  $s2,$mask
  329     ngr $i3,$mask
  330 
  331     xr  $s1,$t1
  332     srlg    $ra,$s3,`8-3`   # i1
  333     sllg    $t1,$s3,`0+3`   # i0
  334     nr  $ra,$mask
  335     la  $key,16($key)
  336     ngr $t1,$mask
  337 
  338     x   $s0,2($i1,$tbl) # Te2[s2>>8]
  339     x   $s1,3($i2,$tbl) # Te1[s2>>16]
  340     l   $s2,0($s2,$tbl) # Te0[s2>>24]
  341     x   $t3,1($i3,$tbl) # Te3[s2>>0]
  342 
  343     srlg    $i3,$s3,`16-3`  # i2
  344     xr  $s2,$t2
  345     srl $s3,`24-3`
  346     nr  $i3,$mask
  347     nr  $s3,$mask
  348 
  349     x   $s0,0($key)
  350     x   $s1,4($key)
  351     x   $s2,8($key)
  352     x   $t3,12($key)
  353 
  354     x   $s0,1($t1,$tbl) # Te3[s3>>0]
  355     x   $s1,2($ra,$tbl) # Te2[s3>>8]
  356     x   $s2,3($i3,$tbl) # Te1[s3>>16]
  357     l   $s3,0($s3,$tbl) # Te0[s3>>24]
  358     xr  $s3,$t3
  359 
  360     brct    $rounds,.Lenc_loop
  361     .align  16
  362 
  363     sllg    $t1,$s0,`0+3`
  364     srlg    $t2,$s0,`8-3`
  365     ngr $t1,$mask
  366     srlg    $t3,$s0,`16-3`
  367     srl $s0,`24-3`
  368     nr  $s0,$mask
  369     nr  $t2,$mask
  370     nr  $t3,$mask
  371 
  372     srlg    $i1,$s1,`16-3`  # i0
  373     sllg    $i2,$s1,`0+3`
  374     ngr $i2,$mask
  375     srlg    $i3,$s1,`8-3`
  376     srl $s1,`24-3`
  377     nr  $i1,$mask
  378     nr  $s1,$mask
  379     nr  $i3,$mask
  380 
  381     llgc    $s0,2($s0,$tbl) # Te4[s0>>24]
  382     llgc    $t1,2($t1,$tbl) # Te4[s0>>0]
  383     sll $s0,24
  384     llgc    $t2,2($t2,$tbl) # Te4[s0>>8]
  385     llgc    $t3,2($t3,$tbl) # Te4[s0>>16]
  386     sll $t2,8
  387     sll $t3,16
  388 
  389     llgc    $i1,2($i1,$tbl) # Te4[s1>>16]
  390     llgc    $s1,2($s1,$tbl) # Te4[s1>>24]
  391     llgc    $i2,2($i2,$tbl) # Te4[s1>>0]
  392     llgc    $i3,2($i3,$tbl) # Te4[s1>>8]
  393     sll $i1,16
  394     sll $s1,24
  395     sll $i3,8
  396     or  $s0,$i1
  397     or  $s1,$t1
  398     or  $t2,$i2
  399     or  $t3,$i3
  400     
  401     srlg    $i1,$s2,`8-3`   # i0
  402     srlg    $i2,$s2,`16-3`  # i1
  403     nr  $i1,$mask
  404     nr  $i2,$mask
  405     sllg    $i3,$s2,`0+3`
  406     srl $s2,`24-3`
  407     ngr $i3,$mask
  408     nr  $s2,$mask
  409 
  410     sllg    $t1,$s3,`0+3`   # i0
  411     srlg    $ra,$s3,`8-3`   # i1
  412     ngr $t1,$mask
  413 
  414     llgc    $i1,2($i1,$tbl) # Te4[s2>>8]
  415     llgc    $i2,2($i2,$tbl) # Te4[s2>>16]
  416     sll $i1,8
  417     llgc    $s2,2($s2,$tbl) # Te4[s2>>24]
  418     llgc    $i3,2($i3,$tbl) # Te4[s2>>0]
  419     sll $i2,16
  420     nr  $ra,$mask
  421     sll $s2,24
  422     or  $s0,$i1
  423     or  $s1,$i2
  424     or  $s2,$t2
  425     or  $t3,$i3
  426 
  427     srlg    $i3,$s3,`16-3`  # i2
  428     srl $s3,`24-3`
  429     nr  $i3,$mask
  430     nr  $s3,$mask
  431 
  432     l   $t0,16($key)
  433     l   $t2,20($key)
  434 
  435     llgc    $i1,2($t1,$tbl) # Te4[s3>>0]
  436     llgc    $i2,2($ra,$tbl) # Te4[s3>>8]
  437     llgc    $i3,2($i3,$tbl) # Te4[s3>>16]
  438     llgc    $s3,2($s3,$tbl) # Te4[s3>>24]
  439     sll $i2,8
  440     sll $i3,16
  441     sll $s3,24
  442     or  $s0,$i1
  443     or  $s1,$i2
  444     or  $s2,$i3
  445     or  $s3,$t3
  446 
  447     l${g}   $ra,15*$SIZE_T($sp)
  448     xr  $s0,$t0
  449     xr  $s1,$t2
  450     x   $s2,24($key)
  451     x   $s3,28($key)
  452 
  453     br  $ra 
  454 .size   _s390x_AES_encrypt,.-_s390x_AES_encrypt
  455 ___
  456 
  457 $code.=<<___;
  458 .type   AES_Td,\@object
  459 .align  256
  460 AES_Td:
  461 ___
  462 &_data_word(
  463     0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
  464     0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
  465     0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
  466     0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
  467     0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
  468     0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
  469     0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
  470     0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
  471     0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
  472     0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
  473     0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
  474     0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
  475     0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
  476     0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
  477     0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
  478     0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
  479     0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
  480     0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
  481     0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
  482     0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
  483     0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
  484     0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
  485     0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
  486     0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
  487     0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
  488     0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
  489     0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
  490     0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
  491     0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
  492     0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
  493     0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
  494     0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
  495     0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
  496     0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
  497     0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
  498     0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
  499     0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
  500     0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
  501     0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
  502     0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
  503     0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
  504     0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
  505     0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
  506     0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
  507     0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
  508     0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
  509     0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
  510     0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
  511     0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
  512     0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
  513     0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
  514     0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
  515     0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
  516     0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
  517     0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
  518     0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
  519     0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
  520     0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
  521     0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
  522     0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
  523     0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
  524     0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
  525     0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
  526     0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
  527 $code.=<<___;
  528 # Td4[256]
  529 .byte   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
  530 .byte   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
  531 .byte   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
  532 .byte   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
  533 .byte   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
  534 .byte   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
  535 .byte   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
  536 .byte   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
  537 .byte   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
  538 .byte   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
  539 .byte   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
  540 .byte   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
  541 .byte   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
  542 .byte   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
  543 .byte   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
  544 .byte   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
  545 .byte   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
  546 .byte   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
  547 .byte   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
  548 .byte   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
  549 .byte   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
  550 .byte   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
  551 .byte   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
  552 .byte   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
  553 .byte   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
  554 .byte   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
  555 .byte   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
  556 .byte   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
  557 .byte   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
  558 .byte   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
  559 .byte   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
  560 .byte   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
  561 .size   AES_Td,.-AES_Td
  562 
  563 # void AES_decrypt(const unsigned char *inp, unsigned char *out,
  564 #        const AES_KEY *key) {
  565 .globl  AES_decrypt
  566 .type   AES_decrypt,\@function
  567 AES_decrypt:
  568 ___
  569 $code.=<<___ if (!$softonly);
  570     l   %r0,240($key)
  571     lhi %r1,16
  572     clr %r0,%r1
  573     jl  .Ldsoft
  574 
  575     la  %r1,0($key)
  576     #la %r2,0($inp)
  577     la  %r4,0($out)
  578     lghi    %r3,16      # single block length
  579     .long   0xb92e0042  # km %r4,%r2
  580     brc 1,.-4       # can this happen?
  581     br  %r14
  582 .align  64
  583 .Ldsoft:
  584 ___
  585 $code.=<<___;
  586     stm${g} %r3,$ra,3*$SIZE_T($sp)
  587 
  588     llgf    $s0,0($inp)
  589     llgf    $s1,4($inp)
  590     llgf    $s2,8($inp)
  591     llgf    $s3,12($inp)
  592 
  593     larl    $tbl,AES_Td
  594     bras    $ra,_s390x_AES_decrypt
  595 
  596     l${g}   $out,3*$SIZE_T($sp)
  597     st  $s0,0($out)
  598     st  $s1,4($out)
  599     st  $s2,8($out)
  600     st  $s3,12($out)
  601 
  602     lm${g}  %r6,$ra,6*$SIZE_T($sp)
  603     br  $ra
  604 .size   AES_decrypt,.-AES_decrypt
  605 
  606 .type   _s390x_AES_decrypt,\@function
  607 .align  16
  608 _s390x_AES_decrypt:
  609     st${g}  $ra,15*$SIZE_T($sp)
  610     x   $s0,0($key)
  611     x   $s1,4($key)
  612     x   $s2,8($key)
  613     x   $s3,12($key)
  614     l   $rounds,240($key)
  615     llill   $mask,`0xff<<3`
  616     aghi    $rounds,-1
  617     j   .Ldec_loop
  618 .align  16
  619 .Ldec_loop:
  620     srlg    $t1,$s0,`16-3`
  621     srlg    $t2,$s0,`8-3`
  622     sllg    $t3,$s0,`0+3`
  623     srl $s0,`24-3`
  624     nr  $s0,$mask
  625     nr  $t1,$mask
  626     nr  $t2,$mask
  627     ngr $t3,$mask
  628 
  629     sllg    $i1,$s1,`0+3`   # i0
  630     srlg    $i2,$s1,`16-3`
  631     srlg    $i3,$s1,`8-3`
  632     srl $s1,`24-3`
  633     ngr $i1,$mask
  634     nr  $s1,$mask
  635     nr  $i2,$mask
  636     nr  $i3,$mask
  637 
  638     l   $s0,0($s0,$tbl) # Td0[s0>>24]
  639     l   $t1,3($t1,$tbl) # Td1[s0>>16]
  640     l   $t2,2($t2,$tbl) # Td2[s0>>8]
  641     l   $t3,1($t3,$tbl) # Td3[s0>>0]
  642 
  643     x   $s0,1($i1,$tbl) # Td3[s1>>0]
  644     l   $s1,0($s1,$tbl) # Td0[s1>>24]
  645     x   $t2,3($i2,$tbl) # Td1[s1>>16]
  646     x   $t3,2($i3,$tbl) # Td2[s1>>8]
  647 
  648     srlg    $i1,$s2,`8-3`   # i0
  649     sllg    $i2,$s2,`0+3`   # i1
  650     srlg    $i3,$s2,`16-3`
  651     srl $s2,`24-3`
  652     nr  $i1,$mask
  653     ngr $i2,$mask
  654     nr  $s2,$mask
  655     nr  $i3,$mask
  656 
  657     xr  $s1,$t1
  658     srlg    $ra,$s3,`8-3`   # i1
  659     srlg    $t1,$s3,`16-3`  # i0
  660     nr  $ra,$mask
  661     la  $key,16($key)
  662     nr  $t1,$mask
  663 
  664     x   $s0,2($i1,$tbl) # Td2[s2>>8]
  665     x   $s1,1($i2,$tbl) # Td3[s2>>0]
  666     l   $s2,0($s2,$tbl) # Td0[s2>>24]
  667     x   $t3,3($i3,$tbl) # Td1[s2>>16]
  668 
  669     sllg    $i3,$s3,`0+3`   # i2
  670     srl $s3,`24-3`
  671     ngr $i3,$mask
  672     nr  $s3,$mask
  673 
  674     xr  $s2,$t2
  675     x   $s0,0($key)
  676     x   $s1,4($key)
  677     x   $s2,8($key)
  678     x   $t3,12($key)
  679 
  680     x   $s0,3($t1,$tbl) # Td1[s3>>16]
  681     x   $s1,2($ra,$tbl) # Td2[s3>>8]
  682     x   $s2,1($i3,$tbl) # Td3[s3>>0]
  683     l   $s3,0($s3,$tbl) # Td0[s3>>24]
  684     xr  $s3,$t3
  685 
  686     brct    $rounds,.Ldec_loop
  687     .align  16
  688 
  689     l   $t1,`2048+0`($tbl)  # prefetch Td4
  690     l   $t2,`2048+64`($tbl)
  691     l   $t3,`2048+128`($tbl)
  692     l   $i1,`2048+192`($tbl)
  693     llill   $mask,0xff
  694 
  695     srlg    $i3,$s0,24  # i0
  696     srlg    $t1,$s0,16
  697     srlg    $t2,$s0,8
  698     nr  $s0,$mask   # i3
  699     nr  $t1,$mask
  700 
  701     srlg    $i1,$s1,24
  702     nr  $t2,$mask
  703     srlg    $i2,$s1,16
  704     srlg    $ra,$s1,8
  705     nr  $s1,$mask   # i0
  706     nr  $i2,$mask
  707     nr  $ra,$mask
  708 
  709     llgc    $i3,2048($i3,$tbl)  # Td4[s0>>24]
  710     llgc    $t1,2048($t1,$tbl)  # Td4[s0>>16]
  711     llgc    $t2,2048($t2,$tbl)  # Td4[s0>>8]
  712     sll $t1,16
  713     llgc    $t3,2048($s0,$tbl)  # Td4[s0>>0]
  714     sllg    $s0,$i3,24
  715     sll $t2,8
  716 
  717     llgc    $s1,2048($s1,$tbl)  # Td4[s1>>0]
  718     llgc    $i1,2048($i1,$tbl)  # Td4[s1>>24]
  719     llgc    $i2,2048($i2,$tbl)  # Td4[s1>>16]
  720     sll $i1,24
  721     llgc    $i3,2048($ra,$tbl)  # Td4[s1>>8]
  722     sll $i2,16
  723     sll $i3,8
  724     or  $s0,$s1
  725     or  $t1,$i1
  726     or  $t2,$i2
  727     or  $t3,$i3
  728 
  729     srlg    $i1,$s2,8   # i0
  730     srlg    $i2,$s2,24
  731     srlg    $i3,$s2,16
  732     nr  $s2,$mask   # i1
  733     nr  $i1,$mask
  734     nr  $i3,$mask
  735     llgc    $i1,2048($i1,$tbl)  # Td4[s2>>8]
  736     llgc    $s1,2048($s2,$tbl)  # Td4[s2>>0]
  737     llgc    $i2,2048($i2,$tbl)  # Td4[s2>>24]
  738     llgc    $i3,2048($i3,$tbl)  # Td4[s2>>16]
  739     sll $i1,8
  740     sll $i2,24
  741     or  $s0,$i1
  742     sll $i3,16
  743     or  $t2,$i2
  744     or  $t3,$i3
  745 
  746     srlg    $i1,$s3,16  # i0
  747     srlg    $i2,$s3,8   # i1
  748     srlg    $i3,$s3,24
  749     nr  $s3,$mask   # i2
  750     nr  $i1,$mask
  751     nr  $i2,$mask
  752 
  753     l${g}   $ra,15*$SIZE_T($sp)
  754     or  $s1,$t1
  755     l   $t0,16($key)
  756     l   $t1,20($key)
  757 
  758     llgc    $i1,2048($i1,$tbl)  # Td4[s3>>16]
  759     llgc    $i2,2048($i2,$tbl)  # Td4[s3>>8]
  760     sll $i1,16
  761     llgc    $s2,2048($s3,$tbl)  # Td4[s3>>0]
  762     llgc    $s3,2048($i3,$tbl)  # Td4[s3>>24]
  763     sll $i2,8
  764     sll $s3,24
  765     or  $s0,$i1
  766     or  $s1,$i2
  767     or  $s2,$t2
  768     or  $s3,$t3
  769 
  770     xr  $s0,$t0
  771     xr  $s1,$t1
  772     x   $s2,24($key)
  773     x   $s3,28($key)
  774 
  775     br  $ra 
  776 .size   _s390x_AES_decrypt,.-_s390x_AES_decrypt
  777 ___
  778 
  779 $code.=<<___;
  780 # void AES_set_encrypt_key(const unsigned char *in, int bits,
  781 #        AES_KEY *key) {
  782 .globl  private_AES_set_encrypt_key
  783 .type   private_AES_set_encrypt_key,\@function
  784 .align  16
  785 private_AES_set_encrypt_key:
  786 _s390x_AES_set_encrypt_key:
  787     lghi    $t0,0
  788     cl${g}r $inp,$t0
  789     je  .Lminus1
  790     cl${g}r $key,$t0
  791     je  .Lminus1
  792 
  793     lghi    $t0,128
  794     clr $bits,$t0
  795     je  .Lproceed
  796     lghi    $t0,192
  797     clr $bits,$t0
  798     je  .Lproceed
  799     lghi    $t0,256
  800     clr $bits,$t0
  801     je  .Lproceed
  802     lghi    %r2,-2
  803     br  %r14
  804 
  805 .align  16
  806 .Lproceed:
  807 ___
  808 $code.=<<___ if (!$softonly);
  809     # convert bits to km code, [128,192,256]->[18,19,20]
  810     lhi %r5,-128
  811     lhi %r0,18
  812     ar  %r5,$bits
  813     srl %r5,6
  814     ar  %r5,%r0
  815 
  816     larl    %r1,OPENSSL_s390xcap_P
  817     lg  %r0,0(%r1)
  818     tmhl    %r0,0x4000  # check for message-security assist
  819     jz  .Lekey_internal
  820 
  821     llihh   %r0,0x8000
  822     srlg    %r0,%r0,0(%r5)
  823     ng  %r0,48(%r1) # check kmc capability vector
  824     jz  .Lekey_internal
  825 
  826     lmg %r0,%r1,0($inp) # just copy 128 bits...
  827     stmg    %r0,%r1,0($key)
  828     lhi %r0,192
  829     cr  $bits,%r0
  830     jl  1f
  831     lg  %r1,16($inp)
  832     stg %r1,16($key)
  833     je  1f
  834     lg  %r1,24($inp)
  835     stg %r1,24($key)
  836 1:  st  $bits,236($key) # save bits [for debugging purposes]
  837     lgr $t0,%r5
  838     st  %r5,240($key)   # save km code
  839     lghi    %r2,0
  840     br  %r14
  841 ___
  842 $code.=<<___;
  843 .align  16
  844 .Lekey_internal:
  845     stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
  846 
  847     larl    $tbl,AES_Te+2048
  848 
  849     llgf    $s0,0($inp)
  850     llgf    $s1,4($inp)
  851     llgf    $s2,8($inp)
  852     llgf    $s3,12($inp)
  853     st  $s0,0($key)
  854     st  $s1,4($key)
  855     st  $s2,8($key)
  856     st  $s3,12($key)
  857     lghi    $t0,128
  858     cr  $bits,$t0
  859     jne .Lnot128
  860 
  861     llill   $mask,0xff
  862     lghi    $t3,0           # i=0
  863     lghi    $rounds,10
  864     st  $rounds,240($key)
  865 
  866     llgfr   $t2,$s3         # temp=rk[3]
  867     srlg    $i1,$s3,8
  868     srlg    $i2,$s3,16
  869     srlg    $i3,$s3,24
  870     nr  $t2,$mask
  871     nr  $i1,$mask
  872     nr  $i2,$mask
  873 
  874 .align  16
  875 .L128_loop:
  876     la  $t2,0($t2,$tbl)
  877     la  $i1,0($i1,$tbl)
  878     la  $i2,0($i2,$tbl)
  879     la  $i3,0($i3,$tbl)
  880     icm $t2,2,0($t2)        # Te4[rk[3]>>0]<<8
  881     icm $t2,4,0($i1)        # Te4[rk[3]>>8]<<16
  882     icm $t2,8,0($i2)        # Te4[rk[3]>>16]<<24
  883     icm $t2,1,0($i3)        # Te4[rk[3]>>24]
  884     x   $t2,256($t3,$tbl)   # rcon[i]
  885     xr  $s0,$t2         # rk[4]=rk[0]^...
  886     xr  $s1,$s0         # rk[5]=rk[1]^rk[4]
  887     xr  $s2,$s1         # rk[6]=rk[2]^rk[5]
  888     xr  $s3,$s2         # rk[7]=rk[3]^rk[6]
  889 
  890     llgfr   $t2,$s3         # temp=rk[3]
  891     srlg    $i1,$s3,8
  892     srlg    $i2,$s3,16
  893     nr  $t2,$mask
  894     nr  $i1,$mask
  895     srlg    $i3,$s3,24
  896     nr  $i2,$mask
  897 
  898     st  $s0,16($key)
  899     st  $s1,20($key)
  900     st  $s2,24($key)
  901     st  $s3,28($key)
  902     la  $key,16($key)       # key+=4
  903     la  $t3,4($t3)      # i++
  904     brct    $rounds,.L128_loop
  905     lghi    $t0,10
  906     lghi    %r2,0
  907     lm${g}  %r4,%r13,4*$SIZE_T($sp)
  908     br  $ra
  909 
  910 .align  16
  911 .Lnot128:
  912     llgf    $t0,16($inp)
  913     llgf    $t1,20($inp)
  914     st  $t0,16($key)
  915     st  $t1,20($key)
  916     lghi    $t0,192
  917     cr  $bits,$t0
  918     jne .Lnot192
  919 
  920     llill   $mask,0xff
  921     lghi    $t3,0           # i=0
  922     lghi    $rounds,12
  923     st  $rounds,240($key)
  924     lghi    $rounds,8
  925 
  926     srlg    $i1,$t1,8
  927     srlg    $i2,$t1,16
  928     srlg    $i3,$t1,24
  929     nr  $t1,$mask
  930     nr  $i1,$mask
  931     nr  $i2,$mask
  932 
  933 .align  16
  934 .L192_loop:
  935     la  $t1,0($t1,$tbl)
  936     la  $i1,0($i1,$tbl)
  937     la  $i2,0($i2,$tbl)
  938     la  $i3,0($i3,$tbl)
  939     icm $t1,2,0($t1)        # Te4[rk[5]>>0]<<8
  940     icm $t1,4,0($i1)        # Te4[rk[5]>>8]<<16
  941     icm $t1,8,0($i2)        # Te4[rk[5]>>16]<<24
  942     icm $t1,1,0($i3)        # Te4[rk[5]>>24]
  943     x   $t1,256($t3,$tbl)   # rcon[i]
  944     xr  $s0,$t1         # rk[6]=rk[0]^...
  945     xr  $s1,$s0         # rk[7]=rk[1]^rk[6]
  946     xr  $s2,$s1         # rk[8]=rk[2]^rk[7]
  947     xr  $s3,$s2         # rk[9]=rk[3]^rk[8]
  948 
  949     st  $s0,24($key)
  950     st  $s1,28($key)
  951     st  $s2,32($key)
  952     st  $s3,36($key)
  953     brct    $rounds,.L192_continue
  954     lghi    $t0,12
  955     lghi    %r2,0
  956     lm${g}  %r4,%r13,4*$SIZE_T($sp)
  957     br  $ra
  958 
  959 .align  16
  960 .L192_continue:
  961     lgr $t1,$s3
  962     x   $t1,16($key)        # rk[10]=rk[4]^rk[9]
  963     st  $t1,40($key)
  964     x   $t1,20($key)        # rk[11]=rk[5]^rk[10]
  965     st  $t1,44($key)
  966 
  967     srlg    $i1,$t1,8
  968     srlg    $i2,$t1,16
  969     srlg    $i3,$t1,24
  970     nr  $t1,$mask
  971     nr  $i1,$mask
  972     nr  $i2,$mask
  973 
  974     la  $key,24($key)       # key+=6
  975     la  $t3,4($t3)      # i++
  976     j   .L192_loop
  977 
  978 .align  16
  979 .Lnot192:
  980     llgf    $t0,24($inp)
  981     llgf    $t1,28($inp)
  982     st  $t0,24($key)
  983     st  $t1,28($key)
  984     llill   $mask,0xff
  985     lghi    $t3,0           # i=0
  986     lghi    $rounds,14
  987     st  $rounds,240($key)
  988     lghi    $rounds,7
  989 
  990     srlg    $i1,$t1,8
  991     srlg    $i2,$t1,16
  992     srlg    $i3,$t1,24
  993     nr  $t1,$mask
  994     nr  $i1,$mask
  995     nr  $i2,$mask
  996 
  997 .align  16
  998 .L256_loop:
  999     la  $t1,0($t1,$tbl)
 1000     la  $i1,0($i1,$tbl)
 1001     la  $i2,0($i2,$tbl)
 1002     la  $i3,0($i3,$tbl)
 1003     icm $t1,2,0($t1)        # Te4[rk[7]>>0]<<8
 1004     icm $t1,4,0($i1)        # Te4[rk[7]>>8]<<16
 1005     icm $t1,8,0($i2)        # Te4[rk[7]>>16]<<24
 1006     icm $t1,1,0($i3)        # Te4[rk[7]>>24]
 1007     x   $t1,256($t3,$tbl)   # rcon[i]
 1008     xr  $s0,$t1         # rk[8]=rk[0]^...
 1009     xr  $s1,$s0         # rk[9]=rk[1]^rk[8]
 1010     xr  $s2,$s1         # rk[10]=rk[2]^rk[9]
 1011     xr  $s3,$s2         # rk[11]=rk[3]^rk[10]
 1012     st  $s0,32($key)
 1013     st  $s1,36($key)
 1014     st  $s2,40($key)
 1015     st  $s3,44($key)
 1016     brct    $rounds,.L256_continue
 1017     lghi    $t0,14
 1018     lghi    %r2,0
 1019     lm${g}  %r4,%r13,4*$SIZE_T($sp)
 1020     br  $ra
 1021 
 1022 .align  16
 1023 .L256_continue:
 1024     lgr $t1,$s3         # temp=rk[11]
 1025     srlg    $i1,$s3,8
 1026     srlg    $i2,$s3,16
 1027     srlg    $i3,$s3,24
 1028     nr  $t1,$mask
 1029     nr  $i1,$mask
 1030     nr  $i2,$mask
 1031     la  $t1,0($t1,$tbl)
 1032     la  $i1,0($i1,$tbl)
 1033     la  $i2,0($i2,$tbl)
 1034     la  $i3,0($i3,$tbl)
 1035     llgc    $t1,0($t1)      # Te4[rk[11]>>0]
 1036     icm $t1,2,0($i1)        # Te4[rk[11]>>8]<<8
 1037     icm $t1,4,0($i2)        # Te4[rk[11]>>16]<<16
 1038     icm $t1,8,0($i3)        # Te4[rk[11]>>24]<<24
 1039     x   $t1,16($key)        # rk[12]=rk[4]^...
 1040     st  $t1,48($key)
 1041     x   $t1,20($key)        # rk[13]=rk[5]^rk[12]
 1042     st  $t1,52($key)
 1043     x   $t1,24($key)        # rk[14]=rk[6]^rk[13]
 1044     st  $t1,56($key)
 1045     x   $t1,28($key)        # rk[15]=rk[7]^rk[14]
 1046     st  $t1,60($key)
 1047 
 1048     srlg    $i1,$t1,8
 1049     srlg    $i2,$t1,16
 1050     srlg    $i3,$t1,24
 1051     nr  $t1,$mask
 1052     nr  $i1,$mask
 1053     nr  $i2,$mask
 1054 
 1055     la  $key,32($key)       # key+=8
 1056     la  $t3,4($t3)      # i++
 1057     j   .L256_loop
 1058 
 1059 .Lminus1:
 1060     lghi    %r2,-1
 1061     br  $ra
 1062 .size   private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
 1063 
 1064 # void AES_set_decrypt_key(const unsigned char *in, int bits,
 1065 #        AES_KEY *key) {
 1066 .globl  private_AES_set_decrypt_key
 1067 .type   private_AES_set_decrypt_key,\@function
 1068 .align  16
 1069 private_AES_set_decrypt_key:
 1070     #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
 1071     st${g}  $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
 1072     bras    $ra,_s390x_AES_set_encrypt_key
 1073     #l${g}  $key,4*$SIZE_T($sp)
 1074     l${g}   $ra,14*$SIZE_T($sp)
 1075     ltgr    %r2,%r2
 1076     bnzr    $ra
 1077 ___
 1078 $code.=<<___ if (!$softonly);
 1079     #l  $t0,240($key)
 1080     lhi $t1,16
 1081     cr  $t0,$t1
 1082     jl  .Lgo
 1083     oill    $t0,0x80    # set "decrypt" bit
 1084     st  $t0,240($key)
 1085     br  $ra
 1086 ___
 1087 $code.=<<___;
 1088 .align  16
 1089 .Lgo:   lgr $rounds,$t0 #llgf   $rounds,240($key)
 1090     la  $i1,0($key)
 1091     sllg    $i2,$rounds,4
 1092     la  $i2,0($i2,$key)
 1093     srl $rounds,1
 1094     lghi    $t1,-16
 1095 
 1096 .align  16
 1097 .Linv:  lmg $s0,$s1,0($i1)
 1098     lmg $s2,$s3,0($i2)
 1099     stmg    $s0,$s1,0($i2)
 1100     stmg    $s2,$s3,0($i1)
 1101     la  $i1,16($i1)
 1102     la  $i2,0($t1,$i2)
 1103     brct    $rounds,.Linv
 1104 ___
 1105 $mask80=$i1;
 1106 $mask1b=$i2;
 1107 $maskfe=$i3;
 1108 $code.=<<___;
 1109     llgf    $rounds,240($key)
 1110     aghi    $rounds,-1
 1111     sll $rounds,2   # (rounds-1)*4
 1112     llilh   $mask80,0x8080
 1113     llilh   $mask1b,0x1b1b
 1114     llilh   $maskfe,0xfefe
 1115     oill    $mask80,0x8080
 1116     oill    $mask1b,0x1b1b
 1117     oill    $maskfe,0xfefe
 1118 
 1119 .align  16
 1120 .Lmix:  l   $s0,16($key)    # tp1
 1121     lr  $s1,$s0
 1122     ngr $s1,$mask80
 1123     srlg    $t1,$s1,7
 1124     slr $s1,$t1
 1125     nr  $s1,$mask1b
 1126     sllg    $t1,$s0,1
 1127     nr  $t1,$maskfe
 1128     xr  $s1,$t1     # tp2
 1129 
 1130     lr  $s2,$s1
 1131     ngr $s2,$mask80
 1132     srlg    $t1,$s2,7
 1133     slr $s2,$t1
 1134     nr  $s2,$mask1b
 1135     sllg    $t1,$s1,1
 1136     nr  $t1,$maskfe
 1137     xr  $s2,$t1     # tp4
 1138 
 1139     lr  $s3,$s2
 1140     ngr $s3,$mask80
 1141     srlg    $t1,$s3,7
 1142     slr $s3,$t1
 1143     nr  $s3,$mask1b
 1144     sllg    $t1,$s2,1
 1145     nr  $t1,$maskfe
 1146     xr  $s3,$t1     # tp8
 1147 
 1148     xr  $s1,$s0     # tp2^tp1
 1149     xr  $s2,$s0     # tp4^tp1
 1150     rll $s0,$s0,24  # = ROTATE(tp1,8)
 1151     xr  $s2,$s3     # ^=tp8
 1152     xr  $s0,$s1     # ^=tp2^tp1
 1153     xr  $s1,$s3     # tp2^tp1^tp8
 1154     xr  $s0,$s2     # ^=tp4^tp1^tp8
 1155     rll $s1,$s1,8
 1156     rll $s2,$s2,16
 1157     xr  $s0,$s1     # ^= ROTATE(tp8^tp2^tp1,24)
 1158     rll $s3,$s3,24
 1159     xr  $s0,$s2     # ^= ROTATE(tp8^tp4^tp1,16)
 1160     xr  $s0,$s3     # ^= ROTATE(tp8,8)
 1161 
 1162     st  $s0,16($key)
 1163     la  $key,4($key)
 1164     brct    $rounds,.Lmix
 1165 
 1166     lm${g}  %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
 1167     lghi    %r2,0
 1168     br  $ra
 1169 .size   private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
 1170 ___
 1171 
 1172 ########################################################################
 1173 # void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
 1174 #                     size_t length, const AES_KEY *key,
 1175 #                     unsigned char *ivec, const int enc)
 1176 {
 1177 my $inp="%r2";
 1178 my $out="%r4";  # length and out are swapped
 1179 my $len="%r3";
 1180 my $key="%r5";
 1181 my $ivp="%r6";
 1182 
 1183 $code.=<<___;
 1184 .globl  AES_cbc_encrypt
 1185 .type   AES_cbc_encrypt,\@function
 1186 .align  16
 1187 AES_cbc_encrypt:
 1188     xgr %r3,%r4     # flip %r3 and %r4, out and len
 1189     xgr %r4,%r3
 1190     xgr %r3,%r4
 1191 ___
 1192 $code.=<<___ if (!$softonly);
 1193     lhi %r0,16
 1194     cl  %r0,240($key)
 1195     jh  .Lcbc_software
 1196 
 1197     lg  %r0,0($ivp) # copy ivec
 1198     lg  %r1,8($ivp)
 1199     stmg    %r0,%r1,16($sp)
 1200     lmg %r0,%r1,0($key) # copy key, cover 256 bit
 1201     stmg    %r0,%r1,32($sp)
 1202     lmg %r0,%r1,16($key)
 1203     stmg    %r0,%r1,48($sp)
 1204     l   %r0,240($key)   # load kmc code
 1205     lghi    $key,15     # res=len%16, len-=res;
 1206     ngr $key,$len
 1207     sl${g}r $len,$key
 1208     la  %r1,16($sp) # parameter block - ivec || key
 1209     jz  .Lkmc_truncated
 1210     .long   0xb92f0042  # kmc %r4,%r2
 1211     brc 1,.-4       # pay attention to "partial completion"
 1212     ltr $key,$key
 1213     jnz .Lkmc_truncated
 1214 .Lkmc_done:
 1215     lmg %r0,%r1,16($sp) # copy ivec to caller
 1216     stg %r0,0($ivp)
 1217     stg %r1,8($ivp)
 1218     br  $ra
 1219 .align  16
 1220 .Lkmc_truncated:
 1221     ahi $key,-1     # it's the way it's encoded in mvc
 1222     tmll    %r0,0x80
 1223     jnz .Lkmc_truncated_dec
 1224     lghi    %r1,0
 1225     stg %r1,16*$SIZE_T($sp)
 1226     stg %r1,16*$SIZE_T+8($sp)
 1227     bras    %r1,1f
 1228     mvc 16*$SIZE_T(1,$sp),0($inp)
 1229 1:  ex  $key,0(%r1)
 1230     la  %r1,16($sp) # restore parameter block
 1231     la  $inp,16*$SIZE_T($sp)
 1232     lghi    $len,16
 1233     .long   0xb92f0042  # kmc %r4,%r2
 1234     j   .Lkmc_done
 1235 .align  16
 1236 .Lkmc_truncated_dec:
 1237     st${g}  $out,4*$SIZE_T($sp)
 1238     la  $out,16*$SIZE_T($sp)
 1239     lghi    $len,16
 1240     .long   0xb92f0042  # kmc %r4,%r2
 1241     l${g}   $out,4*$SIZE_T($sp)
 1242     bras    %r1,2f
 1243     mvc 0(1,$out),16*$SIZE_T($sp)
 1244 2:  ex  $key,0(%r1)
 1245     j   .Lkmc_done
 1246 .align  16
 1247 .Lcbc_software:
 1248 ___
 1249 $code.=<<___;
 1250     stm${g} $key,$ra,5*$SIZE_T($sp)
 1251     lhi %r0,0
 1252     cl  %r0,`$stdframe+$SIZE_T-4`($sp)
 1253     je  .Lcbc_decrypt
 1254 
 1255     larl    $tbl,AES_Te
 1256 
 1257     llgf    $s0,0($ivp)
 1258     llgf    $s1,4($ivp)
 1259     llgf    $s2,8($ivp)
 1260     llgf    $s3,12($ivp)
 1261 
 1262     lghi    $t0,16
 1263     sl${g}r $len,$t0
 1264     brc 4,.Lcbc_enc_tail    # if borrow
 1265 .Lcbc_enc_loop:
 1266     stm${g} $inp,$out,2*$SIZE_T($sp)
 1267     x   $s0,0($inp)
 1268     x   $s1,4($inp)
 1269     x   $s2,8($inp)
 1270     x   $s3,12($inp)
 1271     lgr %r4,$key
 1272 
 1273     bras    $ra,_s390x_AES_encrypt
 1274 
 1275     lm${g}  $inp,$key,2*$SIZE_T($sp)
 1276     st  $s0,0($out)
 1277     st  $s1,4($out)
 1278     st  $s2,8($out)
 1279     st  $s3,12($out)
 1280 
 1281     la  $inp,16($inp)
 1282     la  $out,16($out)
 1283     lghi    $t0,16
 1284     lt${g}r $len,$len
 1285     jz  .Lcbc_enc_done
 1286     sl${g}r $len,$t0
 1287     brc 4,.Lcbc_enc_tail    # if borrow
 1288     j   .Lcbc_enc_loop
 1289 .align  16
 1290 .Lcbc_enc_done:
 1291     l${g}   $ivp,6*$SIZE_T($sp)
 1292     st  $s0,0($ivp)
 1293     st  $s1,4($ivp) 
 1294     st  $s2,8($ivp)
 1295     st  $s3,12($ivp)
 1296 
 1297     lm${g}  %r7,$ra,7*$SIZE_T($sp)
 1298     br  $ra
 1299 
 1300 .align  16
 1301 .Lcbc_enc_tail:
 1302     aghi    $len,15
 1303     lghi    $t0,0
 1304     stg $t0,16*$SIZE_T($sp)
 1305     stg $t0,16*$SIZE_T+8($sp)
 1306     bras    $t1,3f
 1307     mvc 16*$SIZE_T(1,$sp),0($inp)
 1308 3:  ex  $len,0($t1)
 1309     lghi    $len,0
 1310     la  $inp,16*$SIZE_T($sp)
 1311     j   .Lcbc_enc_loop
 1312 
 1313 .align  16
 1314 .Lcbc_decrypt:
 1315     larl    $tbl,AES_Td
 1316 
 1317     lg  $t0,0($ivp)
 1318     lg  $t1,8($ivp)
 1319     stmg    $t0,$t1,16*$SIZE_T($sp)
 1320 
 1321 .Lcbc_dec_loop:
 1322     stm${g} $inp,$out,2*$SIZE_T($sp)
 1323     llgf    $s0,0($inp)
 1324     llgf    $s1,4($inp)
 1325     llgf    $s2,8($inp)
 1326     llgf    $s3,12($inp)
 1327     lgr %r4,$key
 1328 
 1329     bras    $ra,_s390x_AES_decrypt
 1330 
 1331     lm${g}  $inp,$key,2*$SIZE_T($sp)
 1332     sllg    $s0,$s0,32
 1333     sllg    $s2,$s2,32
 1334     lr  $s0,$s1
 1335     lr  $s2,$s3
 1336 
 1337     lg  $t0,0($inp)
 1338     lg  $t1,8($inp)
 1339     xg  $s0,16*$SIZE_T($sp)
 1340     xg  $s2,16*$SIZE_T+8($sp)
 1341     lghi    $s1,16
 1342     sl${g}r $len,$s1
 1343     brc 4,.Lcbc_dec_tail    # if borrow
 1344     brc 2,.Lcbc_dec_done    # if zero
 1345     stg $s0,0($out)
 1346     stg $s2,8($out)
 1347     stmg    $t0,$t1,16*$SIZE_T($sp)
 1348 
 1349     la  $inp,16($inp)
 1350     la  $out,16($out)
 1351     j   .Lcbc_dec_loop
 1352 
 1353 .Lcbc_dec_done:
 1354     stg $s0,0($out)
 1355     stg $s2,8($out)
 1356 .Lcbc_dec_exit:
 1357     lm${g}  %r6,$ra,6*$SIZE_T($sp)
 1358     stmg    $t0,$t1,0($ivp)
 1359 
 1360     br  $ra
 1361 
 1362 .align  16
 1363 .Lcbc_dec_tail:
 1364     aghi    $len,15
 1365     stg $s0,16*$SIZE_T($sp)
 1366     stg $s2,16*$SIZE_T+8($sp)
 1367     bras    $s1,4f
 1368     mvc 0(1,$out),16*$SIZE_T($sp)
 1369 4:  ex  $len,0($s1)
 1370     j   .Lcbc_dec_exit
 1371 .size   AES_cbc_encrypt,.-AES_cbc_encrypt
 1372 ___
 1373 }
 1374 ########################################################################
 1375 # void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
 1376 #                     size_t blocks, const AES_KEY *key,
 1377 #                     const unsigned char *ivec)
 1378 {
 1379 my $inp="%r2";
 1380 my $out="%r4";  # blocks and out are swapped
 1381 my $len="%r3";
 1382 my $key="%r5";  my $iv0="%r5";
 1383 my $ivp="%r6";
 1384 my $fp ="%r7";
 1385 
 1386 $code.=<<___;
 1387 .globl  AES_ctr32_encrypt
 1388 .type   AES_ctr32_encrypt,\@function
 1389 .align  16
 1390 AES_ctr32_encrypt:
 1391     xgr %r3,%r4     # flip %r3 and %r4, $out and $len
 1392     xgr %r4,%r3
 1393     xgr %r3,%r4
 1394     llgfr   $len,$len   # safe in ctr32 subroutine even in 64-bit case
 1395 ___
 1396 $code.=<<___ if (!$softonly);
 1397     l   %r0,240($key)
 1398     lhi %r1,16
 1399     clr %r0,%r1
 1400     jl  .Lctr32_software
 1401 
 1402     stm${g} %r6,$s3,6*$SIZE_T($sp)
 1403 
 1404     slgr    $out,$inp
 1405     la  %r1,0($key) # %r1 is permanent copy of $key
 1406     lg  $iv0,0($ivp)    # load ivec
 1407     lg  $ivp,8($ivp)
 1408 
 1409     # prepare and allocate stack frame at the top of 4K page
 1410     # with 1K reserved for eventual signal handling
 1411     lghi    $s0,-1024-256-16# guarantee at least 256-bytes buffer
 1412     lghi    $s1,-4096
 1413     algr    $s0,$sp
 1414     lgr $fp,$sp
 1415     ngr $s0,$s1     # align at page boundary
 1416     slgr    $fp,$s0     # total buffer size
 1417     lgr $s2,$sp
 1418     lghi    $s1,1024+16 # sl[g]fi is extended-immediate facility
 1419     slgr    $fp,$s1     # deduct reservation to get usable buffer size
 1420     # buffer size is at lest 256 and at most 3072+256-16
 1421 
 1422     la  $sp,1024($s0)   # alloca
 1423     srlg    $fp,$fp,4   # convert bytes to blocks, minimum 16
 1424     st${g}  $s2,0($sp)  # back-chain
 1425     st${g}  $fp,$SIZE_T($sp)
 1426 
 1427     slgr    $len,$fp
 1428     brc 1,.Lctr32_hw_switch # not zero, no borrow
 1429     algr    $fp,$len    # input is shorter than allocated buffer
 1430     lghi    $len,0
 1431     st${g}  $fp,$SIZE_T($sp)
 1432 
 1433 .Lctr32_hw_switch:
 1434 ___
 1435 $code.=<<___ if (0);    ######### kmctr code was measured to be ~12% slower
 1436     larl    $s0,OPENSSL_s390xcap_P
 1437     lg  $s0,8($s0)
 1438     tmhh    $s0,0x0004  # check for message_security-assist-4
 1439     jz  .Lctr32_km_loop
 1440 
 1441     llgfr   $s0,%r0
 1442     lgr $s1,%r1
 1443     larl    %r1,OPENSSL_s390xcap_P
 1444     llihh   %r0,0x8000  # check if kmctr supports the function code
 1445     srlg    %r0,%r0,0($s0)
 1446     ng  %r0,64(%r1) # check kmctr capability vector
 1447     lgr %r0,$s0
 1448     lgr %r1,$s1
 1449     jz  .Lctr32_km_loop
 1450 
 1451 ####### kmctr code
 1452     algr    $out,$inp   # restore $out
 1453     lgr $s1,$len    # $s1 undertakes $len
 1454     j   .Lctr32_kmctr_loop
 1455 .align  16
 1456 .Lctr32_kmctr_loop:
 1457     la  $s2,16($sp)
 1458     lgr $s3,$fp
 1459 .Lctr32_kmctr_prepare:
 1460     stg $iv0,0($s2)
 1461     stg $ivp,8($s2)
 1462     la  $s2,16($s2)
 1463     ahi $ivp,1      # 32-bit increment, preserves upper half
 1464     brct    $s3,.Lctr32_kmctr_prepare
 1465 
 1466     #la $inp,0($inp)    # inp
 1467     sllg    $len,$fp,4  # len
 1468     #la $out,0($out)    # out
 1469     la  $s2,16($sp) # iv
 1470     .long   0xb92da042  # kmctr $out,$s2,$inp
 1471     brc 1,.-4       # pay attention to "partial completion"
 1472 
 1473     slgr    $s1,$fp
 1474     brc 1,.Lctr32_kmctr_loop    # not zero, no borrow
 1475     algr    $fp,$s1
 1476     lghi    $s1,0
 1477     brc 4+1,.Lctr32_kmctr_loop  # not zero
 1478 
 1479     l${g}   $sp,0($sp)
 1480     lm${g}  %r6,$s3,6*$SIZE_T($sp)
 1481     br  $ra
 1482 .align  16
 1483 ___
 1484 $code.=<<___;
 1485 .Lctr32_km_loop:
 1486     la  $s2,16($sp)
 1487     lgr $s3,$fp
 1488 .Lctr32_km_prepare:
 1489     stg $iv0,0($s2)
 1490     stg $ivp,8($s2)
 1491     la  $s2,16($s2)
 1492     ahi $ivp,1      # 32-bit increment, preserves upper half
 1493     brct    $s3,.Lctr32_km_prepare
 1494 
 1495     la  $s0,16($sp) # inp
 1496     sllg    $s1,$fp,4   # len
 1497     la  $s2,16($sp) # out
 1498     .long   0xb92e00a8  # km %r10,%r8
 1499     brc 1,.-4       # pay attention to "partial completion"
 1500 
 1501     la  $s2,16($sp)
 1502     lgr $s3,$fp
 1503     slgr    $s2,$inp
 1504 .Lctr32_km_xor:
 1505     lg  $s0,0($inp)
 1506     lg  $s1,8($inp)
 1507     xg  $s0,0($s2,$inp)
 1508     xg  $s1,8($s2,$inp)
 1509     stg $s0,0($out,$inp)
 1510     stg $s1,8($out,$inp)
 1511     la  $inp,16($inp)
 1512     brct    $s3,.Lctr32_km_xor
 1513 
 1514     slgr    $len,$fp
 1515     brc 1,.Lctr32_km_loop   # not zero, no borrow
 1516     algr    $fp,$len
 1517     lghi    $len,0
 1518     brc 4+1,.Lctr32_km_loop # not zero
 1519 
 1520     l${g}   $s0,0($sp)
 1521     l${g}   $s1,$SIZE_T($sp)
 1522     la  $s2,16($sp)
 1523 .Lctr32_km_zap:
 1524     stg $s0,0($s2)
 1525     stg $s0,8($s2)
 1526     la  $s2,16($s2)
 1527     brct    $s1,.Lctr32_km_zap
 1528 
 1529     la  $sp,0($s0)
 1530     lm${g}  %r6,$s3,6*$SIZE_T($sp)
 1531     br  $ra
 1532 .align  16
 1533 .Lctr32_software:
 1534 ___
 1535 $code.=<<___;
 1536     stm${g} $key,$ra,5*$SIZE_T($sp)
 1537     sl${g}r $inp,$out
 1538     larl    $tbl,AES_Te
 1539     llgf    $t1,12($ivp)
 1540 
 1541 .Lctr32_loop:
 1542     stm${g} $inp,$out,2*$SIZE_T($sp)
 1543     llgf    $s0,0($ivp)
 1544     llgf    $s1,4($ivp)
 1545     llgf    $s2,8($ivp)
 1546     lgr $s3,$t1
 1547     st  $t1,16*$SIZE_T($sp)
 1548     lgr %r4,$key
 1549 
 1550     bras    $ra,_s390x_AES_encrypt
 1551 
 1552     lm${g}  $inp,$ivp,2*$SIZE_T($sp)
 1553     llgf    $t1,16*$SIZE_T($sp)
 1554     x   $s0,0($inp,$out)
 1555     x   $s1,4($inp,$out)
 1556     x   $s2,8($inp,$out)
 1557     x   $s3,12($inp,$out)
 1558     stm $s0,$s3,0($out)
 1559 
 1560     la  $out,16($out)
 1561     ahi $t1,1       # 32-bit increment
 1562     brct    $len,.Lctr32_loop
 1563 
 1564     lm${g}  %r6,$ra,6*$SIZE_T($sp)
 1565     br  $ra
 1566 .size   AES_ctr32_encrypt,.-AES_ctr32_encrypt
 1567 ___
 1568 }
 1569 
 1570 ########################################################################
 1571 # void AES_xts_encrypt(const unsigned char *inp, unsigned char *out,
 1572 #   size_t len, const AES_KEY *key1, const AES_KEY *key2,
 1573 #   const unsigned char iv[16]);
 1574 #
 1575 {
 1576 my $inp="%r2";
 1577 my $out="%r4";  # len and out are swapped
 1578 my $len="%r3";
 1579 my $key1="%r5"; # $i1
 1580 my $key2="%r6"; # $i2
 1581 my $fp="%r7";   # $i3
 1582 my $tweak=16*$SIZE_T+16;    # or $stdframe-16, bottom of the frame...
 1583 
 1584 $code.=<<___;
 1585 .type   _s390x_xts_km,\@function
 1586 .align  16
 1587 _s390x_xts_km:
 1588 ___
 1589 $code.=<<___ if(1);
 1590     llgfr   $s0,%r0         # put aside the function code
 1591     lghi    $s1,0x7f
 1592     nr  $s1,%r0
 1593     larl    %r1,OPENSSL_s390xcap_P
 1594     llihh   %r0,0x8000
 1595     srlg    %r0,%r0,32($s1)     # check for 32+function code
 1596     ng  %r0,32(%r1)     # check km capability vector
 1597     lgr %r0,$s0         # restore the function code
 1598     la  %r1,0($key1)        # restore $key1
 1599     jz  .Lxts_km_vanilla
 1600 
 1601     lmg $i2,$i3,$tweak($sp) # put aside the tweak value
 1602     algr    $out,$inp
 1603 
 1604     oill    %r0,32          # switch to xts function code
 1605     aghi    $s1,-18         #
 1606     sllg    $s1,$s1,3       # (function code - 18)*8, 0 or 16
 1607     la  %r1,$tweak-16($sp)
 1608     slgr    %r1,$s1         # parameter block position
 1609     lmg $s0,$s3,0($key1)    # load 256 bits of key material,
 1610     stmg    $s0,$s3,0(%r1)      # and copy it to parameter block.
 1611                     # yes, it contains junk and overlaps
 1612                     # with the tweak in 128-bit case.
 1613                     # it's done to avoid conditional
 1614                     # branch.
 1615     stmg    $i2,$i3,$tweak($sp) # "re-seat" the tweak value
 1616 
 1617     .long   0xb92e0042      # km %r4,%r2
 1618     brc 1,.-4           # pay attention to "partial completion"
 1619 
 1620     lrvg    $s0,$tweak+0($sp)   # load the last tweak
 1621     lrvg    $s1,$tweak+8($sp)
 1622     stmg    %r0,%r3,$tweak-32($sp)  # wipe copy of the key
 1623 
 1624     nill    %r0,0xffdf      # switch back to original function code
 1625     la  %r1,0($key1)        # restore pointer to $key1
 1626     slgr    $out,$inp
 1627 
 1628     llgc    $len,2*$SIZE_T-1($sp)
 1629     nill    $len,0x0f       # $len%=16
 1630     br  $ra
 1631     
 1632 .align  16
 1633 .Lxts_km_vanilla:
 1634 ___
 1635 $code.=<<___;
 1636     # prepare and allocate stack frame at the top of 4K page
 1637     # with 1K reserved for eventual signal handling
 1638     lghi    $s0,-1024-256-16# guarantee at least 256-bytes buffer
 1639     lghi    $s1,-4096
 1640     algr    $s0,$sp
 1641     lgr $fp,$sp
 1642     ngr $s0,$s1     # align at page boundary
 1643     slgr    $fp,$s0     # total buffer size
 1644     lgr $s2,$sp
 1645     lghi    $s1,1024+16 # sl[g]fi is extended-immediate facility
 1646     slgr    $fp,$s1     # deduct reservation to get usable buffer size
 1647     # buffer size is at lest 256 and at most 3072+256-16
 1648 
 1649     la  $sp,1024($s0)   # alloca
 1650     nill    $fp,0xfff0  # round to 16*n
 1651     st${g}  $s2,0($sp)  # back-chain
 1652     nill    $len,0xfff0 # redundant
 1653     st${g}  $fp,$SIZE_T($sp)
 1654 
 1655     slgr    $len,$fp
 1656     brc 1,.Lxts_km_go   # not zero, no borrow
 1657     algr    $fp,$len    # input is shorter than allocated buffer
 1658     lghi    $len,0
 1659     st${g}  $fp,$SIZE_T($sp)
 1660 
 1661 .Lxts_km_go:
 1662     lrvg    $s0,$tweak+0($s2)   # load the tweak value in little-endian
 1663     lrvg    $s1,$tweak+8($s2)
 1664 
 1665     la  $s2,16($sp)     # vector of ascending tweak values
 1666     slgr    $s2,$inp
 1667     srlg    $s3,$fp,4
 1668     j   .Lxts_km_start
 1669 
 1670 .Lxts_km_loop:
 1671     la  $s2,16($sp)
 1672     slgr    $s2,$inp
 1673     srlg    $s3,$fp,4
 1674 .Lxts_km_prepare:
 1675     lghi    $i1,0x87
 1676     srag    $i2,$s1,63      # broadcast upper bit
 1677     ngr $i1,$i2         # rem
 1678     algr    $s0,$s0
 1679     alcgr   $s1,$s1
 1680     xgr $s0,$i1
 1681 .Lxts_km_start:
 1682     lrvgr   $i1,$s0         # flip byte order
 1683     lrvgr   $i2,$s1
 1684     stg $i1,0($s2,$inp)
 1685     stg $i2,8($s2,$inp)
 1686     xg  $i1,0($inp)
 1687     xg  $i2,8($inp)
 1688     stg $i1,0($out,$inp)
 1689     stg $i2,8($out,$inp)
 1690     la  $inp,16($inp)
 1691     brct    $s3,.Lxts_km_prepare
 1692 
 1693     slgr    $inp,$fp        # rewind $inp
 1694     la  $s2,0($out,$inp)
 1695     lgr $s3,$fp
 1696     .long   0xb92e00aa      # km $s2,$s2
 1697     brc 1,.-4           # pay attention to "partial completion"
 1698 
 1699     la  $s2,16($sp)
 1700     slgr    $s2,$inp
 1701     srlg    $s3,$fp,4
 1702 .Lxts_km_xor:
 1703     lg  $i1,0($out,$inp)
 1704     lg  $i2,8($out,$inp)
 1705     xg  $i1,0($s2,$inp)
 1706     xg  $i2,8($s2,$inp)
 1707     stg $i1,0($out,$inp)
 1708     stg $i2,8($out,$inp)
 1709     la  $inp,16($inp)
 1710     brct    $s3,.Lxts_km_xor
 1711 
 1712     slgr    $len,$fp
 1713     brc 1,.Lxts_km_loop     # not zero, no borrow
 1714     algr    $fp,$len
 1715     lghi    $len,0
 1716     brc 4+1,.Lxts_km_loop   # not zero
 1717 
 1718     l${g}   $i1,0($sp)      # back-chain
 1719     llgf    $fp,`2*$SIZE_T-4`($sp)  # bytes used
 1720     la  $i2,16($sp)
 1721     srlg    $fp,$fp,4
 1722 .Lxts_km_zap:
 1723     stg $i1,0($i2)
 1724     stg $i1,8($i2)
 1725     la  $i2,16($i2)
 1726     brct    $fp,.Lxts_km_zap
 1727 
 1728     la  $sp,0($i1)
 1729     llgc    $len,2*$SIZE_T-1($i1)
 1730     nill    $len,0x0f       # $len%=16
 1731     bzr $ra
 1732 
 1733     # generate one more tweak...
 1734     lghi    $i1,0x87
 1735     srag    $i2,$s1,63      # broadcast upper bit
 1736     ngr $i1,$i2         # rem
 1737     algr    $s0,$s0
 1738     alcgr   $s1,$s1
 1739     xgr $s0,$i1
 1740 
 1741     ltr $len,$len       # clear zero flag
 1742     br  $ra
 1743 .size   _s390x_xts_km,.-_s390x_xts_km
 1744 
 1745 .globl  AES_xts_encrypt
 1746 .type   AES_xts_encrypt,\@function
 1747 .align  16
 1748 AES_xts_encrypt:
 1749     xgr %r3,%r4         # flip %r3 and %r4, $out and $len
 1750     xgr %r4,%r3
 1751     xgr %r3,%r4
 1752 ___
 1753 $code.=<<___ if ($SIZE_T==4);
 1754     llgfr   $len,$len
 1755 ___
 1756 $code.=<<___;
 1757     st${g}  $len,1*$SIZE_T($sp) # save copy of $len
 1758     srag    $len,$len,4     # formally wrong, because it expands
 1759                     # sign byte, but who can afford asking
 1760                     # to process more than 2^63-1 bytes?
 1761                     # I use it, because it sets condition
 1762                     # code...
 1763     bcr 8,$ra           # abort if zero (i.e. less than 16)
 1764 ___
 1765 $code.=<<___ if (!$softonly);
 1766     llgf    %r0,240($key2)
 1767     lhi %r1,16
 1768     clr %r0,%r1
 1769     jl  .Lxts_enc_software
 1770 
 1771     st${g}  $ra,5*$SIZE_T($sp)
 1772     stm${g} %r6,$s3,6*$SIZE_T($sp)
 1773 
 1774     sllg    $len,$len,4     # $len&=~15
 1775     slgr    $out,$inp
 1776 
 1777     # generate the tweak value
 1778     l${g}   $s3,$stdframe($sp)  # pointer to iv
 1779     la  $s2,$tweak($sp)
 1780     lmg $s0,$s1,0($s3)
 1781     lghi    $s3,16
 1782     stmg    $s0,$s1,0($s2)
 1783     la  %r1,0($key2)        # $key2 is not needed anymore
 1784     .long   0xb92e00aa      # km $s2,$s2, generate the tweak
 1785     brc 1,.-4           # can this happen?
 1786 
 1787     l   %r0,240($key1)
 1788     la  %r1,0($key1)        # $key1 is not needed anymore
 1789     bras    $ra,_s390x_xts_km
 1790     jz  .Lxts_enc_km_done
 1791 
 1792     aghi    $inp,-16        # take one step back
 1793     la  $i3,0($out,$inp)    # put aside real $out
 1794 .Lxts_enc_km_steal:
 1795     llgc    $i1,16($inp)
 1796     llgc    $i2,0($out,$inp)
 1797     stc $i1,0($out,$inp)
 1798     stc $i2,16($out,$inp)
 1799     la  $inp,1($inp)
 1800     brct    $len,.Lxts_enc_km_steal
 1801 
 1802     la  $s2,0($i3)
 1803     lghi    $s3,16
 1804     lrvgr   $i1,$s0         # flip byte order
 1805     lrvgr   $i2,$s1
 1806     xg  $i1,0($s2)
 1807     xg  $i2,8($s2)
 1808     stg $i1,0($s2)
 1809     stg $i2,8($s2)
 1810     .long   0xb92e00aa      # km $s2,$s2
 1811     brc 1,.-4           # can this happen?
 1812     lrvgr   $i1,$s0         # flip byte order
 1813     lrvgr   $i2,$s1
 1814     xg  $i1,0($i3)
 1815     xg  $i2,8($i3)
 1816     stg $i1,0($i3)
 1817     stg $i2,8($i3)
 1818 
 1819 .Lxts_enc_km_done:
 1820     stg $sp,$tweak+0($sp)   # wipe tweak
 1821     stg $sp,$tweak+8($sp)
 1822     l${g}   $ra,5*$SIZE_T($sp)
 1823     lm${g}  %r6,$s3,6*$SIZE_T($sp)
 1824     br  $ra
 1825 .align  16
 1826 .Lxts_enc_software:
 1827 ___
 1828 $code.=<<___;
 1829     stm${g} %r6,$ra,6*$SIZE_T($sp)
 1830 
 1831     slgr    $out,$inp
 1832 
 1833     l${g}   $s3,$stdframe($sp)  # ivp
 1834     llgf    $s0,0($s3)      # load iv
 1835     llgf    $s1,4($s3)
 1836     llgf    $s2,8($s3)
 1837     llgf    $s3,12($s3)
 1838     stm${g} %r2,%r5,2*$SIZE_T($sp)
 1839     la  $key,0($key2)
 1840     larl    $tbl,AES_Te
 1841     bras    $ra,_s390x_AES_encrypt  # generate the tweak
 1842     lm${g}  %r2,%r5,2*$SIZE_T($sp)
 1843     stm $s0,$s3,$tweak($sp) # save the tweak
 1844     j   .Lxts_enc_enter
 1845 
 1846 .align  16
 1847 .Lxts_enc_loop:
 1848     lrvg    $s1,$tweak+0($sp)   # load the tweak in little-endian
 1849     lrvg    $s3,$tweak+8($sp)
 1850     lghi    %r1,0x87
 1851     srag    %r0,$s3,63      # broadcast upper bit
 1852     ngr %r1,%r0         # rem
 1853     algr    $s1,$s1
 1854     alcgr   $s3,$s3
 1855     xgr $s1,%r1
 1856     lrvgr   $s1,$s1         # flip byte order
 1857     lrvgr   $s3,$s3
 1858     srlg    $s0,$s1,32      # smash the tweak to 4x32-bits 
 1859     stg $s1,$tweak+0($sp)   # save the tweak
 1860     llgfr   $s1,$s1
 1861     srlg    $s2,$s3,32
 1862     stg $s3,$tweak+8($sp)
 1863     llgfr   $s3,$s3
 1864     la  $inp,16($inp)       # $inp+=16
 1865 .Lxts_enc_enter:
 1866     x   $s0,0($inp)     # ^=*($inp)
 1867     x   $s1,4($inp)
 1868     x   $s2,8($inp)
 1869     x   $s3,12($inp)
 1870     stm${g} %r2,%r3,2*$SIZE_T($sp)  # only two registers are changing
 1871     la  $key,0($key1)
 1872     bras    $ra,_s390x_AES_encrypt
 1873     lm${g}  %r2,%r5,2*$SIZE_T($sp)
 1874     x   $s0,$tweak+0($sp)   # ^=tweak
 1875     x   $s1,$tweak+4($sp)
 1876     x   $s2,$tweak+8($sp)
 1877     x   $s3,$tweak+12($sp)
 1878     st  $s0,0($out,$inp)
 1879     st  $s1,4($out,$inp)
 1880     st  $s2,8($out,$inp)
 1881     st  $s3,12($out,$inp)
 1882     brct${g}    $len,.Lxts_enc_loop
 1883 
 1884     llgc    $len,`2*$SIZE_T-1`($sp)
 1885     nill    $len,0x0f       # $len%16
 1886     jz  .Lxts_enc_done
 1887 
 1888     la  $i3,0($inp,$out)    # put aside real $out
 1889 .Lxts_enc_steal:
 1890     llgc    %r0,16($inp)
 1891     llgc    %r1,0($out,$inp)
 1892     stc %r0,0($out,$inp)
 1893     stc %r1,16($out,$inp)
 1894     la  $inp,1($inp)
 1895     brct    $len,.Lxts_enc_steal
 1896     la  $out,0($i3)     # restore real $out
 1897 
 1898     # generate last tweak...
 1899     lrvg    $s1,$tweak+0($sp)   # load the tweak in little-endian
 1900     lrvg    $s3,$tweak+8($sp)
 1901     lghi    %r1,0x87
 1902     srag    %r0,$s3,63      # broadcast upper bit
 1903     ngr %r1,%r0         # rem
 1904     algr    $s1,$s1
 1905     alcgr   $s3,$s3
 1906     xgr $s1,%r1
 1907     lrvgr   $s1,$s1         # flip byte order
 1908     lrvgr   $s3,$s3
 1909     srlg    $s0,$s1,32      # smash the tweak to 4x32-bits 
 1910     stg $s1,$tweak+0($sp)   # save the tweak
 1911     llgfr   $s1,$s1
 1912     srlg    $s2,$s3,32
 1913     stg $s3,$tweak+8($sp)
 1914     llgfr   $s3,$s3
 1915 
 1916     x   $s0,0($out)     # ^=*(inp)|stolen cipther-text
 1917     x   $s1,4($out)
 1918     x   $s2,8($out)
 1919     x   $s3,12($out)
 1920     st${g}  $out,4*$SIZE_T($sp)
 1921     la  $key,0($key1)
 1922     bras    $ra,_s390x_AES_encrypt
 1923     l${g}   $out,4*$SIZE_T($sp)
 1924     x   $s0,`$tweak+0`($sp) # ^=tweak
 1925     x   $s1,`$tweak+4`($sp)
 1926     x   $s2,`$tweak+8`($sp)
 1927     x   $s3,`$tweak+12`($sp)
 1928     st  $s0,0($out)
 1929     st  $s1,4($out)
 1930     st  $s2,8($out)
 1931     st  $s3,12($out)
 1932 
 1933 .Lxts_enc_done:
 1934     stg $sp,$tweak+0($sp)   # wipe tweak
 1935     stg $sp,$twesk+8($sp)
 1936     lm${g}  %r6,$ra,6*$SIZE_T($sp)
 1937     br  $ra
 1938 .size   AES_xts_encrypt,.-AES_xts_encrypt
 1939 ___
 1940 # void AES_xts_decrypt(const unsigned char *inp, unsigned char *out,
 1941 #   size_t len, const AES_KEY *key1, const AES_KEY *key2,
 1942 #   const unsigned char iv[16]);
 1943 #
 1944 $code.=<<___;
 1945 .globl  AES_xts_decrypt
 1946 .type   AES_xts_decrypt,\@function
 1947 .align  16
 1948 AES_xts_decrypt:
 1949     xgr %r3,%r4         # flip %r3 and %r4, $out and $len
 1950     xgr %r4,%r3
 1951     xgr %r3,%r4
 1952 ___
 1953 $code.=<<___ if ($SIZE_T==4);
 1954     llgfr   $len,$len
 1955 ___
 1956 $code.=<<___;
 1957     st${g}  $len,1*$SIZE_T($sp) # save copy of $len
 1958     aghi    $len,-16
 1959     bcr 4,$ra           # abort if less than zero. formally
 1960                     # wrong, because $len is unsigned,
 1961                     # but who can afford asking to
 1962                     # process more than 2^63-1 bytes?
 1963     tmll    $len,0x0f
 1964     jnz .Lxts_dec_proceed
 1965     aghi    $len,16
 1966 .Lxts_dec_proceed:
 1967 ___
 1968 $code.=<<___ if (!$softonly);
 1969     llgf    %r0,240($key2)
 1970     lhi %r1,16
 1971     clr %r0,%r1
 1972     jl  .Lxts_dec_software
 1973 
 1974     st${g}  $ra,5*$SIZE_T($sp)
 1975     stm${g} %r6,$s3,6*$SIZE_T($sp)
 1976 
 1977     nill    $len,0xfff0     # $len&=~15
 1978     slgr    $out,$inp
 1979 
 1980     # generate the tweak value
 1981     l${g}   $s3,$stdframe($sp)  # pointer to iv
 1982     la  $s2,$tweak($sp)
 1983     lmg $s0,$s1,0($s3)
 1984     lghi    $s3,16
 1985     stmg    $s0,$s1,0($s2)
 1986     la  %r1,0($key2)        # $key2 is not needed past this point
 1987     .long   0xb92e00aa      # km $s2,$s2, generate the tweak
 1988     brc 1,.-4           # can this happen?
 1989 
 1990     l   %r0,240($key1)
 1991     la  %r1,0($key1)        # $key1 is not needed anymore
 1992 
 1993     ltgr    $len,$len
 1994     jz  .Lxts_dec_km_short
 1995     bras    $ra,_s390x_xts_km
 1996     jz  .Lxts_dec_km_done
 1997 
 1998     lrvgr   $s2,$s0         # make copy in reverse byte order
 1999     lrvgr   $s3,$s1
 2000     j   .Lxts_dec_km_2ndtweak
 2001 
 2002 .Lxts_dec_km_short:
 2003     llgc    $len,`2*$SIZE_T-1`($sp)
 2004     nill    $len,0x0f       # $len%=16
 2005     lrvg    $s0,$tweak+0($sp)   # load the tweak
 2006     lrvg    $s1,$tweak+8($sp)
 2007     lrvgr   $s2,$s0         # make copy in reverse byte order
 2008     lrvgr   $s3,$s1
 2009 
 2010 .Lxts_dec_km_2ndtweak:
 2011     lghi    $i1,0x87
 2012     srag    $i2,$s1,63      # broadcast upper bit
 2013     ngr $i1,$i2         # rem
 2014     algr    $s0,$s0
 2015     alcgr   $s1,$s1
 2016     xgr $s0,$i1
 2017     lrvgr   $i1,$s0         # flip byte order
 2018     lrvgr   $i2,$s1
 2019 
 2020     xg  $i1,0($inp)
 2021     xg  $i2,8($inp)
 2022     stg $i1,0($out,$inp)
 2023     stg $i2,8($out,$inp)
 2024     la  $i2,0($out,$inp)
 2025     lghi    $i3,16
 2026     .long   0xb92e0066      # km $i2,$i2
 2027     brc 1,.-4           # can this happen?
 2028     lrvgr   $i1,$s0
 2029     lrvgr   $i2,$s1
 2030     xg  $i1,0($out,$inp)
 2031     xg  $i2,8($out,$inp)
 2032     stg $i1,0($out,$inp)
 2033     stg $i2,8($out,$inp)
 2034 
 2035     la  $i3,0($out,$inp)    # put aside real $out
 2036 .Lxts_dec_km_steal:
 2037     llgc    $i1,16($inp)
 2038     llgc    $i2,0($out,$inp)
 2039     stc $i1,0($out,$inp)
 2040     stc $i2,16($out,$inp)
 2041     la  $inp,1($inp)
 2042     brct    $len,.Lxts_dec_km_steal
 2043 
 2044     lgr $s0,$s2
 2045     lgr $s1,$s3
 2046     xg  $s0,0($i3)
 2047     xg  $s1,8($i3)
 2048     stg $s0,0($i3)
 2049     stg $s1,8($i3)
 2050     la  $s0,0($i3)
 2051     lghi    $s1,16
 2052     .long   0xb92e0088      # km $s0,$s0
 2053     brc 1,.-4           # can this happen?
 2054     xg  $s2,0($i3)
 2055     xg  $s3,8($i3)
 2056     stg $s2,0($i3)
 2057     stg $s3,8($i3)
 2058 .Lxts_dec_km_done:
 2059     stg $sp,$tweak+0($sp)   # wipe tweak
 2060     stg $sp,$tweak+8($sp)
 2061     l${g}   $ra,5*$SIZE_T($sp)
 2062     lm${g}  %r6,$s3,6*$SIZE_T($sp)
 2063     br  $ra
 2064 .align  16
 2065 .Lxts_dec_software:
 2066 ___
 2067 $code.=<<___;
 2068     stm${g} %r6,$ra,6*$SIZE_T($sp)
 2069 
 2070     srlg    $len,$len,4
 2071     slgr    $out,$inp
 2072 
 2073     l${g}   $s3,$stdframe($sp)  # ivp
 2074     llgf    $s0,0($s3)      # load iv
 2075     llgf    $s1,4($s3)
 2076     llgf    $s2,8($s3)
 2077     llgf    $s3,12($s3)
 2078     stm${g} %r2,%r5,2*$SIZE_T($sp)
 2079     la  $key,0($key2)
 2080     larl    $tbl,AES_Te
 2081     bras    $ra,_s390x_AES_encrypt  # generate the tweak
 2082     lm${g}  %r2,%r5,2*$SIZE_T($sp)
 2083     larl    $tbl,AES_Td
 2084     lt${g}r $len,$len
 2085     stm $s0,$s3,$tweak($sp) # save the tweak
 2086     jz  .Lxts_dec_short
 2087     j   .Lxts_dec_enter
 2088 
 2089 .align  16
 2090 .Lxts_dec_loop:
 2091     lrvg    $s1,$tweak+0($sp)   # load the tweak in little-endian
 2092     lrvg    $s3,$tweak+8($sp)
 2093     lghi    %r1,0x87
 2094     srag    %r0,$s3,63      # broadcast upper bit
 2095     ngr %r1,%r0         # rem
 2096     algr    $s1,$s1
 2097     alcgr   $s3,$s3
 2098     xgr $s1,%r1
 2099     lrvgr   $s1,$s1         # flip byte order
 2100     lrvgr   $s3,$s3
 2101     srlg    $s0,$s1,32      # smash the tweak to 4x32-bits 
 2102     stg $s1,$tweak+0($sp)   # save the tweak
 2103     llgfr   $s1,$s1
 2104     srlg    $s2,$s3,32
 2105     stg $s3,$tweak+8($sp)
 2106     llgfr   $s3,$s3
 2107 .Lxts_dec_enter:
 2108     x   $s0,0($inp)     # tweak^=*(inp)
 2109     x   $s1,4($inp)
 2110     x   $s2,8($inp)
 2111     x   $s3,12($inp)
 2112     stm${g} %r2,%r3,2*$SIZE_T($sp)  # only two registers are changing
 2113     la  $key,0($key1)
 2114     bras    $ra,_s390x_AES_decrypt
 2115     lm${g}  %r2,%r5,2*$SIZE_T($sp)
 2116     x   $s0,$tweak+0($sp)   # ^=tweak
 2117     x   $s1,$tweak+4($sp)
 2118     x   $s2,$tweak+8($sp)
 2119     x   $s3,$tweak+12($sp)
 2120     st  $s0,0($out,$inp)
 2121     st  $s1,4($out,$inp)
 2122     st  $s2,8($out,$inp)
 2123     st  $s3,12($out,$inp)
 2124     la  $inp,16($inp)
 2125     brct${g}    $len,.Lxts_dec_loop
 2126 
 2127     llgc    $len,`2*$SIZE_T-1`($sp)
 2128     nill    $len,0x0f       # $len%16
 2129     jz  .Lxts_dec_done
 2130 
 2131     # generate pair of tweaks...
 2132     lrvg    $s1,$tweak+0($sp)   # load the tweak in little-endian
 2133     lrvg    $s3,$tweak+8($sp)
 2134     lghi    %r1,0x87
 2135     srag    %r0,$s3,63      # broadcast upper bit
 2136     ngr %r1,%r0         # rem
 2137     algr    $s1,$s1
 2138     alcgr   $s3,$s3
 2139     xgr $s1,%r1
 2140     lrvgr   $i2,$s1         # flip byte order
 2141     lrvgr   $i3,$s3
 2142     stmg    $i2,$i3,$tweak($sp) # save the 1st tweak
 2143     j   .Lxts_dec_2ndtweak
 2144 
 2145 .align  16
 2146 .Lxts_dec_short:
 2147     llgc    $len,`2*$SIZE_T-1`($sp)
 2148     nill    $len,0x0f       # $len%16
 2149     lrvg    $s1,$tweak+0($sp)   # load the tweak in little-endian
 2150     lrvg    $s3,$tweak+8($sp)
 2151 .Lxts_dec_2ndtweak:
 2152     lghi    %r1,0x87
 2153     srag    %r0,$s3,63      # broadcast upper bit
 2154     ngr %r1,%r0         # rem
 2155     algr    $s1,$s1
 2156     alcgr   $s3,$s3
 2157     xgr $s1,%r1
 2158     lrvgr   $s1,$s1         # flip byte order
 2159     lrvgr   $s3,$s3
 2160     srlg    $s0,$s1,32      # smash the tweak to 4x32-bits
 2161     stg $s1,$tweak-16+0($sp)    # save the 2nd tweak
 2162     llgfr   $s1,$s1
 2163     srlg    $s2,$s3,32
 2164     stg $s3,$tweak-16+8($sp)
 2165     llgfr   $s3,$s3
 2166 
 2167     x   $s0,0($inp)     # tweak_the_2nd^=*(inp)
 2168     x   $s1,4($inp)
 2169     x   $s2,8($inp)
 2170     x   $s3,12($inp)
 2171     stm${g} %r2,%r3,2*$SIZE_T($sp)
 2172     la  $key,0($key1)
 2173     bras    $ra,_s390x_AES_decrypt
 2174     lm${g}  %r2,%r5,2*$SIZE_T($sp)
 2175     x   $s0,$tweak-16+0($sp)    # ^=tweak_the_2nd
 2176     x   $s1,$tweak-16+4($sp)
 2177     x   $s2,$tweak-16+8($sp)
 2178     x   $s3,$tweak-16+12($sp)
 2179     st  $s0,0($out,$inp)
 2180     st  $s1,4($out,$inp)
 2181     st  $s2,8($out,$inp)
 2182     st  $s3,12($out,$inp)
 2183 
 2184     la  $i3,0($out,$inp)    # put aside real $out
 2185 .Lxts_dec_steal:
 2186     llgc    %r0,16($inp)
 2187     llgc    %r1,0($out,$inp)
 2188     stc %r0,0($out,$inp)
 2189     stc %r1,16($out,$inp)
 2190     la  $inp,1($inp)
 2191     brct    $len,.Lxts_dec_steal
 2192     la  $out,0($i3)     # restore real $out
 2193 
 2194     lm  $s0,$s3,$tweak($sp) # load the 1st tweak
 2195     x   $s0,0($out)     # tweak^=*(inp)|stolen cipher-text
 2196     x   $s1,4($out)
 2197     x   $s2,8($out)
 2198     x   $s3,12($out)
 2199     st${g}  $out,4*$SIZE_T($sp)
 2200     la  $key,0($key1)
 2201     bras    $ra,_s390x_AES_decrypt
 2202     l${g}   $out,4*$SIZE_T($sp)
 2203     x   $s0,$tweak+0($sp)   # ^=tweak
 2204     x   $s1,$tweak+4($sp)
 2205     x   $s2,$tweak+8($sp)
 2206     x   $s3,$tweak+12($sp)
 2207     st  $s0,0($out)
 2208     st  $s1,4($out)
 2209     st  $s2,8($out)
 2210     st  $s3,12($out)
 2211     stg $sp,$tweak-16+0($sp)    # wipe 2nd tweak
 2212     stg $sp,$tweak-16+8($sp)
 2213 .Lxts_dec_done:
 2214     stg $sp,$tweak+0($sp)   # wipe tweak
 2215     stg $sp,$twesk+8($sp)
 2216     lm${g}  %r6,$ra,6*$SIZE_T($sp)
 2217     br  $ra
 2218 .size   AES_xts_decrypt,.-AES_xts_decrypt
 2219 ___
 2220 }
 2221 $code.=<<___;
 2222 .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
 2223 .comm   OPENSSL_s390xcap_P,80,8
 2224 ___
 2225 
 2226 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 2227 print $code;
 2228 close STDOUT;   # force flush