"Fossies" - the Fresh Open Source Software Archive

Member "src/Crypto/Aes_x64.asm" (10 Oct 2018, 26997 Bytes) of package /windows/misc/VeraCrypt_1.23-Hotfix-2_Source.zip:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Generic Assembler source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file.

    1 
    2 ; ---------------------------------------------------------------------------
    3 ; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
    4 ;
    5 ; LICENSE TERMS
    6 ;
    7 ; The free distribution and use of this software is allowed (with or without
    8 ; changes) provided that:
    9 ;
   10 ;  1. source code distributions include the above copyright notice, this
   11 ;     list of conditions and the following disclaimer;
   12 ;
   13 ;  2. binary distributions include the above copyright notice, this list
   14 ;     of conditions and the following disclaimer in their documentation;
   15 ;
   16 ;  3. the name of the copyright holder is not used to endorse products
   17 ;     built using this software without specific written permission.
   18 ;
   19 ; DISCLAIMER
   20 ;
   21 ; This software is provided 'as is' with no explicit or implied warranties
   22 ; in respect of its properties, including, but not limited to, correctness
   23 ; and/or fitness for purpose.
   24 ; ---------------------------------------------------------------------------
   25 ; Issue 20/12/2007
   26 ;
   27 ; I am grateful to Dag Arne Osvik for many discussions of the techniques that
   28 ; can be used to optimise AES assembler code on AMD64/EM64T architectures.
   29 ; Some of the techniques used in this implementation are the result of
   30 ; suggestions made by him for which I am most grateful.
   31 
   32 ;
   33 ; Adapted for TrueCrypt:
   34 ; - Compatibility with NASM
   35 ;
   36 
   37 ; An AES implementation for AMD64 processors using the YASM assembler.  This
   38 ; implementation provides only encryption and decryption, and hence requires key
   39 ; scheduling support in C. It uses 8k bytes of tables but its encryption and
   40 ; decryption performance is very close to that obtained using large tables.
   41 ; It can use either Windows or Gnu/Linux calling conventions, which are as
   42 ; follows:
   43 ;               windows  gnu/linux
   44 ;
   45 ;   in_blk          rcx     rdi
   46 ;   out_blk         rdx     rsi
   47 ;   context (cx)     r8     rdx
   48 ;
   49 ;   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
   50 ;   registers       rdi      -      on both
   51 ;
   52 ;   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
   53 ;   registers        -      rdi     on both
   54 ;
   55 ; The default convention is that for windows, the gnu/linux convention being
   56 ; used if __GNUC__ is defined.
   57 ;
   58 ; Define _SEH_ to include support for Win64 structured exception handling
   59 ; (this requires YASM version 0.6 or later).
   60 ;
   61 ; This code provides the standard AES block size (128 bits, 16 bytes) and the
   62 ; three standard AES key sizes (128, 192 and 256 bits). It has the same call
   63 ; interface as my C implementation.  It uses the Microsoft C AMD64 calling
   64 ; conventions in which the three parameters are placed in  rcx, rdx and r8
   65 ; respectively.  The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
   66 ;
   67 ;     AES_RETURN aes_encrypt(const unsigned char in_blk[],
   68 ;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
   69 ;
   70 ;     AES_RETURN aes_decrypt(const unsigned char in_blk[],
   71 ;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
   72 ;
   73 ;     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
   74 ;                                            const aes_encrypt_ctx cx[1]);
   75 ;
   76 ;     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
   77 ;                                            const aes_decrypt_ctx cx[1]);
   78 ;
   79 ;     AES_RETURN aes_encrypt_key(const unsigned char key[],
   80 ;                           unsigned int len, const aes_encrypt_ctx cx[1]);
   81 ;
   82 ;     AES_RETURN aes_decrypt_key(const unsigned char key[],
   83 ;                           unsigned int len, const aes_decrypt_ctx cx[1]);
   84 ;
   85 ; where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
   86 ; either bits or bytes.
   87 ;
   88 ; Comment in/out the following lines to obtain the desired subroutines. These
   89 ; selections MUST match those in the C header file aes.h
   90 
   91 ; %define AES_128                 ; define if AES with 128 bit keys is needed
   92 ; %define AES_192                 ; define if AES with 192 bit keys is needed
   93 %define AES_256                 ; define if AES with 256 bit keys is needed
   94 ; %define AES_VAR                 ; define if a variable key size is needed
   95 %define ENCRYPTION              ; define if encryption is needed
   96 %define DECRYPTION              ; define if decryption is needed
   97 %define AES_REV_DKS             ; define if key decryption schedule is reversed
   98 %define LAST_ROUND_TABLES       ; define for the faster version using extra tables
      ; How these symbols are consumed further down in this file:
      ;   AES_128 / AES_192 / AES_256 / AES_VAR  only select the value of KS_LENGTH;
      ;       the 10, 12 and 14 round dispatch in the entry points is always assembled
      ;   ENCRYPTION / DECRYPTION  gate enc_tab + aes_encrypt and dec_tab + aes_decrypt
      ;   AES_REV_DKS  selects rofs, ik_ref and the first key addition in aes_decrypt
      ;   LAST_ROUND_TABLES  selects the fl_rnd / il_rnd variant and adds a second
      ;       2048 byte table after enc_tab and dec_tab
      ; As configured here (AES_256 only, AES_VAR undefined) KS_LENGTH becomes 60.
   99 
  100 ; The encryption key schedule has the following in memory layout where N is the
  101 ; number of rounds (10, 12 or 14):
  102 ;
  103 ; lo: | input key (round 0)  |  ; each round is four 32-bit words
  104 ;     | encryption round 1   |
  105 ;     | encryption round 2   |
  106 ;     ....
  107 ;     | encryption round N-1 |
  108 ; hi: | encryption round N   |
  109 ;
  110 ; The decryption key schedule is normally set up so that it has the same
  111 ; layout as above by actually reversing the order of the encryption key
  112 ; schedule in memory (this happens when AES_REV_DKS is set):
  113 ;
  114 ; lo: | decryption round 0   | =              | encryption round N   |
  115 ;     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
  116 ;     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
  117 ;     ....                       ....
  118 ;     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
  119 ; hi: | decryption round N   | =              | input key (round 0)  |
  120 ;
  121 ; with rounds except the first and last modified using inv_mix_column()
  122 ; But if AES_REV_DKS is NOT set the order of keys is left as it is for
  123 ; encryption so that it has to be accessed in reverse when used for
  124 ; decryption (although the inverse mix column modifications are done)
  125 ;
  126 ; lo: | decryption round 0   | =              | input key (round 0)  |
  127 ;     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
  128 ;     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
  129 ;     ....                       ....
  130 ;     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
  131 ; hi: | decryption round N   | =              | encryption round N   |
  132 ;
  133 ; This layout is faster when the assembler key scheduling provided here
  134 ; is used.
  135 ;
  136 ; The DLL interface must use the _stdcall convention in which the number
  137 ; of bytes of parameter space is added after an @ to the routine's name.
  138 ; We must also remove our parameters from the stack before return (see
  139 ; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.
  140 
  141 ;%define DLL_EXPORT
  142 
  143 ; End of user defines
  144 
  145 %ifdef AES_VAR
      ; a variable key size implies that all three fixed key sizes are enabled
  146 %ifndef AES_128
  147 %define AES_128
  148 %endif
  149 %ifndef AES_192
  150 %define AES_192
  151 %endif
  152 %ifndef AES_256
  153 %define AES_256
  154 %endif
  155 %endif
  156 
      ; KS_LENGTH is sized for the largest enabled key size.  The values 44, 52 and
      ; 60 equal 4 * (rounds + 1) for 10, 12 and 14 rounds, i.e. a count of 32-bit
      ; words (each round key is four 32-bit words).  The entry points read one byte
      ; at byte offset 4*KS_LENGTH from the context pointer, so this value has to
      ; agree with the context layout used by the C key scheduling code.
  157 %ifdef AES_VAR
  158 %define KS_LENGTH       60
  159 %elifdef AES_256
  160 %define KS_LENGTH       60
  161 %elifdef AES_192
  162 %define KS_LENGTH       52
  163 %else
  164 %define KS_LENGTH       44
  165 %endif
  166 
  167 %define     r0  rax
      ; r0..r7: numbered aliases for the eight legacy 64-bit registers.
  168 %define     r1  rdx
  169 %define     r2  rcx
  170 %define     r3  rbx
  171 %define     r4  rsi
  172 %define     r5  rdi
  173 %define     r6  rbp
  174 %define     r7  rsp
  175 
      ; Legacy register name with a d suffix -> its 32-bit form.  The round macros
      ; form dword operands by appending d to a register-name parameter, which is
      ; directly valid for r8..r15; these defines make the same suffix resolve for
      ; the legacy names.
  176 %define     raxd    eax
  177 %define     rdxd    edx
  178 %define     rcxd    ecx
  179 %define     rbxd    ebx
  180 %define     rsid    esi
  181 %define     rdid    edi
  182 %define     rbpd    ebp
  183 %define     rspd    esp
  184 
      ; Legacy register name with a b suffix -> its low 8-bit form.
  185 %define     raxb    al
  186 %define     rdxb    dl
  187 %define     rcxb    cl
  188 %define     rbxb    bl
  189 %define     rsib    sil
  190 %define     rdib    dil
  191 %define     rbpb    bpl
  192 %define     rspb    spl
  193 
      ; High-byte (bits 8..15) forms of r0..r3.
  194 %define     r0h ah
  195 %define     r1h dh
  196 %define     r2h ch
  197 %define     r3h bh
  198 
      ; 32-bit forms of r0..r3.
  199 %define     r0d eax
  200 %define     r1d edx
  201 %define     r2d ecx
  202 %define     r3d ebx
      ; NOTE(review): none of the aliases in this group is referenced by the code
      ; visible in this file; the round macros are only invoked with r9..r12.
  203 
  204 ; finite field multiplies by {02}, {04} and {08}
  205 
  206 %define f2(x)   ((x<<1)^(((x>>7)&1)*0x11b))
      ; f2, f4 and f8 shift x left by 1, 2 or 3 bits and, for every bit that the
      ; shift pushes to position 8 or above, xor in 0x11b shifted to that position
      ; (0x11b, 0x236, 0x46c).  For a byte-sized x the result therefore fits in
      ; 8 bits again.  0x11b is the AES field polynomial x^8 + x^4 + x^3 + x + 1.
      ; The parameter is not parenthesised inside the expansions, so these macros
      ; are only safe for simple operands such as the literals used by the tables.
  207 %define f4(x)   ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
  208 %define f8(x)   ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
  209 
  210 ; finite field multiplies required in table generation
      ; Composite multipliers built by xor of the power-of-two products above:
      ; {03} and the four constants {09}, {0b}, {0d}, {0e}.  f3 is used by the u8
      ; entry layout, the other four by the v8 entry layout.
  211 
  212 %define f3(x)   (f2(x) ^ x)
  213 %define f9(x)   (f8(x) ^ x)
  214 %define fb(x)   (f8(x) ^ f2(x) ^ x)
  215 %define fd(x)   (f8(x) ^ f4(x) ^ x)
  216 %define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
  217 
  218 ; macro for expanding S-box data
  219 
  220 %macro enc_vals 1
      ; Emit one table with 256 entries, in index order 0x00..0xff.
      ; Parameter 1 is the name of a one-argument macro (u8 or w8 in this file)
      ; that is applied to each of the 256 forward S-box bytes listed below; its
      ; expansion is what db actually stores.  With an 8-byte expansion such as
      ; u8 or w8 the table occupies 256 * 8 = 2048 bytes.
  221     db  %1(0x63),%1(0x7c),%1(0x77),%1(0x7b),%1(0xf2),%1(0x6b),%1(0x6f),%1(0xc5)
  222     db  %1(0x30),%1(0x01),%1(0x67),%1(0x2b),%1(0xfe),%1(0xd7),%1(0xab),%1(0x76)
  223     db  %1(0xca),%1(0x82),%1(0xc9),%1(0x7d),%1(0xfa),%1(0x59),%1(0x47),%1(0xf0)
  224     db  %1(0xad),%1(0xd4),%1(0xa2),%1(0xaf),%1(0x9c),%1(0xa4),%1(0x72),%1(0xc0)
  225     db  %1(0xb7),%1(0xfd),%1(0x93),%1(0x26),%1(0x36),%1(0x3f),%1(0xf7),%1(0xcc)
  226     db  %1(0x34),%1(0xa5),%1(0xe5),%1(0xf1),%1(0x71),%1(0xd8),%1(0x31),%1(0x15)
  227     db  %1(0x04),%1(0xc7),%1(0x23),%1(0xc3),%1(0x18),%1(0x96),%1(0x05),%1(0x9a)
  228     db  %1(0x07),%1(0x12),%1(0x80),%1(0xe2),%1(0xeb),%1(0x27),%1(0xb2),%1(0x75)
  229     db  %1(0x09),%1(0x83),%1(0x2c),%1(0x1a),%1(0x1b),%1(0x6e),%1(0x5a),%1(0xa0)
  230     db  %1(0x52),%1(0x3b),%1(0xd6),%1(0xb3),%1(0x29),%1(0xe3),%1(0x2f),%1(0x84)
  231     db  %1(0x53),%1(0xd1),%1(0x00),%1(0xed),%1(0x20),%1(0xfc),%1(0xb1),%1(0x5b)
  232     db  %1(0x6a),%1(0xcb),%1(0xbe),%1(0x39),%1(0x4a),%1(0x4c),%1(0x58),%1(0xcf)
  233     db  %1(0xd0),%1(0xef),%1(0xaa),%1(0xfb),%1(0x43),%1(0x4d),%1(0x33),%1(0x85)
  234     db  %1(0x45),%1(0xf9),%1(0x02),%1(0x7f),%1(0x50),%1(0x3c),%1(0x9f),%1(0xa8)
  235     db  %1(0x51),%1(0xa3),%1(0x40),%1(0x8f),%1(0x92),%1(0x9d),%1(0x38),%1(0xf5)
  236     db  %1(0xbc),%1(0xb6),%1(0xda),%1(0x21),%1(0x10),%1(0xff),%1(0xf3),%1(0xd2)
  237     db  %1(0xcd),%1(0x0c),%1(0x13),%1(0xec),%1(0x5f),%1(0x97),%1(0x44),%1(0x17)
  238     db  %1(0xc4),%1(0xa7),%1(0x7e),%1(0x3d),%1(0x64),%1(0x5d),%1(0x19),%1(0x73)
  239     db  %1(0x60),%1(0x81),%1(0x4f),%1(0xdc),%1(0x22),%1(0x2a),%1(0x90),%1(0x88)
  240     db  %1(0x46),%1(0xee),%1(0xb8),%1(0x14),%1(0xde),%1(0x5e),%1(0x0b),%1(0xdb)
  241     db  %1(0xe0),%1(0x32),%1(0x3a),%1(0x0a),%1(0x49),%1(0x06),%1(0x24),%1(0x5c)
  242     db  %1(0xc2),%1(0xd3),%1(0xac),%1(0x62),%1(0x91),%1(0x95),%1(0xe4),%1(0x79)
  243     db  %1(0xe7),%1(0xc8),%1(0x37),%1(0x6d),%1(0x8d),%1(0xd5),%1(0x4e),%1(0xa9)
  244     db  %1(0x6c),%1(0x56),%1(0xf4),%1(0xea),%1(0x65),%1(0x7a),%1(0xae),%1(0x08)
  245     db  %1(0xba),%1(0x78),%1(0x25),%1(0x2e),%1(0x1c),%1(0xa6),%1(0xb4),%1(0xc6)
  246     db  %1(0xe8),%1(0xdd),%1(0x74),%1(0x1f),%1(0x4b),%1(0xbd),%1(0x8b),%1(0x8a)
  247     db  %1(0x70),%1(0x3e),%1(0xb5),%1(0x66),%1(0x48),%1(0x03),%1(0xf6),%1(0x0e)
  248     db  %1(0x61),%1(0x35),%1(0x57),%1(0xb9),%1(0x86),%1(0xc1),%1(0x1d),%1(0x9e)
  249     db  %1(0xe1),%1(0xf8),%1(0x98),%1(0x11),%1(0x69),%1(0xd9),%1(0x8e),%1(0x94)
  250     db  %1(0x9b),%1(0x1e),%1(0x87),%1(0xe9),%1(0xce),%1(0x55),%1(0x28),%1(0xdf)
  251     db  %1(0x8c),%1(0xa1),%1(0x89),%1(0x0d),%1(0xbf),%1(0xe6),%1(0x42),%1(0x68)
  252     db  %1(0x41),%1(0x99),%1(0x2d),%1(0x0f),%1(0xb0),%1(0x54),%1(0xbb),%1(0x16)
  253 %endmacro
  254 
  255 %macro dec_vals 1
      ; Emit one table with 256 entries, in index order 0x00..0xff.
      ; Parameter 1 is the name of a one-argument macro (v8 or w8 in this file)
      ; that is applied to each of the 256 inverse S-box bytes listed below; its
      ; expansion is what db actually stores.  With an 8-byte expansion such as
      ; v8 or w8 the table occupies 256 * 8 = 2048 bytes.
  256     db  %1(0x52),%1(0x09),%1(0x6a),%1(0xd5),%1(0x30),%1(0x36),%1(0xa5),%1(0x38)
  257     db  %1(0xbf),%1(0x40),%1(0xa3),%1(0x9e),%1(0x81),%1(0xf3),%1(0xd7),%1(0xfb)
  258     db  %1(0x7c),%1(0xe3),%1(0x39),%1(0x82),%1(0x9b),%1(0x2f),%1(0xff),%1(0x87)
  259     db  %1(0x34),%1(0x8e),%1(0x43),%1(0x44),%1(0xc4),%1(0xde),%1(0xe9),%1(0xcb)
  260     db  %1(0x54),%1(0x7b),%1(0x94),%1(0x32),%1(0xa6),%1(0xc2),%1(0x23),%1(0x3d)
  261     db  %1(0xee),%1(0x4c),%1(0x95),%1(0x0b),%1(0x42),%1(0xfa),%1(0xc3),%1(0x4e)
  262     db  %1(0x08),%1(0x2e),%1(0xa1),%1(0x66),%1(0x28),%1(0xd9),%1(0x24),%1(0xb2)
  263     db  %1(0x76),%1(0x5b),%1(0xa2),%1(0x49),%1(0x6d),%1(0x8b),%1(0xd1),%1(0x25)
  264     db  %1(0x72),%1(0xf8),%1(0xf6),%1(0x64),%1(0x86),%1(0x68),%1(0x98),%1(0x16)
  265     db  %1(0xd4),%1(0xa4),%1(0x5c),%1(0xcc),%1(0x5d),%1(0x65),%1(0xb6),%1(0x92)
  266     db  %1(0x6c),%1(0x70),%1(0x48),%1(0x50),%1(0xfd),%1(0xed),%1(0xb9),%1(0xda)
  267     db  %1(0x5e),%1(0x15),%1(0x46),%1(0x57),%1(0xa7),%1(0x8d),%1(0x9d),%1(0x84)
  268     db  %1(0x90),%1(0xd8),%1(0xab),%1(0x00),%1(0x8c),%1(0xbc),%1(0xd3),%1(0x0a)
  269     db  %1(0xf7),%1(0xe4),%1(0x58),%1(0x05),%1(0xb8),%1(0xb3),%1(0x45),%1(0x06)
  270     db  %1(0xd0),%1(0x2c),%1(0x1e),%1(0x8f),%1(0xca),%1(0x3f),%1(0x0f),%1(0x02)
  271     db  %1(0xc1),%1(0xaf),%1(0xbd),%1(0x03),%1(0x01),%1(0x13),%1(0x8a),%1(0x6b)
  272     db  %1(0x3a),%1(0x91),%1(0x11),%1(0x41),%1(0x4f),%1(0x67),%1(0xdc),%1(0xea)
  273     db  %1(0x97),%1(0xf2),%1(0xcf),%1(0xce),%1(0xf0),%1(0xb4),%1(0xe6),%1(0x73)
  274     db  %1(0x96),%1(0xac),%1(0x74),%1(0x22),%1(0xe7),%1(0xad),%1(0x35),%1(0x85)
  275     db  %1(0xe2),%1(0xf9),%1(0x37),%1(0xe8),%1(0x1c),%1(0x75),%1(0xdf),%1(0x6e)
  276     db  %1(0x47),%1(0xf1),%1(0x1a),%1(0x71),%1(0x1d),%1(0x29),%1(0xc5),%1(0x89)
  277     db  %1(0x6f),%1(0xb7),%1(0x62),%1(0x0e),%1(0xaa),%1(0x18),%1(0xbe),%1(0x1b)
  278     db  %1(0xfc),%1(0x56),%1(0x3e),%1(0x4b),%1(0xc6),%1(0xd2),%1(0x79),%1(0x20)
  279     db  %1(0x9a),%1(0xdb),%1(0xc0),%1(0xfe),%1(0x78),%1(0xcd),%1(0x5a),%1(0xf4)
  280     db  %1(0x1f),%1(0xdd),%1(0xa8),%1(0x33),%1(0x88),%1(0x07),%1(0xc7),%1(0x31)
  281     db  %1(0xb1),%1(0x12),%1(0x10),%1(0x59),%1(0x27),%1(0x80),%1(0xec),%1(0x5f)
  282     db  %1(0x60),%1(0x51),%1(0x7f),%1(0xa9),%1(0x19),%1(0xb5),%1(0x4a),%1(0x0d)
  283     db  %1(0x2d),%1(0xe5),%1(0x7a),%1(0x9f),%1(0x93),%1(0xc9),%1(0x9c),%1(0xef)
  284     db  %1(0xa0),%1(0xe0),%1(0x3b),%1(0x4d),%1(0xae),%1(0x2a),%1(0xf5),%1(0xb0)
  285     db  %1(0xc8),%1(0xeb),%1(0xbb),%1(0x3c),%1(0x83),%1(0x53),%1(0x99),%1(0x61)
  286     db  %1(0x17),%1(0x2b),%1(0x04),%1(0x7e),%1(0xba),%1(0x77),%1(0xd6),%1(0x26)
  287     db  %1(0xe1),%1(0x69),%1(0x14),%1(0x63),%1(0x55),%1(0x21),%1(0x0c),%1(0x7d)
  288 %endmacro
  289 
  290 %define u8(x)   f2(x), x, x, f3(x), f2(x), x, x, f3(x)
  291 %define v8(x)   fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x
  292 %define w8(x)   x, 0, 0, 0, x, 0, 0, 0
      ; u8, v8 and w8 each expand to the 8 bytes of one table entry.  The
      ; tab_0..tab_3 references below read a dword at byte offset 0, 3, 2 or 1 of
      ; an entry, so one 8-byte entry supplies four byte-rotations of one word
      ; (bytes listed in memory order):
      ;   u8:  +0 = f2,x,x,f3     +3 = f3,f2,x,x     +2 = x,f3,f2,x     +1 = x,x,f3,f2
      ;   v8:  +0 = fe,f9,fd,fb   +3 = fb,fe,f9,fd   +2 = fd,fb,fe,f9   +1 = f9,fd,fb,fe
      ;   w8:  +0 = x,0,0,0       +3 = 0,x,0,0       +2 = 0,0,x,0       +1 = 0,0,0,x
      ; Byte 7 of an entry is never covered by those dword reads; in v8 it holds
      ; the plain byte x, which tab_i reads.  tab_f reads byte 1, which is the
      ; plain byte x in a u8 entry.
  293 
  294 %define tptr    rbp     ; table pointer
  295 %define kptr    r8      ; key schedule pointer
  296 %define fofs    128     ; adjust offset in key schedule to keep |disp| < 128
      ; fk_ref(x,y): 32-bit word y of the 16-byte round-key block located 16*x
      ; bytes below kptr+fofs.  The entry points subtract fofs from kptr and add
      ; 16 * rounds, so fk_ref(0,y) addresses the last block of the schedule.
  297 %define fk_ref(x,y) [kptr-16*x+fofs+4*y]
      ; ik_ref(x,y): the same for decryption.  With AES_REV_DKS the schedule is
      ; walked exactly as for encryption; without it kptr is biased by +128 and
      ; block x is addressed upwards from the start of the schedule.
  298 %ifdef  AES_REV_DKS
  299 %define rofs    128
  300 %define ik_ref(x,y) [kptr-16*x+rofs+4*y]
  301 %else
  302 %define rofs    -128
  303 %define ik_ref(x,y) [kptr+16*x+rofs+4*y]
  304 %endif
  305 
      ; Table references.  x is a 64-bit register holding an entry index 0..255;
      ; entries are 8 bytes apart.  tab_0..tab_3 carry no size keyword and take
      ; their width from the other (dword register) operand.
  306 %define tab_0(x)   [tptr+8*x]
  307 %define tab_1(x)   [tptr+8*x+3]
  308 %define tab_2(x)   [tptr+8*x+2]
  309 %define tab_3(x)   [tptr+8*x+1]
  310 %define tab_f(x)   byte [tptr+8*x+1]
  311 %define tab_i(x)   byte [tptr+8*x+7]
      ; t_ref(x,r): pastes x (0, 1, 2, 3, f or i) onto tab_ and applies the
      ; resulting macro to register r.
  312 %define t_ref(x,r) tab_ %+ x(r)
  313 
  314 %macro ff_rnd 5                 ; normal forward round
      ; Parameters:
      ;   1..4  names of the four output registers out0..out3; each is used with a
      ;         d suffix appended (the entry point passes r9, r10, r11, r12)
      ;   5     round index passed to fk_ref to select the round-key block
      ; In:     eax, ebx, ecx, edx = the four 32-bit state words (columns 0..3)
      ;         tptr = base of the round table, kptr = biased key schedule pointer
      ; Out:    new state in out0..out3 and also copied back to eax, ebx, ecx, edx
      ; Clobb:  rsi, rdi (table indices), flags
      ; The outputs are first loaded with the round key; every state byte then
      ; selects one table entry whose rotated word is xored into one output.
  315     mov     %1d, fk_ref(%5,0)
  316     mov     %2d, fk_ref(%5,1)
  317     mov     %3d, fk_ref(%5,2)
  318     mov     %4d, fk_ref(%5,3)
  319 
      ; column 0 (eax): byte 0 -> out0, byte 1 -> out3, byte 2 -> out2, byte 3 -> out1
  320     movzx   esi, al
  321     movzx   edi, ah
  322     shr     eax, 16
  323     xor     %1d, t_ref(0,rsi)
  324     xor     %4d, t_ref(1,rdi)
  325     movzx   esi, al
  326     movzx   edi, ah
  327     xor     %3d, t_ref(2,rsi)
  328     xor     %2d, t_ref(3,rdi)
  329 
      ; column 1 (ebx): byte 0 -> out1, byte 1 -> out0, byte 2 -> out3, byte 3 -> out2
  330     movzx   esi, bl
  331     movzx   edi, bh
  332     shr     ebx, 16
  333     xor     %2d, t_ref(0,rsi)
  334     xor     %1d, t_ref(1,rdi)
  335     movzx   esi, bl
  336     movzx   edi, bh
  337     xor     %4d, t_ref(2,rsi)
  338     xor     %3d, t_ref(3,rdi)
  339 
      ; column 2 (ecx): byte 0 -> out2, byte 1 -> out1, byte 2 -> out0, byte 3 -> out3
  340     movzx   esi, cl
  341     movzx   edi, ch
  342     shr     ecx, 16
  343     xor     %3d, t_ref(0,rsi)
  344     xor     %2d, t_ref(1,rdi)
  345     movzx   esi, cl
  346     movzx   edi, ch
  347     xor     %1d, t_ref(2,rsi)
  348     xor     %4d, t_ref(3,rdi)
  349 
      ; column 3 (edx): byte 0 -> out3, byte 1 -> out2, byte 2 -> out1, byte 3 -> out0
  350     movzx   esi, dl
  351     movzx   edi, dh
  352     shr     edx, 16
  353     xor     %4d, t_ref(0,rsi)
  354     xor     %3d, t_ref(1,rdi)
  355     movzx   esi, dl
  356     movzx   edi, dh
  357     xor     %2d, t_ref(2,rsi)
  358     xor     %1d, t_ref(3,rdi)
  359 
      ; hand the new state back in the registers the next round reads
  360     mov     eax,%1d
  361     mov     ebx,%2d
  362     mov     ecx,%3d
  363     mov     edx,%4d
  364 %endmacro
  365 
  366 %ifdef LAST_ROUND_TABLES
  367 
  368 %macro fl_rnd 5                 ; last forward round
      ; Variant assembled when LAST_ROUND_TABLES is defined.
      ; Same parameters, inputs and byte routing as ff_rnd, with two differences:
      ;   - tptr is first advanced by 2048 bytes, i.e. to the second table that
      ;     follows enc_tab (entries in the w8 layout); tptr is left advanced
      ;   - the result stays in the four output registers only; it is not copied
      ;     back to eax, ebx, ecx, edx, whose contents are left shifted/destroyed
      ; Clobb:  rsi, rdi, tptr, eax, ebx, ecx, edx, flags
  369     add     tptr, 2048
  370     mov     %1d, fk_ref(%5,0)
  371     mov     %2d, fk_ref(%5,1)
  372     mov     %3d, fk_ref(%5,2)
  373     mov     %4d, fk_ref(%5,3)
  374 
      ; column 0 (eax): byte 0 -> out0, byte 1 -> out3, byte 2 -> out2, byte 3 -> out1
  375     movzx   esi, al
  376     movzx   edi, ah
  377     shr     eax, 16
  378     xor     %1d, t_ref(0,rsi)
  379     xor     %4d, t_ref(1,rdi)
  380     movzx   esi, al
  381     movzx   edi, ah
  382     xor     %3d, t_ref(2,rsi)
  383     xor     %2d, t_ref(3,rdi)
  384 
      ; column 1 (ebx): byte 0 -> out1, byte 1 -> out0, byte 2 -> out3, byte 3 -> out2
  385     movzx   esi, bl
  386     movzx   edi, bh
  387     shr     ebx, 16
  388     xor     %2d, t_ref(0,rsi)
  389     xor     %1d, t_ref(1,rdi)
  390     movzx   esi, bl
  391     movzx   edi, bh
  392     xor     %4d, t_ref(2,rsi)
  393     xor     %3d, t_ref(3,rdi)
  394 
      ; column 2 (ecx): byte 0 -> out2, byte 1 -> out1, byte 2 -> out0, byte 3 -> out3
  395     movzx   esi, cl
  396     movzx   edi, ch
  397     shr     ecx, 16
  398     xor     %3d, t_ref(0,rsi)
  399     xor     %2d, t_ref(1,rdi)
  400     movzx   esi, cl
  401     movzx   edi, ch
  402     xor     %1d, t_ref(2,rsi)
  403     xor     %4d, t_ref(3,rdi)
  404 
      ; column 3 (edx): byte 0 -> out3, byte 1 -> out2, byte 2 -> out1, byte 3 -> out0
  405     movzx   esi, dl
  406     movzx   edi, dh
  407     shr     edx, 16
  408     xor     %4d, t_ref(0,rsi)
  409     xor     %3d, t_ref(1,rdi)
  410     movzx   esi, dl
  411     movzx   edi, dh
  412     xor     %2d, t_ref(2,rsi)
  413     xor     %1d, t_ref(3,rdi)
  414 %endmacro
  415 
  416 %else
  417 
  418 %macro fl_rnd 5                 ; last forward round
      ; Variant assembled when LAST_ROUND_TABLES is NOT defined.
      ; Same parameters, inputs and byte routing as ff_rnd, but instead of a
      ; dword table read each state byte is replaced by the single byte that
      ; tab_f reads (byte 1 of the entry, the plain value in the u8 layout).
      ; That byte is zero-extended and rotated left by 0, 8, 16 or 24 bits so
      ; that it lands in the same byte lane it was taken from, then xored into
      ; the output register.  tptr is not modified.
      ; The result stays in the four output registers only.
      ; Clobb:  rsi, rdi, eax, ebx, ecx, edx, flags
  419     mov     %1d, fk_ref(%5,0)
  420     mov     %2d, fk_ref(%5,1)
  421     mov     %3d, fk_ref(%5,2)
  422     mov     %4d, fk_ref(%5,3)
  423 
      ; column 0 (eax): byte 0 -> out0, byte 1 -> out3, byte 2 -> out2, byte 3 -> out1
  424     movzx   esi, al
  425     movzx   edi, ah
  426     shr     eax, 16
  427     movzx   esi, t_ref(f,rsi)
  428     movzx   edi, t_ref(f,rdi)
  429     xor     %1d, esi
  430     rol     edi, 8
  431     xor     %4d, edi
  432     movzx   esi, al
  433     movzx   edi, ah
  434     movzx   esi, t_ref(f,rsi)
  435     movzx   edi, t_ref(f,rdi)
  436     rol     esi, 16
  437     rol     edi, 24
  438     xor     %3d, esi
  439     xor     %2d, edi
  440 
      ; column 1 (ebx): byte 0 -> out1, byte 1 -> out0, byte 2 -> out3, byte 3 -> out2
  441     movzx   esi, bl
  442     movzx   edi, bh
  443     shr     ebx, 16
  444     movzx   esi, t_ref(f,rsi)
  445     movzx   edi, t_ref(f,rdi)
  446     xor     %2d, esi
  447     rol     edi, 8
  448     xor     %1d, edi
  449     movzx   esi, bl
  450     movzx   edi, bh
  451     movzx   esi, t_ref(f,rsi)
  452     movzx   edi, t_ref(f,rdi)
  453     rol     esi, 16
  454     rol     edi, 24
  455     xor     %4d, esi
  456     xor     %3d, edi
  457 
      ; column 2 (ecx): byte 0 -> out2, byte 1 -> out1, byte 2 -> out0, byte 3 -> out3
      ; (here and below the shift is issued after the table reads; the indices
      ; were already captured in esi/edi, so the order does not matter)
  458     movzx   esi, cl
  459     movzx   edi, ch
  460     movzx   esi, t_ref(f,rsi)
  461     movzx   edi, t_ref(f,rdi)
  462     shr     ecx, 16
  463     xor     %3d, esi
  464     rol     edi, 8
  465     xor     %2d, edi
  466     movzx   esi, cl
  467     movzx   edi, ch
  468     movzx   esi, t_ref(f,rsi)
  469     movzx   edi, t_ref(f,rdi)
  470     rol     esi, 16
  471     rol     edi, 24
  472     xor     %1d, esi
  473     xor     %4d, edi
  474 
      ; column 3 (edx): byte 0 -> out3, byte 1 -> out2, byte 2 -> out1, byte 3 -> out0
  475     movzx   esi, dl
  476     movzx   edi, dh
  477     movzx   esi, t_ref(f,rsi)
  478     movzx   edi, t_ref(f,rdi)
  479     shr     edx, 16
  480     xor     %4d, esi
  481     rol     edi, 8
  482     xor     %3d, edi
  483     movzx   esi, dl
  484     movzx   edi, dh
  485     movzx   esi, t_ref(f,rsi)
  486     movzx   edi, t_ref(f,rdi)
  487     rol     esi, 16
  488     rol     edi, 24
  489     xor     %2d, esi
  490     xor     %1d, edi
  491 %endmacro
  492 
  493 %endif
  494 
  495 %macro ii_rnd 5                 ; normal inverse round
      ; Parameters:
      ;   1..4  names of the four output registers out0..out3; each is used with a
      ;         d suffix appended (the entry point passes r9, r10, r11, r12)
      ;   5     round index passed to ik_ref to select the round-key block
      ; In:     eax, ebx, ecx, edx = the four 32-bit state words (columns 0..3)
      ;         tptr = base of the round table, kptr = biased key schedule pointer
      ; Out:    new state in out0..out3 and also copied back to eax, ebx, ecx, edx
      ; Clobb:  rsi, rdi (table indices), flags
      ; Structure is identical to ff_rnd; only the key reference (ik_ref) and the
      ; routing of bytes 1 and 3 to output registers differ (opposite direction).
  496     mov     %1d, ik_ref(%5,0)
  497     mov     %2d, ik_ref(%5,1)
  498     mov     %3d, ik_ref(%5,2)
  499     mov     %4d, ik_ref(%5,3)
  500 
      ; column 0 (eax): byte 0 -> out0, byte 1 -> out1, byte 2 -> out2, byte 3 -> out3
  501     movzx   esi, al
  502     movzx   edi, ah
  503     shr     eax, 16
  504     xor     %1d, t_ref(0,rsi)
  505     xor     %2d, t_ref(1,rdi)
  506     movzx   esi, al
  507     movzx   edi, ah
  508     xor     %3d, t_ref(2,rsi)
  509     xor     %4d, t_ref(3,rdi)
  510 
      ; column 1 (ebx): byte 0 -> out1, byte 1 -> out2, byte 2 -> out3, byte 3 -> out0
  511     movzx   esi, bl
  512     movzx   edi, bh
  513     shr     ebx, 16
  514     xor     %2d, t_ref(0,rsi)
  515     xor     %3d, t_ref(1,rdi)
  516     movzx   esi, bl
  517     movzx   edi, bh
  518     xor     %4d, t_ref(2,rsi)
  519     xor     %1d, t_ref(3,rdi)
  520 
      ; column 2 (ecx): byte 0 -> out2, byte 1 -> out3, byte 2 -> out0, byte 3 -> out1
  521     movzx   esi, cl
  522     movzx   edi, ch
  523     shr     ecx, 16
  524     xor     %3d, t_ref(0,rsi)
  525     xor     %4d, t_ref(1,rdi)
  526     movzx   esi, cl
  527     movzx   edi, ch
  528     xor     %1d, t_ref(2,rsi)
  529     xor     %2d, t_ref(3,rdi)
  530 
      ; column 3 (edx): byte 0 -> out3, byte 1 -> out0, byte 2 -> out1, byte 3 -> out2
  531     movzx   esi, dl
  532     movzx   edi, dh
  533     shr     edx, 16
  534     xor     %4d, t_ref(0,rsi)
  535     xor     %1d, t_ref(1,rdi)
  536     movzx   esi, dl
  537     movzx   edi, dh
  538     xor     %2d, t_ref(2,rsi)
  539     xor     %3d, t_ref(3,rdi)
  540 
      ; hand the new state back in the registers the next round reads
  541     mov     eax,%1d
  542     mov     ebx,%2d
  543     mov     ecx,%3d
  544     mov     edx,%4d
  545 %endmacro
  546 
  547 %ifdef LAST_ROUND_TABLES
  548 
  549 %macro il_rnd 5                 ; last inverse round
      ; Variant assembled when LAST_ROUND_TABLES is defined.
      ; Same parameters, inputs and byte routing as ii_rnd, with two differences:
      ;   - tptr is first advanced by 2048 bytes, i.e. to the second table that
      ;     follows dec_tab (entries in the w8 layout); tptr is left advanced
      ;   - the result stays in the four output registers only; it is not copied
      ;     back to eax, ebx, ecx, edx, whose contents are left shifted/destroyed
      ; Clobb:  rsi, rdi, tptr, eax, ebx, ecx, edx, flags
  550     add     tptr, 2048
  551     mov     %1d, ik_ref(%5,0)
  552     mov     %2d, ik_ref(%5,1)
  553     mov     %3d, ik_ref(%5,2)
  554     mov     %4d, ik_ref(%5,3)
  555 
      ; column 0 (eax): byte 0 -> out0, byte 1 -> out1, byte 2 -> out2, byte 3 -> out3
  556     movzx   esi, al
  557     movzx   edi, ah
  558     shr     eax, 16
  559     xor     %1d, t_ref(0,rsi)
  560     xor     %2d, t_ref(1,rdi)
  561     movzx   esi, al
  562     movzx   edi, ah
  563     xor     %3d, t_ref(2,rsi)
  564     xor     %4d, t_ref(3,rdi)
  565 
      ; column 1 (ebx): byte 0 -> out1, byte 1 -> out2, byte 2 -> out3, byte 3 -> out0
  566     movzx   esi, bl
  567     movzx   edi, bh
  568     shr     ebx, 16
  569     xor     %2d, t_ref(0,rsi)
  570     xor     %3d, t_ref(1,rdi)
  571     movzx   esi, bl
  572     movzx   edi, bh
  573     xor     %4d, t_ref(2,rsi)
  574     xor     %1d, t_ref(3,rdi)
  575 
      ; column 2 (ecx): byte 0 -> out2, byte 1 -> out3, byte 2 -> out0, byte 3 -> out1
  576     movzx   esi, cl
  577     movzx   edi, ch
  578     shr     ecx, 16
  579     xor     %3d, t_ref(0,rsi)
  580     xor     %4d, t_ref(1,rdi)
  581     movzx   esi, cl
  582     movzx   edi, ch
  583     xor     %1d, t_ref(2,rsi)
  584     xor     %2d, t_ref(3,rdi)
  585 
      ; column 3 (edx): byte 0 -> out3, byte 1 -> out0, byte 2 -> out1, byte 3 -> out2
  586     movzx   esi, dl
  587     movzx   edi, dh
  588     shr     edx, 16
  589     xor     %4d, t_ref(0,rsi)
  590     xor     %1d, t_ref(1,rdi)
  591     movzx   esi, dl
  592     movzx   edi, dh
  593     xor     %2d, t_ref(2,rsi)
  594     xor     %3d, t_ref(3,rdi)
  595 %endmacro
  596 
  597 %else
  598 
  599 %macro il_rnd 5                 ; last inverse round
      ; Variant assembled when LAST_ROUND_TABLES is NOT defined.
      ; Same parameters, inputs and byte routing as ii_rnd, but instead of a
      ; dword table read each state byte is replaced by the single byte that
      ; tab_i reads (byte 7 of the entry, the plain value in the v8 layout).
      ; That byte is zero-extended and rotated left by 0, 8, 16 or 24 bits so
      ; that it lands in the same byte lane it was taken from, then xored into
      ; the output register.  tptr is not modified.
      ; The result stays in the four output registers only.
      ; Clobb:  rsi, rdi, eax, ebx, ecx, edx, flags
  600     mov     %1d, ik_ref(%5,0)
  601     mov     %2d, ik_ref(%5,1)
  602     mov     %3d, ik_ref(%5,2)
  603     mov     %4d, ik_ref(%5,3)
  604 
      ; column 0 (eax): byte 0 -> out0, byte 1 -> out1, byte 2 -> out2, byte 3 -> out3
  605     movzx   esi, al
  606     movzx   edi, ah
  607     movzx   esi, t_ref(i,rsi)
  608     movzx   edi, t_ref(i,rdi)
  609     shr     eax, 16
  610     xor     %1d, esi
  611     rol     edi, 8
  612     xor     %2d, edi
  613     movzx   esi, al
  614     movzx   edi, ah
  615     movzx   esi, t_ref(i,rsi)
  616     movzx   edi, t_ref(i,rdi)
  617     rol     esi, 16
  618     rol     edi, 24
  619     xor     %3d, esi
  620     xor     %4d, edi
  621 
      ; column 1 (ebx): byte 0 -> out1, byte 1 -> out2, byte 2 -> out3, byte 3 -> out0
  622     movzx   esi, bl
  623     movzx   edi, bh
  624     movzx   esi, t_ref(i,rsi)
  625     movzx   edi, t_ref(i,rdi)
  626     shr     ebx, 16
  627     xor     %2d, esi
  628     rol     edi, 8
  629     xor     %3d, edi
  630     movzx   esi, bl
  631     movzx   edi, bh
  632     movzx   esi, t_ref(i,rsi)
  633     movzx   edi, t_ref(i,rdi)
  634     rol     esi, 16
  635     rol     edi, 24
  636     xor     %4d, esi
  637     xor     %1d, edi
  638 
      ; column 2 (ecx): byte 0 -> out2, byte 1 -> out3, byte 2 -> out0, byte 3 -> out1
  639     movzx   esi, cl
  640     movzx   edi, ch
  641     movzx   esi, t_ref(i,rsi)
  642     movzx   edi, t_ref(i,rdi)
  643     shr     ecx, 16
  644     xor     %3d, esi
  645     rol     edi, 8
  646     xor     %4d, edi
  647     movzx   esi, cl
  648     movzx   edi, ch
  649     movzx   esi, t_ref(i,rsi)
  650     movzx   edi, t_ref(i,rdi)
  651     rol     esi, 16
  652     rol     edi, 24
  653     xor     %1d, esi
  654     xor     %2d, edi
  655 
      ; column 3 (edx): byte 0 -> out3, byte 1 -> out0, byte 2 -> out1, byte 3 -> out2
  656     movzx   esi, dl
  657     movzx   edi, dh
  658     movzx   esi, t_ref(i,rsi)
  659     movzx   edi, t_ref(i,rdi)
  660     shr     edx, 16
  661     xor     %4d, esi
  662     rol     edi, 8
  663     xor     %1d, edi
  664     movzx   esi, dl
  665     movzx   edi, dh
  666     movzx   esi, t_ref(i,rsi)
  667     movzx   edi, t_ref(i,rdi)
  668     rol     esi, 16
  669     rol     edi, 24
  670     xor     %2d, esi
  671     xor     %3d, edi
  672 %endmacro
  673 
  674 %endif
  675 
  676 %ifdef ENCRYPTION
      ; ----------------------------------------------------------------------
      ; aes_encrypt: encrypt one 16-byte block.
      ; Args:   windows (default):            rcx = in_blk, rdx = out_blk, r8  = context
      ;         gnu/linux (__GNUC__ defined): rdi = in_blk, rsi = out_blk, rdx = context
      ; Return: rax = 0 after 16 bytes have been stored to out_blk;
      ;         rax = -1, with nothing stored, when the byte at offset 4*KS_LENGTH
      ;         of the context is not 10*16, 12*16 or 14*16
      ; Saved and restored: rbx, rbp, r12 (and rsi, rdi in the windows build)
      ; Overwritten: rcx, rdx, r8, r9, r10, r11, flags (and rsi, rdi on gnu/linux)
      ; Stack slots: [rsp+0] out_blk, [rsp+8] rbx, [rsp+16] rbp, [rsp+24] r12,
      ;              windows only: [rsp+32] rsi, [rsp+40] rdi
      ; ----------------------------------------------------------------------
  677 
  678     global  aes_encrypt
  679 %ifdef DLL_EXPORT
  680     export  aes_encrypt
  681 %endif
  682 
      ; enc_tab: 256 entries of 8 bytes in the u8 layout (2048 bytes).  With
      ; LAST_ROUND_TABLES a second 2048-byte table in the w8 layout follows
      ; directly; fl_rnd reaches it by adding 2048 to tptr.
  683     section .data align=64
  684     align   64
  685 enc_tab:
  686     enc_vals u8
  687 %ifdef LAST_ROUND_TABLES
  688     enc_vals w8
  689 %endif
  690 
  691     section .text align=16
  692     align   16
  693 
      ; NOTE(review): proc_frame, alloc_stack, save_reg, end_prologue and
      ; endproc_frame are not defined anywhere in this file; presumably they are
      ; the YASM Win64 SEH directives mentioned in the header comment - confirm
      ; before enabling _SEH_ with another assembler.
  694 %ifdef _SEH_
  695 proc_frame aes_encrypt
  696     alloc_stack 7*8         ; 7 to align stack to 16 bytes
  697     save_reg    rsi,4*8
  698     save_reg    rdi,5*8
  699     save_reg    rbx,1*8
  700     save_reg    rbp,2*8
  701     save_reg    r12,3*8
  702 end_prologue
  703     mov     rdi, rcx        ; input pointer
  704     mov     [rsp+0*8], rdx  ; output pointer
  705 %else
  706     aes_encrypt:
  707     %ifdef __GNUC__
  708         sub     rsp, 4*8        ; gnu/linux binary interface
  709         mov     [rsp+0*8], rsi  ; output pointer
  710         mov     r8, rdx         ; context
  711     %else
  712         sub     rsp, 6*8        ; windows binary interface
  713         mov     [rsp+4*8], rsi
  714         mov     [rsp+5*8], rdi
  715         mov     rdi, rcx        ; input pointer
  716         mov     [rsp+0*8], rdx  ; output pointer
  717     %endif
      ; the three stores below save rbx, rbp and r12; their trailing comments
      ; summarise where the three arguments live from this point on
  718         mov     [rsp+1*8], rbx  ; input pointer in rdi
  719         mov     [rsp+2*8], rbp  ; output pointer in [rsp]
  720         mov     [rsp+3*8], r12  ; context in r8
  721 %endif
  722 
      ; esi = byte stored directly after the 4*KS_LENGTH bytes of key schedule.
      ; It is compared with 10*16, 12*16 and 14*16 below and is also used as a
      ; byte offset into the schedule, i.e. it holds 16 * number of rounds.
  723     movzx   esi, byte [kptr+4*KS_LENGTH]
  724     lea     tptr, [rel enc_tab]
      ; bias kptr by -fofs so that the fk_ref displacements stay small
  725     sub     kptr, fofs
  726 
      ; load the input block as four 32-bit words
  727     mov     eax, [rdi+0*4]
  728     mov     ebx, [rdi+1*4]
  729     mov     ecx, [rdi+2*4]
  730     mov     edx, [rdi+3*4]
  731 
      ; first key addition: xor with the lowest 16 bytes of the schedule
      ; (the input key, round 0)
  732     xor     eax, [kptr+fofs]
  733     xor     ebx, [kptr+fofs+4]
  734     xor     ecx, [kptr+fofs+8]
  735     xor     edx, [kptr+fofs+12]
  736 
      ; advance kptr by 16 * rounds so that fk_ref(0,..) addresses the last
      ; round-key block and fk_ref(n,..) the block n rounds before it, then
      ; dispatch on the round count; any other value returns -1
  737     lea     kptr,[kptr+rsi]
  738     cmp     esi, 10*16
  739     je      .3
  740     cmp     esi, 12*16
  741     je      .2
  742     cmp     esi, 14*16
  743     je      .1
  744     mov     rax, -1
  745     jmp     .4
  746 
      ; .1 = entry for 14 rounds, .2 = entry for 12 rounds, .3 = entry for 10
      ; rounds; execution falls through the remaining rounds to fl_rnd
  747 .1: ff_rnd  r9, r10, r11, r12, 13
  748     ff_rnd  r9, r10, r11, r12, 12
  749 .2: ff_rnd  r9, r10, r11, r12, 11
  750     ff_rnd  r9, r10, r11, r12, 10
  751 .3: ff_rnd  r9, r10, r11, r12, 9
  752     ff_rnd  r9, r10, r11, r12, 8
  753     ff_rnd  r9, r10, r11, r12, 7
  754     ff_rnd  r9, r10, r11, r12, 6
  755     ff_rnd  r9, r10, r11, r12, 5
  756     ff_rnd  r9, r10, r11, r12, 4
  757     ff_rnd  r9, r10, r11, r12, 3
  758     ff_rnd  r9, r10, r11, r12, 2
  759     ff_rnd  r9, r10, r11, r12, 1
  760     fl_rnd  r9, r10, r11, r12, 0
  761 
      ; fl_rnd leaves the result in r9d..r12d; reload the saved output pointer
      ; and store the block, then return 0
  762     mov     rbx, [rsp]
  763     mov     [rbx], r9d
  764     mov     [rbx+4], r10d
  765     mov     [rbx+8], r11d
  766     mov     [rbx+12], r12d
  767     xor     rax, rax
      ; common exit: restore the saved registers and release the stack frame
      ; (this also undoes the change fl_rnd may have made to tptr = rbp)
  768 .4:
  769     mov     rbx, [rsp+1*8]
  770     mov     rbp, [rsp+2*8]
  771     mov     r12, [rsp+3*8]
  772 %ifdef __GNUC__
  773     add     rsp, 4*8
  774     ret
  775 %else
  776         mov     rsi, [rsp+4*8]
  777         mov     rdi, [rsp+5*8]
  778     %ifdef _SEH_
  779         add     rsp, 7*8
  780         ret
  781     endproc_frame
  782     %else
  783         add     rsp, 6*8
  784         ret
  785     %endif
  786 %endif
  787 
  788 %endif
  789 
  790 %ifdef DECRYPTION
      ; ----------------------------------------------------------------------
      ; aes_decrypt: decrypt one 16-byte block.
      ; Args:   windows (default):            rcx = in_blk, rdx = out_blk, r8  = context
      ;         gnu/linux (__GNUC__ defined): rdi = in_blk, rsi = out_blk, rdx = context
      ; Return: rax = 0 after 16 bytes have been stored to out_blk;
      ;         rax = -1, with nothing stored, when the byte at offset 4*KS_LENGTH
      ;         of the context is not 10*16, 12*16 or 14*16
      ; Saved and restored: rbx, rbp, r12 (and rsi, rdi in the windows build)
      ; Overwritten: rcx, rdx, r8, r9, r10, r11, flags (and rsi, rdi on gnu/linux)
      ; Stack slots: [rsp+0] out_blk, [rsp+8] rbx, [rsp+16] rbp, [rsp+24] r12,
      ;              windows only: [rsp+32] rsi, [rsp+40] rdi
      ; ----------------------------------------------------------------------
  791 
  792     global  aes_decrypt
  793 %ifdef DLL_EXPORT
  794     export  aes_decrypt
  795 %endif
  796 
      ; dec_tab: 256 entries of 8 bytes in the v8 layout (2048 bytes).  With
      ; LAST_ROUND_TABLES a second 2048-byte table in the w8 layout follows
      ; directly; il_rnd reaches it by adding 2048 to tptr.
  797     section .data
  798     align   64
  799 dec_tab:
  800     dec_vals v8
  801 %ifdef LAST_ROUND_TABLES
  802     dec_vals w8
  803 %endif
  804 
  805     section .text
  806     align   16
  807 
      ; NOTE(review): proc_frame, alloc_stack, save_reg, end_prologue and
      ; endproc_frame are not defined anywhere in this file; presumably they are
      ; the YASM Win64 SEH directives mentioned in the header comment - confirm
      ; before enabling _SEH_ with another assembler.
  808 %ifdef _SEH_
  809 proc_frame aes_decrypt
  810     alloc_stack 7*8         ; 7 to align stack to 16 bytes
  811     save_reg    rsi,4*8
  812     save_reg    rdi,5*8
  813     save_reg    rbx,1*8
  814     save_reg    rbp,2*8
  815     save_reg    r12,3*8
  816 end_prologue
  817     mov     rdi, rcx        ; input pointer
  818     mov     [rsp+0*8], rdx  ; output pointer
  819 %else
  820     aes_decrypt:
  821     %ifdef __GNUC__
  822         sub     rsp, 4*8        ; gnu/linux binary interface
  823         mov     [rsp+0*8], rsi  ; output pointer
  824         mov     r8, rdx         ; context
  825     %else
  826         sub     rsp, 6*8        ; windows binary interface
  827         mov     [rsp+4*8], rsi
  828         mov     [rsp+5*8], rdi
  829         mov     rdi, rcx        ; input pointer
  830         mov     [rsp+0*8], rdx  ; output pointer
  831     %endif
      ; the three stores below save rbx, rbp and r12; their trailing comments
      ; summarise where the three arguments live from this point on
  832         mov     [rsp+1*8], rbx  ; input pointer in rdi
  833         mov     [rsp+2*8], rbp  ; output pointer in [rsp]
  834         mov     [rsp+3*8], r12  ; context in r8
  835 %endif
  836 
      ; esi = byte stored directly after the 4*KS_LENGTH bytes of key schedule.
      ; It is compared with 10*16, 12*16 and 14*16 below and is also used as a
      ; byte offset into the schedule, i.e. it holds 16 * number of rounds.
  837     movzx   esi,byte[kptr+4*KS_LENGTH]
  838     lea     tptr, [rel dec_tab]
      ; bias kptr by -rofs (rofs is +128 with AES_REV_DKS, -128 without)
  839     sub     kptr, rofs
  840 
      ; load the input block as four 32-bit words; rdi is free for reuse after this
  841     mov     eax, [rdi+0*4]
  842     mov     ebx, [rdi+1*4]
  843     mov     ecx, [rdi+2*4]
  844     mov     edx, [rdi+3*4]
  845 
      ; rdi = address (biased by -rofs) of the block used for the first key
      ; addition.  With AES_REV_DKS that is the lowest block of the schedule and
      ; kptr is advanced by 16 * rounds for ik_ref; without it the block lies
      ; 16 * rounds bytes above the start and kptr stays at the biased start.
  846 %ifdef      AES_REV_DKS
  847     mov     rdi, kptr
  848     lea     kptr,[kptr+rsi]
  849 %else
  850     lea     rdi,[kptr+rsi]
  851 %endif
  852 
      ; first key addition
  853     xor     eax, [rdi+rofs]
  854     xor     ebx, [rdi+rofs+4]
  855     xor     ecx, [rdi+rofs+8]
  856     xor     edx, [rdi+rofs+12]
  857 
      ; dispatch on the round count; any other value returns -1
  858     cmp     esi, 10*16
  859     je      .3
  860     cmp     esi, 12*16
  861     je      .2
  862     cmp     esi, 14*16
  863     je      .1
  864     mov     rax, -1
  865     jmp     .4
  866 
      ; .1 = entry for 14 rounds, .2 = entry for 12 rounds, .3 = entry for 10
      ; rounds; execution falls through the remaining rounds to il_rnd
  867 .1: ii_rnd  r9, r10, r11, r12, 13
  868     ii_rnd  r9, r10, r11, r12, 12
  869 .2: ii_rnd  r9, r10, r11, r12, 11
  870     ii_rnd  r9, r10, r11, r12, 10
  871 .3: ii_rnd  r9, r10, r11, r12, 9
  872     ii_rnd  r9, r10, r11, r12, 8
  873     ii_rnd  r9, r10, r11, r12, 7
  874     ii_rnd  r9, r10, r11, r12, 6
  875     ii_rnd  r9, r10, r11, r12, 5
  876     ii_rnd  r9, r10, r11, r12, 4
  877     ii_rnd  r9, r10, r11, r12, 3
  878     ii_rnd  r9, r10, r11, r12, 2
  879     ii_rnd  r9, r10, r11, r12, 1
  880     il_rnd  r9, r10, r11, r12, 0
  881 
      ; il_rnd leaves the result in r9d..r12d; reload the saved output pointer
      ; and store the block, then return 0
  882     mov     rbx, [rsp]
  883     mov     [rbx], r9d
  884     mov     [rbx+4], r10d
  885     mov     [rbx+8], r11d
  886     mov     [rbx+12], r12d
  887     xor     rax, rax
      ; common exit: restore the saved registers and release the stack frame
      ; (this also undoes the change il_rnd may have made to tptr = rbp)
  888 .4: mov     rbx, [rsp+1*8]
  889     mov     rbp, [rsp+2*8]
  890     mov     r12, [rsp+3*8]
  891 %ifdef __GNUC__
  892     add     rsp, 4*8
  893     ret
  894 %else
  895         mov     rsi, [rsp+4*8]
  896         mov     rdi, [rsp+5*8]
  897     %ifdef _SEH_
  898         add     rsp, 7*8
  899         ret
  900     endproc_frame
  901     %else
  902         add     rsp, 6*8
  903         ret
  904     %endif
  905 %endif
  906 
  907 %endif
  908 
  909 %ifidn __OUTPUT_FORMAT__,elf
      ; For each of the ELF output formats (elf, elf32, elf64) declare an empty
      ; .note.GNU-stack section flagged noexec.  GNU toolchains conventionally
      ; read this marker as: this object does not need an executable stack.
      ; Only one of the three conditions can match for a given assembly run.
  910 section .note.GNU-stack noalloc noexec nowrite progbits
  911 %endif
  912 %ifidn __OUTPUT_FORMAT__,elf32
  913 section .note.GNU-stack noalloc noexec nowrite progbits
  914 %endif
  915 %ifidn __OUTPUT_FORMAT__,elf64
  916 section .note.GNU-stack noalloc noexec nowrite progbits
  917 %endif
  918