"Fossies" - the Fresh Open Source Software Archive

Member "src/Crypto/SerpentFast_simd.cpp" (10 Oct 2018, 13124 Bytes) of package /windows/misc/VeraCrypt_1.23-Hotfix-2_Source.zip:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "SerpentFast_simd.cpp", see the Fossies "Dox" file reference documentation.

    1 /*
    2 * Serpent (SIMD)
    3 * (C) 2009,2013 Jack Lloyd
    4 *
    5 * Botan is released under the Simplified BSD License (see license.txt)
    6 */
    7 
    8 #include "SerpentFast.h"
    9 #include "SerpentFast_sbox.h"
   10 #if !defined(_UEFI)
   11 #include <memory.h>
   12 #include <stdlib.h>
   13 #endif
   14 #include "cpu.h"
   15 #include "misc.h"
   16 
   17 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
   18 
   19 /**
   20 * This class is not a general purpose SIMD type, and only offers
   21 * instructions needed for evaluation of specific crypto primitives.
   22 * For example it does not currently have equality operators of any
   23 * kind.
   24 */
   25 class SIMD_4x32
   26 {
   27 public:
   28 
   29     SIMD_4x32() // zero initialized
   30         {
   31         ::memset(&m_reg, 0, sizeof(m_reg));
   32         }
   33 
   34     explicit SIMD_4x32(const unsigned __int32 B[4])
   35         {
   36         m_reg = _mm_loadu_si128(reinterpret_cast<const __m128i*>(B));
   37         }
   38 
   39     SIMD_4x32(unsigned __int32 B0, unsigned __int32 B1, unsigned __int32 B2, unsigned __int32 B3)
   40         {
   41         m_reg = _mm_set_epi32(B0, B1, B2, B3);
   42         }
   43 
   44     explicit SIMD_4x32(unsigned __int32 B)
   45         {
   46         m_reg = _mm_set1_epi32(B);
   47         }
   48 
   49     static SIMD_4x32 load_le(const void* in)
   50         {
   51         return SIMD_4x32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(in)));
   52         }
   53 
   54     static SIMD_4x32 load_be(const void* in)
   55         {
   56         return load_le(in).bswap();
   57         }
   58 
   59     void store_le(unsigned __int8 out[]) const
   60         {
   61         _mm_storeu_si128(reinterpret_cast<__m128i*>(out), m_reg);
   62         }
   63 
   64     void store_be(unsigned __int8 out[]) const
   65         {
   66         bswap().store_le(out);
   67         }
   68 
   69     void rotate_left(size_t rot)
   70         {
   71         m_reg = _mm_or_si128(_mm_slli_epi32(m_reg, static_cast<int>(rot)),
   72                             _mm_srli_epi32(m_reg, static_cast<int>(32-rot)));
   73 
   74         }
   75 
   76     void rotate_right(size_t rot)
   77         {
   78         rotate_left(32 - rot);
   79         }
   80 
   81     void operator+=(const SIMD_4x32& other)
   82         {
   83         m_reg = _mm_add_epi32(m_reg, other.m_reg);
   84         }
   85 
   86     SIMD_4x32 operator+(const SIMD_4x32& other) const
   87         {
   88         return SIMD_4x32(_mm_add_epi32(m_reg, other.m_reg));
   89         }
   90 
   91     void operator-=(const SIMD_4x32& other)
   92         {
   93         m_reg = _mm_sub_epi32(m_reg, other.m_reg);
   94         }
   95 
   96     SIMD_4x32 operator-(const SIMD_4x32& other) const
   97         {
   98         return SIMD_4x32(_mm_sub_epi32(m_reg, other.m_reg));
   99         }
  100 
  101     void operator^=(const SIMD_4x32& other)
  102         {
  103         m_reg = _mm_xor_si128(m_reg, other.m_reg);
  104         }
  105 
  106     SIMD_4x32 operator^(const SIMD_4x32& other) const
  107         {
  108         return SIMD_4x32(_mm_xor_si128(m_reg, other.m_reg));
  109         }
  110 
  111     void operator|=(const SIMD_4x32& other)
  112         {
  113         m_reg = _mm_or_si128(m_reg, other.m_reg);
  114         }
  115 
  116     SIMD_4x32 operator&(const SIMD_4x32& other)
  117         {
  118         return SIMD_4x32(_mm_and_si128(m_reg, other.m_reg));
  119         }
  120 
  121     void operator&=(const SIMD_4x32& other)
  122         {
  123         m_reg = _mm_and_si128(m_reg, other.m_reg);
  124         }
  125 
  126     SIMD_4x32 operator<<(size_t shift) const
  127         {
  128         return SIMD_4x32(_mm_slli_epi32(m_reg, static_cast<int>(shift)));
  129         }
  130 
  131     SIMD_4x32 operator>>(size_t shift) const
  132         {
  133         return SIMD_4x32(_mm_srli_epi32(m_reg, static_cast<int>(shift)));
  134         }
  135 
  136     SIMD_4x32 operator~() const
  137         {
  138         return SIMD_4x32(_mm_xor_si128(m_reg, _mm_set1_epi32(0xFFFFFFFF)));
  139         }
  140 
  141     // (~reg) & other
  142     SIMD_4x32 andc(const SIMD_4x32& other)
  143         {
  144         return SIMD_4x32(_mm_andnot_si128(m_reg, other.m_reg));
  145         }
  146 
  147     SIMD_4x32 bswap() const
  148         {
  149         __m128i T = m_reg;
  150 
  151         T = _mm_shufflehi_epi16(T, _MM_SHUFFLE(2, 3, 0, 1));
  152         T = _mm_shufflelo_epi16(T, _MM_SHUFFLE(2, 3, 0, 1));
  153 
  154         return SIMD_4x32(_mm_or_si128(_mm_srli_epi16(T, 8),
  155                                     _mm_slli_epi16(T, 8)));
  156         }
  157 
  158     static void transpose(SIMD_4x32& B0, SIMD_4x32& B1,
  159                         SIMD_4x32& B2, SIMD_4x32& B3)
  160         {
  161         __m128i T0 = _mm_unpacklo_epi32(B0.m_reg, B1.m_reg);
  162         __m128i T1 = _mm_unpacklo_epi32(B2.m_reg, B3.m_reg);
  163         __m128i T2 = _mm_unpackhi_epi32(B0.m_reg, B1.m_reg);
  164         __m128i T3 = _mm_unpackhi_epi32(B2.m_reg, B3.m_reg);
  165         B0.m_reg = _mm_unpacklo_epi64(T0, T1);
  166         B1.m_reg = _mm_unpackhi_epi64(T0, T1);
  167         B2.m_reg = _mm_unpacklo_epi64(T2, T3);
  168         B3.m_reg = _mm_unpackhi_epi64(T2, T3);
  169         }
  170 
  171 private:
  172 
  173     explicit SIMD_4x32(__m128i in) { m_reg = in; }
  174 
  175     __m128i m_reg;
  176 
  177 };
  178 
  179 typedef SIMD_4x32 SIMD_32;
  180 
  181 #define key_xor(round, B0, B1, B2, B3)                             \
  182    do {                                                            \
  183       B0 ^= SIMD_32(round_key[4*round  ]);                       \
  184       B1 ^= SIMD_32(round_key[4*round+1]);                       \
  185       B2 ^= SIMD_32(round_key[4*round+2]);                       \
  186       B3 ^= SIMD_32(round_key[4*round+3]);                       \
  187    } while(0);
  188 
  189 /*
  190 * Serpent's linear transformations
  191 */
  192 #define transform(B0, B1, B2, B3)                                  \
  193    do {                                                            \
  194       B0.rotate_left(13);                                          \
  195       B2.rotate_left(3);                                           \
  196       B1 ^= B0 ^ B2;                                               \
  197       B3 ^= B2 ^ (B0 << 3);                                        \
  198       B1.rotate_left(1);                                           \
  199       B3.rotate_left(7);                                           \
  200       B0 ^= B1 ^ B3;                                               \
  201       B2 ^= B3 ^ (B1 << 7);                                        \
  202       B0.rotate_left(5);                                           \
  203       B2.rotate_left(22);                                          \
  204    } while(0);
  205 
  206 #define i_transform(B0, B1, B2, B3)                                \
  207    do {                                                            \
  208       B2.rotate_right(22);                                         \
  209       B0.rotate_right(5);                                          \
  210       B2 ^= B3 ^ (B1 << 7);                                        \
  211       B0 ^= B1 ^ B3;                                               \
  212       B3.rotate_right(7);                                          \
  213       B1.rotate_right(1);                                          \
  214       B3 ^= B2 ^ (B0 << 3);                                        \
  215       B1 ^= B0 ^ B2;                                               \
  216       B2.rotate_right(3);                                          \
  217       B0.rotate_right(13);                                         \
  218    } while(0);
  219 
  220 
  221 #if (!defined (DEBUG) || !defined (TC_WINDOWS_DRIVER))
  222 /*
  223 * SIMD Serpent Encryption of 4 blocks in parallel
  224 */
  225 extern "C" void serpent_simd_encrypt_blocks_4(const unsigned __int8 in[], unsigned __int8 out[], unsigned __int32* round_key)
  226 {
  227    SIMD_32 B0 = SIMD_32::load_le(in);
  228    SIMD_32 B1 = SIMD_32::load_le(in + 16);
  229    SIMD_32 B2 = SIMD_32::load_le(in + 32);
  230    SIMD_32 B3 = SIMD_32::load_le(in + 48);
  231 
  232    SIMD_32::transpose(B0, B1, B2, B3);
  233 
  234    key_xor( 0,B0,B1,B2,B3); SBoxE1(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  235    key_xor( 1,B0,B1,B2,B3); SBoxE2(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  236    key_xor( 2,B0,B1,B2,B3); SBoxE3(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  237    key_xor( 3,B0,B1,B2,B3); SBoxE4(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  238    key_xor( 4,B0,B1,B2,B3); SBoxE5(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  239    key_xor( 5,B0,B1,B2,B3); SBoxE6(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  240    key_xor( 6,B0,B1,B2,B3); SBoxE7(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  241    key_xor( 7,B0,B1,B2,B3); SBoxE8(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  242 
  243    key_xor( 8,B0,B1,B2,B3); SBoxE1(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  244    key_xor( 9,B0,B1,B2,B3); SBoxE2(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  245    key_xor(10,B0,B1,B2,B3); SBoxE3(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  246    key_xor(11,B0,B1,B2,B3); SBoxE4(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  247    key_xor(12,B0,B1,B2,B3); SBoxE5(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  248    key_xor(13,B0,B1,B2,B3); SBoxE6(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  249    key_xor(14,B0,B1,B2,B3); SBoxE7(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  250    key_xor(15,B0,B1,B2,B3); SBoxE8(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  251 
  252    key_xor(16,B0,B1,B2,B3); SBoxE1(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  253    key_xor(17,B0,B1,B2,B3); SBoxE2(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  254    key_xor(18,B0,B1,B2,B3); SBoxE3(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  255    key_xor(19,B0,B1,B2,B3); SBoxE4(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  256    key_xor(20,B0,B1,B2,B3); SBoxE5(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  257    key_xor(21,B0,B1,B2,B3); SBoxE6(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  258    key_xor(22,B0,B1,B2,B3); SBoxE7(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  259    key_xor(23,B0,B1,B2,B3); SBoxE8(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  260 
  261    key_xor(24,B0,B1,B2,B3); SBoxE1(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  262    key_xor(25,B0,B1,B2,B3); SBoxE2(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  263    key_xor(26,B0,B1,B2,B3); SBoxE3(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  264    key_xor(27,B0,B1,B2,B3); SBoxE4(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  265    key_xor(28,B0,B1,B2,B3); SBoxE5(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  266    key_xor(29,B0,B1,B2,B3); SBoxE6(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  267    key_xor(30,B0,B1,B2,B3); SBoxE7(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
  268    key_xor(31,B0,B1,B2,B3); SBoxE8(SIMD_32,B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3);
  269 
  270    SIMD_32::transpose(B0, B1, B2, B3);
  271 
  272    B0.store_le(out);
  273    B1.store_le(out + 16);
  274    B2.store_le(out + 32);
  275    B3.store_le(out + 48);
  276 }
  277 
  278 /*
  279 * SIMD Serpent Decryption of 4 blocks in parallel
  280 */
  281 extern "C" void serpent_simd_decrypt_blocks_4(const unsigned __int8 in[], unsigned __int8 out[], unsigned __int32* round_key)
  282 {
  283    SIMD_32 B0 = SIMD_32::load_le(in);
  284    SIMD_32 B1 = SIMD_32::load_le(in + 16);
  285    SIMD_32 B2 = SIMD_32::load_le(in + 32);
  286    SIMD_32 B3 = SIMD_32::load_le(in + 48);
  287 
  288    SIMD_32::transpose(B0, B1, B2, B3);
  289 
  290    key_xor(32,B0,B1,B2,B3);  SBoxD8(SIMD_32,B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3);
  291    i_transform(B0,B1,B2,B3); SBoxD7(SIMD_32,B0,B1,B2,B3); key_xor(30,B0,B1,B2,B3);
  292    i_transform(B0,B1,B2,B3); SBoxD6(SIMD_32,B0,B1,B2,B3); key_xor(29,B0,B1,B2,B3);
  293    i_transform(B0,B1,B2,B3); SBoxD5(SIMD_32,B0,B1,B2,B3); key_xor(28,B0,B1,B2,B3);
  294    i_transform(B0,B1,B2,B3); SBoxD4(SIMD_32,B0,B1,B2,B3); key_xor(27,B0,B1,B2,B3);
  295    i_transform(B0,B1,B2,B3); SBoxD3(SIMD_32,B0,B1,B2,B3); key_xor(26,B0,B1,B2,B3);
  296    i_transform(B0,B1,B2,B3); SBoxD2(SIMD_32,B0,B1,B2,B3); key_xor(25,B0,B1,B2,B3);
  297    i_transform(B0,B1,B2,B3); SBoxD1(SIMD_32,B0,B1,B2,B3); key_xor(24,B0,B1,B2,B3);
  298 
  299    i_transform(B0,B1,B2,B3); SBoxD8(SIMD_32,B0,B1,B2,B3); key_xor(23,B0,B1,B2,B3);
  300    i_transform(B0,B1,B2,B3); SBoxD7(SIMD_32,B0,B1,B2,B3); key_xor(22,B0,B1,B2,B3);
  301    i_transform(B0,B1,B2,B3); SBoxD6(SIMD_32,B0,B1,B2,B3); key_xor(21,B0,B1,B2,B3);
  302    i_transform(B0,B1,B2,B3); SBoxD5(SIMD_32,B0,B1,B2,B3); key_xor(20,B0,B1,B2,B3);
  303    i_transform(B0,B1,B2,B3); SBoxD4(SIMD_32,B0,B1,B2,B3); key_xor(19,B0,B1,B2,B3);
  304    i_transform(B0,B1,B2,B3); SBoxD3(SIMD_32,B0,B1,B2,B3); key_xor(18,B0,B1,B2,B3);
  305    i_transform(B0,B1,B2,B3); SBoxD2(SIMD_32,B0,B1,B2,B3); key_xor(17,B0,B1,B2,B3);
  306    i_transform(B0,B1,B2,B3); SBoxD1(SIMD_32,B0,B1,B2,B3); key_xor(16,B0,B1,B2,B3);
  307 
  308    i_transform(B0,B1,B2,B3); SBoxD8(SIMD_32,B0,B1,B2,B3); key_xor(15,B0,B1,B2,B3);
  309    i_transform(B0,B1,B2,B3); SBoxD7(SIMD_32,B0,B1,B2,B3); key_xor(14,B0,B1,B2,B3);
  310    i_transform(B0,B1,B2,B3); SBoxD6(SIMD_32,B0,B1,B2,B3); key_xor(13,B0,B1,B2,B3);
  311    i_transform(B0,B1,B2,B3); SBoxD5(SIMD_32,B0,B1,B2,B3); key_xor(12,B0,B1,B2,B3);
  312    i_transform(B0,B1,B2,B3); SBoxD4(SIMD_32,B0,B1,B2,B3); key_xor(11,B0,B1,B2,B3);
  313    i_transform(B0,B1,B2,B3); SBoxD3(SIMD_32,B0,B1,B2,B3); key_xor(10,B0,B1,B2,B3);
  314    i_transform(B0,B1,B2,B3); SBoxD2(SIMD_32,B0,B1,B2,B3); key_xor( 9,B0,B1,B2,B3);
  315    i_transform(B0,B1,B2,B3); SBoxD1(SIMD_32,B0,B1,B2,B3); key_xor( 8,B0,B1,B2,B3);
  316 
  317    i_transform(B0,B1,B2,B3); SBoxD8(SIMD_32,B0,B1,B2,B3); key_xor( 7,B0,B1,B2,B3);
  318    i_transform(B0,B1,B2,B3); SBoxD7(SIMD_32,B0,B1,B2,B3); key_xor( 6,B0,B1,B2,B3);
  319    i_transform(B0,B1,B2,B3); SBoxD6(SIMD_32,B0,B1,B2,B3); key_xor( 5,B0,B1,B2,B3);
  320    i_transform(B0,B1,B2,B3); SBoxD5(SIMD_32,B0,B1,B2,B3); key_xor( 4,B0,B1,B2,B3);
  321    i_transform(B0,B1,B2,B3); SBoxD4(SIMD_32,B0,B1,B2,B3); key_xor( 3,B0,B1,B2,B3);
  322    i_transform(B0,B1,B2,B3); SBoxD3(SIMD_32,B0,B1,B2,B3); key_xor( 2,B0,B1,B2,B3);
  323    i_transform(B0,B1,B2,B3); SBoxD2(SIMD_32,B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3);
  324    i_transform(B0,B1,B2,B3); SBoxD1(SIMD_32,B0,B1,B2,B3); key_xor( 0,B0,B1,B2,B3);
  325 
  326    SIMD_32::transpose(B0, B1, B2, B3);
  327 
  328    B0.store_le(out);
  329    B1.store_le(out + 16);
  330    B2.store_le(out + 32);
  331    B3.store_le(out + 48);
  332 }
  333 #endif
  334 #undef key_xor
  335 #undef transform
  336 #undef i_transform
  337 
  338 #endif