"Fossies" - the Fresh Open Source Software Archive

Member "src/Common/GfMul.c" (10 Oct 2018, 23169 Bytes) of package /windows/misc/VeraCrypt_1.23-Hotfix-2_Source.zip:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "GfMul.c" see the Fossies "Dox" file reference documentation.

    1 /*
    2  ---------------------------------------------------------------------------
    3  Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
    4 
    5  LICENSE TERMS
    6 
    7  The free distribution and use of this software is allowed (with or without
    8  changes) provided that:
    9 
   10   1. source code distributions include the above copyright notice, this
   11      list of conditions and the following disclaimer;
   12 
   13   2. binary distributions include the above copyright notice, this list
   14      of conditions and the following disclaimer in their documentation;
   15 
   16   3. the name of the copyright holder is not used to endorse products
   17      built using this software without specific written permission.
   18 
   19  DISCLAIMER
   20 
   21  This software is provided 'as is' with no explicit or implied warranties
   22  in respect of its properties, including, but not limited to, correctness
   23  and/or fitness for purpose.
   24  ---------------------------------------------------------------------------
   25  Issue Date: 31/01/2004
   26 
   27  My thanks to John Viega and David McGrew for their support in developing
   28  this code and to David for testing it on a big-endian system.
   29 */
   30 
   31 /*
   32  ---------------------------------------------------------------------------
   33  Portions Copyright (c) 2005 TrueCrypt Developers Association
   34 
   35  Changes:
   36 
   37    - Added multiplication in the finite field GF(2^128) optimized for
   38      cases involving a 64-bit operand.
   39 
   40    - Added multiplication in the finite field GF(2^64).
   41 
   42    - Added MSB-first mode.
   43 
   44    - Added basic test algorithms.
   45 
   46    - Removed GCM.
   47  ---------------------------------------------------------------------------
   48 */
   49 
   50 #include <memory.h>
   51 #include <stdlib.h>
   52 #include "GfMul.h"
   53 #include "Tcdefs.h"
   54 #include "Common/Endian.h"
   55 
   56 /* BUFFER_ALIGN32 or BUFFER_ALIGN64 must be defined at this point to    */
   57 /* enable faster operation by taking advantage of memory aligned values */
   58 /* NOTE: the BUFFER_ALIGN64 option has not been tested extensively      */
   59 
   60 #define BUFFER_ALIGN32
   61 #define UNROLL_LOOPS    /* define to unroll some loops      */
   62 #define IN_LINES        /* define to use inline functions   */
   63                         /* in place of macros               */
   64 
   65 #define mode(x)         GM_##x
   66 
   67 #if defined(__cplusplus)
   68 extern "C"
   69 {
   70 #endif
   71 
   72 typedef unsigned __int32 mode(32t);
   73 typedef uint64 mode(64t);
   74 
   75 #define BRG_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
   76 #define BRG_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
   77 
   78 #if BYTE_ORDER == LITTLE_ENDIAN
   79 #  define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN
   80 #endif
   81 
   82 #if BYTE_ORDER == BIG_ENDIAN
   83 #  define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN
   84 #endif
   85 
   86 #ifdef _MSC_VER
   87 #pragma intrinsic(memcpy)
   88 #define in_line __inline
   89 #else
   90 #define in_line
   91 #endif
   92 
   93 #if 0 && defined(_MSC_VER)
   94 #define rotl32 _lrotl
   95 #define rotr32 _lrotr
   96 #else
   97 #define rotl32(x,n)   (((x) << n) | ((x) >> (32 - n)))
   98 #define rotr32(x,n)   (((x) >> n) | ((x) << (32 - n)))
   99 #endif
  100 
  101 #if !defined(bswap_32)
  102 #define bswap_32(x) ((rotr32((x), 24) & 0x00ff00ff) | (rotr32((x), 8) & 0xff00ff00))
  103 #endif
  104 
  105 #if (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN)
  106 #define SWAP_BYTES
  107 #else
  108 #undef  SWAP_BYTES
  109 #endif
  110 
  111 #if defined(SWAP_BYTES)
  112 
  113 #if defined ( IN_LINES )
  114 
  115 in_line void bsw_32(void * p, unsigned int n)
  116 {   unsigned int i = n;
  117     while(i--)
  118         ((mode(32t)*)p)[i] = bswap_32(((mode(32t)*)p)[i]);
  119 }
  120 
  121 #else
  122 
  123 #define bsw_32(p,n) \
  124     { int _i = (n); while(_i--) ((mode(32t)*)p)[_i] = bswap_32(((mode(32t)*)p)[_i]); }
  125 
  126 #endif
  127 
  128 #else
  129 #define bsw_32(p,n)
  130 #endif
  131 
  132 /* These values are used to detect long word alignment in order */
  133 /* to speed up some GCM buffer operations. This facility may    */
  134 /* not work on some machines                                    */
  135 
  136 #define lp08(x)      ((unsigned char*)(x))
  137 #define lp32(x)      ((mode(32t)*)(x))
  138 #define lp64(x)      ((mode(64t)*)(x))
  139 
  140 #define A32_MASK     3
  141 #define A64_MASK     7
  142 #define aligned32(x) (!(((mode(32t))(x)) & A32_MASK))
  143 #define aligned64(x) (!(((mode(32t))(x)) & A64_MASK))
  144 
  145 #if defined( BUFFER_ALIGN32 )
  146 
  147 #define ADR_MASK    A32_MASK
  148 #define aligned     aligned32
  149 #define lp          lp32
  150 #define lp_inc      4
  151 
  152 #if defined( IN_LINES )
  153 
  154 in_line void move_block_aligned( void *p, const void *q)
  155 {
  156     lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1],
  157     lp32(p)[2] = lp32(q)[2], lp32(p)[3] = lp32(q)[3];
  158 }
  159 
  160 in_line void move_block_aligned64( void *p, const void *q)
  161 {
  162     lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1];
  163 }
  164 
  165 in_line void xor_block_aligned( void *p, const void *q)
  166 {
  167     lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1],
  168     lp32(p)[2] ^= lp32(q)[2], lp32(p)[3] ^= lp32(q)[3];
  169 }
  170 
  171 in_line void xor_block_aligned64( void *p, const void *q)
  172 {
  173     lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1];
  174 }
  175 
  176 #else
  177 
  178 #define move_block_aligned(p,q) \
  179     lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1], \
  180     lp32(p)[2] = lp32(q)[2], lp32(p)[3] = lp32(q)[3]
  181 
  182 #define xor_block_aligned(p,q) \
  183     lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1], \
  184     lp32(p)[2] ^= lp32(q)[2], lp32(p)[3] ^= lp32(q)[3]
  185 
  186 #endif
  187 
  188 #elif defined( BUFFER_ALIGN64 )
  189 
  190 #define ADR_MASK    A64_MASK
  191 #define aligned     aligned64
  192 #define lp          lp64
  193 #define lp_inc      8
  194 
  195 #define move_block_aligned(p,q) \
  196     lp64(p)[0] = lp64(q)[0], lp64(p)[1] = lp64(q)[1]
  197 
  198 #define xor_block_aligned(p,q) \
  199     lp64(p)[0] ^= lp64(q)[0], lp64(p)[1] ^= lp64(q)[1]
  200 
  201 #else
  202 #define aligned(x) 0
  203 #endif
  204 
  205 #define move_block(p,q) memcpy((p), (q), BLOCK_LEN)
  206 
  207 #define xor_block(p,q) \
  208     lp08(p)[ 0] ^= lp08(q)[ 0], lp08(p)[ 1] ^= lp08(q)[ 1], \
  209     lp08(p)[ 2] ^= lp08(q)[ 2], lp08(p)[ 3] ^= lp08(q)[ 3], \
  210     lp08(p)[ 4] ^= lp08(q)[ 4], lp08(p)[ 5] ^= lp08(q)[ 5], \
  211     lp08(p)[ 6] ^= lp08(q)[ 6], lp08(p)[ 7] ^= lp08(q)[ 7], \
  212     lp08(p)[ 8] ^= lp08(q)[ 8], lp08(p)[ 9] ^= lp08(q)[ 9], \
  213     lp08(p)[10] ^= lp08(q)[10], lp08(p)[11] ^= lp08(q)[11], \
  214     lp08(p)[12] ^= lp08(q)[12], lp08(p)[13] ^= lp08(q)[13], \
  215     lp08(p)[14] ^= lp08(q)[14], lp08(p)[15] ^= lp08(q)[15]
  216 
  217 
  218 #define gf_dat(q) {\
  219     q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
  220     q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
  221     q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
  222     q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
  223     q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
  224     q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
  225     q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
  226     q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
  227     q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
  228     q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
  229     q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
  230     q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
  231     q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
  232     q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
  233     q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
  234     q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
  235     q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
  236     q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
  237     q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
  238     q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
  239     q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
  240     q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
  241     q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
  242     q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
  243     q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
  244     q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
  245     q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
  246     q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
  247     q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
  248     q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
  249     q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
  250     q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) }
  251 
  252 /* given the value i in 0..255 as the byte overflow when a field    */
  253 /* element in GHASH is multiplied by x^8, this function will return */
  254 /* the values that are generated in the lo 16-bit word of the field */
  255 /* value by applying the modular polynomial. The values lo_byte and */
  256 /* hi_byte are returned via the macro xp_fun(lo_byte, hi_byte) so   */
  257 /* that the values can be assembled into memory as required by a    */
  258 /* suitable definition of this macro operating on the table above   */
  259 
  260 #define xp(i) xp_fun( \
  261     (i & 0x80 ? 0xe1 : 0) ^ (i & 0x40 ? 0x70 : 0) ^ \
  262     (i & 0x20 ? 0x38 : 0) ^ (i & 0x10 ? 0x1c : 0) ^ \
  263     (i & 0x08 ? 0x0e : 0) ^ (i & 0x04 ? 0x07 : 0) ^ \
  264     (i & 0x02 ? 0x03 : 0) ^ (i & 0x01 ? 0x01 : 0),  \
  265     (i & 0x80 ? 0x00 : 0) ^ (i & 0x40 ? 0x80 : 0) ^ \
  266     (i & 0x20 ? 0x40 : 0) ^ (i & 0x10 ? 0x20 : 0) ^ \
  267     (i & 0x08 ? 0x10 : 0) ^ (i & 0x04 ? 0x08 : 0) ^ \
  268     (i & 0x02 ? 0x84 : 0) ^ (i & 0x01 ? 0xc2 : 0) )
  269 
  270 #define xp64(i) xp_fun( \
  271     (i & 0x80 ? 0xd8 : 0) ^ (i & 0x40 ? 0x6c : 0) ^ \
  272     (i & 0x20 ? 0x36 : 0) ^ (i & 0x10 ? 0x1b : 0) ^ \
  273     (i & 0x08 ? 0x0d : 0) ^ (i & 0x04 ? 0x06 : 0) ^ \
  274     (i & 0x02 ? 0x03 : 0) ^ (i & 0x01 ? 0x01 : 0),  \
  275     (i & 0x80 ? 0x00 : 0) ^ (i & 0x40 ? 0x00 : 0) ^ \
  276     (i & 0x20 ? 0x00 : 0) ^ (i & 0x10 ? 0x00 : 0) ^ \
  277     (i & 0x08 ? 0x80 : 0) ^ (i & 0x04 ? 0xc0 : 0) ^ \
  278     (i & 0x02 ? 0x60 : 0) ^ (i & 0x01 ? 0xb0 : 0) )
  279 
  280 static mode(32t) gf_poly[2] = { 0, 0xe1000000 };
  281 static mode(32t) gf_poly64[2] = { 0, 0xd8000000 };
  282 
  283 /* Multiply of a GF128 field element by x.   The field element  */
  284 /* is held in an array of bytes in which field bits 8n..8n + 7  */
  285 /* are held in byte[n], with lower indexed bits placed in the   */
  286 /* more numerically significant bit positions in bytes.         */
  287 
  288 /* This function multiples a field element x, in the polynomial */
  289 /* field representation. It uses 32-bit word operations to gain */
  290 /* speed but compensates for machine endianess and hence works  */
  291 /* correctly on both styles of machine                          */
  292 
/* Multiply a GF(2^128) field element by x, in place.  The element is
   shifted one bit towards the numerically-lower bit positions and, if
   field bit 127 was set, the field polynomial (gf_poly) is xored in.
   The surrounding bsw_32 calls compensate for machine endianness so
   the word-shift arithmetic works on either style of machine. */
in_line void mul_x(mode(32t) x[4])
{   mode(32t)   t;

    bsw_32(x, 4);

    /* at this point the field element bits 0..127 are set out  */
    /* as follows in 32-bit words (where the most significant   */
    /* (ms) numeric bits are to the left)                       */
    /*                                                          */
    /*            x[0]      x[1]      x[2]      x[3]            */
    /*          ms    ls  ms    ls  ms    ls  ms     ls         */
    /* field:   0 ... 31  32 .. 63  64 .. 95  96 .. 127         */

    t = gf_poly[x[3] & 1];          /* bit 127 of the element   */
    x[3] = (x[3] >> 1) | (x[2] << 31);  /* shift bits up by one */
    x[2] = (x[2] >> 1) | (x[1] << 31);  /* position             */
    x[1] = (x[1] >> 1) | (x[0] << 31);  /* if bit 127 was 1 xor */
    x[0] = (x[0] >> 1) ^ t;             /* in the field poly    */
    bsw_32(x, 4);
}
  313 
/* Multiply a GF(2^64) field element by x, in place.  Analogous to
   mul_x above but for a two-word element and the GF(2^64) polynomial
   (gf_poly64). */
in_line void mul_x64(mode(32t) x[2])
{   mode(32t)   t;

    bsw_32(x, 2);

    /* at this point the field element bits 0..63 are set out   */
    /* as follows in 32-bit words (where the most significant   */
    /* (ms) numeric bits are to the left)                       */
    /*                                                          */
    /*            x[0]      x[1]                                */
    /*          ms    ls  ms    ls                              */
    /* field:   0 ... 31  32 .. 63                              */

    t = gf_poly64[x[1] & 1];          /* bit 63 of the element    */
                                        /* shift bits up by one */
                                        /* position             */
    x[1] = (x[1] >> 1) | (x[0] << 31);  /* if bit 63 was 1 xor  */
    x[0] = (x[0] >> 1) ^ t;             /* in the field poly    */
    bsw_32(x, 2);
}
  334 
  335 /* Multiply of a GF128 field element by x^8 using 32-bit words  */
  336 /* for speed - machine endianess matters here                   */
  337 
  338 #if (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN)
  339 
  340 #define xp_fun(x,y)    ((mode(32t))(x)) | (((mode(32t))(y)) << 8)
  341 static const unsigned __int16 gft_le[256] = gf_dat(xp);
  342 static const unsigned __int16 gft_le64[256] = gf_dat(xp64);
  343 
/* Multiply a GF(2^128) element by x^8: shift the element one byte and
   fold the overflow byte back in via the precomputed gft_le table. */
in_line void mul_lex8(mode(32t) x[4])   /* multiply with long words  */
{   mode(32t)   t = (x[3] >> 24);       /* in little endian format  */
    x[3] = (x[3] << 8) | (x[2] >> 24);
    x[2] = (x[2] << 8) | (x[1] >> 24);
    x[1] = (x[1] << 8) | (x[0] >> 24);
    x[0] = (x[0] << 8) ^ gft_le[t];
}
  351 
/* Multiply a GF(2^64) element by x^8: shift the element one byte and
   fold the overflow byte back in via the precomputed gft_le64 table. */
in_line void mul_lex8_64(mode(32t) x[2])   /* multiply with long words  */
{   mode(32t)   t = (x[1] >> 24);       /* in little endian format  */
    x[1] = (x[1] << 8) | (x[0] >> 24);
    x[0] = (x[0] << 8) ^ gft_le64[t];
}
  357 
  358 #endif
  359 
  360 #if 1 || (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN)
  361 
  362 #undef  xp_fun
  363 #define xp_fun(x,y)    ((mode(32t))(y)) | (((mode(32t))(x)) << 8)
  364 static const unsigned __int16 gft_be[256] = gf_dat(xp);
  365 static const unsigned __int16 gft_be64[256] = gf_dat(xp64);
  366 
/* Multiply a GF(2^128) element by x^8, big-endian word layout: shift
   one byte and fold the overflow byte in via the gft_be table (table
   value positioned in bits 16..31 to match the word ordering). */
in_line void mul_bex8(mode(32t) x[4])   /* multiply with long words  */
{   mode(32t)   t = (x[3] & 0xff);      /* in big endian format     */
    x[3] = (x[3] >> 8) | (x[2] << 24);
    x[2] = (x[2] >> 8) | (x[1] << 24);
    x[1] = (x[1] >> 8) | (x[0] << 24);
    x[0] = (x[0] >> 8) ^ (((mode(32t))gft_be[t]) << 16);
}
  374 
/* Multiply a GF(2^64) element by x^8, big-endian word layout, using
   the gft_be64 overflow table. */
in_line void mul_bex8_64(mode(32t) x[2])   /* multiply with long words  */
{   mode(32t)   t = (x[1] & 0xff);      /* in big endian format     */
    x[1] = (x[1] >> 8) | (x[0] << 24);
    x[0] = (x[0] >> 8) ^ (((mode(32t))gft_be64[t]) << 16);
}
  380 
  381 #endif
  382 
  383 /* hence choose the correct version for the machine endianess       */
  384 
  385 #if PLATFORM_BYTE_ORDER == BRG_BIG_ENDIAN
  386 #define mul_x8  mul_bex8
  387 #define mul_x8_64  mul_bex8_64
  388 #else
  389 #define mul_x8  mul_lex8
  390 #define mul_x8_64  mul_lex8_64
  391 #endif
  392 
  393 /* different versions of the general gf_mul function are provided   */
  394 /* here. Sadly none are very fast :-(                               */
  395 
/* General multiplication in GF(2^128), MSB-first bit order: a = a * b.
   b is first expanded into p[0..7] = b * x^0 .. b * x^7 (each step a
   one-bit shift with conditional polynomial reduction); the product is
   then accumulated one byte of a at a time, from a[15] (processed
   first) down to a[0], multiplying the accumulator by x^8 between
   bytes.  Both a and b are 16-byte blocks; a receives the result. */
void GfMul128 (void *a, const void* b)
{   mode(32t) r[CBLK_LEN >> 2], p[8][CBLK_LEN >> 2];
    int i;

    /* p[0] = b, in the word order used by the shift arithmetic */
    move_block_aligned(p[0], b);
    bsw_32(p[0], 4);
    for(i = 0; i < 7; ++i)
    {
        /* p[i+1] = p[i] * x (shift right one bit, reduce if the low
           bit of the last word -- field bit 127 -- was set) */
        p[i + 1][3] = (p[i][3] >> 1) | (p[i][2] << 31);
        p[i + 1][2] = (p[i][2] >> 1) | (p[i][1] << 31);
        p[i + 1][1] = (p[i][1] >> 1) | (p[i][0] << 31);
        p[i + 1][0] = (p[i][0] >> 1) ^ gf_poly[p[i][3] & 1];
    }

    memset(r, 0, CBLK_LEN);
    for(i = 0; i < 16; ++i)
    {
        if(i) mul_bex8(r);  /* order is always big endian here */

        /* add in the p[] rows selected by the bits of this byte of a */
        if(((unsigned char*)a)[15 - i] & 0x80)
            xor_block_aligned(r, p[0]);
        if(((unsigned char*)a)[15 - i] & 0x40)
            xor_block_aligned(r, p[1]);
        if(((unsigned char*)a)[15 - i] & 0x20)
            xor_block_aligned(r, p[2]);
        if(((unsigned char*)a)[15 - i] & 0x10)
            xor_block_aligned(r, p[3]);
        if(((unsigned char*)a)[15 - i] & 0x08)
            xor_block_aligned(r, p[4]);
        if(((unsigned char*)a)[15 - i] & 0x04)
            xor_block_aligned(r, p[5]);
        if(((unsigned char*)a)[15 - i] & 0x02)
            xor_block_aligned(r, p[6]);
        if(((unsigned char*)a)[15 - i] & 0x01)
            xor_block_aligned(r, p[7]);
    }
    /* convert back to memory byte order and write the result to a */
    bsw_32(r, 4);
    move_block_aligned(a, r);
}
  435 
  436 #if defined( UNROLL_LOOPS )
  437 
  438 #define xor_8k(i)   \
  439     xor_block_aligned(r, ctx->gf_t8k[i + i][a[i] & 15]); \
  440     xor_block_aligned(r, ctx->gf_t8k[i + i + 1][a[i] >> 4])
  441 
  442 
/* Table-driven multiplication in GF(2^128): a = a * h, where h is the
   fixed element compiled into ctx by compile_8k_table.  Each byte of a
   is split into two 4-bit nibbles; row [2*i] of gf_t8k covers the low
   nibble of a[i] and row [2*i + 1] the high nibble, so the product is
   just 32 table lookups xored together (unrolled via the xor_8k macro
   defined above). */
void GfMul128Tab (unsigned char a[CBLK_LEN], GfCtx8k *ctx)
{   unsigned __int32 r[CBLK_LEN >> 2];

    /* first pair initialises r; the rest accumulate into it */
    move_block_aligned(r, ctx->gf_t8k[0][a[0] & 15]);
    xor_block_aligned(r, ctx->gf_t8k[1][a[0] >> 4]);
                xor_8k( 1); xor_8k( 2); xor_8k( 3);
    xor_8k( 4); xor_8k( 5); xor_8k( 6); xor_8k( 7);
    xor_8k( 8); xor_8k( 9); xor_8k(10); xor_8k(11);
    xor_8k(12); xor_8k(13); xor_8k(14); xor_8k(15);
    move_block_aligned(a, r);
}
  454 
  455 #else
  456 
/* Table-driven multiplication in GF(2^128): a = a * h, where h is the
   fixed element compiled into ctx by compile_8k_table.  Loop version
   of the unrolled variant above: row [2*i] of gf_t8k covers the low
   nibble of a[i] and row [2*i + 1] the high nibble. */
void GfMul128Tab (unsigned char a[CBLK_LEN], GfCtx8k *ctx)
{   unsigned __int32 r[CBLK_LEN >> 2], *p;
    int i;

    /* byte 0 initialises r; remaining bytes accumulate into it */
    p = ctx->gf_t8k[0][a[0] & 15];
    memcpy(r, p, CBLK_LEN);
    p = ctx->gf_t8k[1][a[0] >> 4];
    xor_block_aligned(r, p);
    for(i = 1; i < CBLK_LEN; ++i)
    {
        xor_block_aligned(r, ctx->gf_t8k[i + i][a[i] & 15]);
        xor_block_aligned(r, ctx->gf_t8k[i + i + 1][a[i] >> 4]);
    }
    memcpy(a, r, CBLK_LEN);
}
  472 
  473 #endif
  474 
/* Compile the 8-kbyte lookup table used by GfMul128Tab for multiplying
   by the fixed field element a.  The table has 32 rows of 16 entries:
   row pairs (2i, 2i+1) correspond to the low/high nibble of byte i of
   the other operand.  Power-of-two entries (1,2,4,8) are built with
   repeated mul_x / mul_x8 shifts; the remaining entries of each row
   are formed by xoring the power-of-two entries together.  Entry 0 of
   every row stays all-zero from the initial memset. */
void compile_8k_table(unsigned __int8 *a, GfCtx8k *ctx)
{   int i, j, k;

    memset(ctx->gf_t8k, 0, 32 * 16 * 16);
    for(i = 0; i < 2 * CBLK_LEN; ++i)
    {
        if(i == 0)
        {
            /* seed: row 1 entry 8 holds a itself; entries 4,2,1 are
               successive multiplications by x */
            memcpy(ctx->gf_t8k[1][8], a, CBLK_LEN);
            for(j = 4; j > 0; j >>= 1)
            {
                memcpy(ctx->gf_t8k[1][j], ctx->gf_t8k[1][j + j], CBLK_LEN);
                mul_x(ctx->gf_t8k[1][j]);
            }
            /* row 0 continues the x-multiplication chain from row 1 */
            memcpy(ctx->gf_t8k[0][8], ctx->gf_t8k[1][1], CBLK_LEN);
            mul_x(ctx->gf_t8k[0][8]);
            for(j = 4; j > 0; j >>= 1)
            {
                memcpy(ctx->gf_t8k[0][j], ctx->gf_t8k[0][j + j], CBLK_LEN);
                mul_x(ctx->gf_t8k[0][j]);
            }
        }
        else if(i > 1)
            /* each later row pair is the row two back multiplied by x^8 */
            for(j = 8; j > 0; j >>= 1)
            {
                memcpy(ctx->gf_t8k[i][j], ctx->gf_t8k[i - 2][j], CBLK_LEN);
                mul_x8(ctx->gf_t8k[i][j]);
            }

        /* fill the non-power-of-two entries: for j = 2,4,8 set
           entry[j + k] = entry[j] ^ entry[k] for k = 1 .. j-1 */
        for(j = 2; j < 16; j += j)
        {
            mode(32t) *pj = ctx->gf_t8k[i][j];
            mode(32t) *pk = ctx->gf_t8k[i][1];
            mode(32t) *pl = ctx->gf_t8k[i][j + 1];

            for(k = 1; k < j; ++k)
            {
                *pl++ = pj[0] ^ *pk++;
                *pl++ = pj[1] ^ *pk++;
                *pl++ = pj[2] ^ *pk++;
                *pl++ = pj[3] ^ *pk++;
            }
        }
    }
}
  520 
  521 
/* Compile the lookup table for nibble-wise multiplication by the fixed
   GF(2^64) element a; same construction as compile_8k_table but with
   8-byte elements, mul_x64/mul_x8_64, and 2*CBLK_LEN8 (16) rows.
   NOTE(review): the combining loop at the bottom strides entries as
   four 32-bit words (16 bytes), so gf_t4k entries are assumed to be
   16 bytes with the upper 8 bytes zero from the memset -- confirm
   against the GfCtx4k64 declaration in the header. */
void compile_4k_table64(unsigned __int8 *a, GfCtx4k64 *ctx)
{   int i, j, k;

    memset(ctx->gf_t4k, 0, sizeof(ctx->gf_t4k));
    for(i = 0; i < 2 * CBLK_LEN8; ++i)
    {
        if(i == 0)
        {
            /* seed: row 1 entry 8 holds a; entries 4,2,1 are
               successive multiplications by x */
            memcpy(ctx->gf_t4k[1][8], a, CBLK_LEN8);
            for(j = 4; j > 0; j >>= 1)
            {
                memcpy(ctx->gf_t4k[1][j], ctx->gf_t4k[1][j + j], CBLK_LEN8);
                mul_x64(ctx->gf_t4k[1][j]);
            }
            /* row 0 continues the x-multiplication chain from row 1 */
            memcpy(ctx->gf_t4k[0][8], ctx->gf_t4k[1][1], CBLK_LEN8);
            mul_x64(ctx->gf_t4k[0][8]);
            for(j = 4; j > 0; j >>= 1)
            {
                memcpy(ctx->gf_t4k[0][j], ctx->gf_t4k[0][j + j], CBLK_LEN8);
                mul_x64(ctx->gf_t4k[0][j]);
            }
        }
        else if(i > 1)
            /* each later row pair is the row two back times x^8 */
            for(j = 8; j > 0; j >>= 1)
            {
                memcpy(ctx->gf_t4k[i][j], ctx->gf_t4k[i - 2][j], CBLK_LEN8);
                mul_x8_64(ctx->gf_t4k[i][j]);
            }

        /* fill the non-power-of-two entries:
           entry[j + k] = entry[j] ^ entry[k] for k = 1 .. j-1 */
        for(j = 2; j < 16; j += j)
        {
            mode(32t) *pj = ctx->gf_t4k[i][j];
            mode(32t) *pk = ctx->gf_t4k[i][1];
            mode(32t) *pl = ctx->gf_t4k[i][j + 1];

            for(k = 1; k < j; ++k)
            {
                *pl++ = pj[0] ^ *pk++;
                *pl++ = pj[1] ^ *pk++;
                *pl++ = pj[2] ^ *pk++;
                *pl++ = pj[3] ^ *pk++;
            }
        }
    }
}
  567 
  568 static int IsBitSet128 (unsigned int bit, unsigned __int8 *a)
  569 {
  570     return a[(127 - bit) / 8] & (0x80 >> ((127 - bit) % 8));
  571 }
  572 
  573 static int IsBitSet64 (unsigned int bit, unsigned __int8 *a)
  574 {
  575     return a[(63 - bit) / 8] & (0x80 >> ((63 - bit) % 8));
  576 }
  577 
  578 static void SetBit128 (unsigned int bit, unsigned __int8 *a)
  579 {
  580     a[(127 - bit) / 8] |= 0x80 >> ((127 - bit) % 8);
  581 }
  582 
  583 static void SetBit64 (unsigned int bit, unsigned __int8 *a)
  584 {
  585     a[(63 - bit) / 8] |= 0x80 >> ((63 - bit) % 8);
  586 }
  587 
  588 void MirrorBits128 (unsigned __int8 *a)
  589 {
  590     unsigned __int8 t[128 / 8];
  591     int i;
  592     memset (t,0,16);
  593     for (i = 0; i < 128; i++)
  594     {
  595         if (IsBitSet128(i, a))
  596             SetBit128 (127 - i, t);
  597     }
  598     memcpy (a, t, sizeof (t));
  599     burn (t,sizeof (t));
  600 }
  601 
  602 void MirrorBits64 (unsigned __int8 *a)
  603 {
  604     unsigned __int8 t[64 / 8];
  605     int i;
  606     memset (t,0,8);
  607     for (i = 0; i < 64; i++)
  608     {
  609         if (IsBitSet64(i, a))
  610             SetBit64 (63 - i, t);
  611     }
  612     memcpy (a, t, sizeof (t));
  613     burn (t,sizeof (t));
  614 }
  615 
  616 /* Allocate and initialize speed optimization table
  617    for multiplication by 64-bit operand in MSB-first mode */
/* Allocate and initialize speed optimization table
   for multiplication by 64-bit operand in MSB-first mode.
   a points to the 16-byte MSB-first operand.  Builds an LSB-first 8k
   table for the bit-mirrored operand, then converts its upper half
   into the MSB-first gf_t128 table in ctx by mirroring the nibble
   index and the bits of each entry.  Returns TRUE on success, FALSE
   if the temporary table cannot be allocated; all temporaries are
   wiped with burn() before being freed. */
int Gf128Tab64Init (unsigned __int8 *a, GfCtx *ctx)
{
    GfCtx8k *ctx8k;
    unsigned __int8 am[16];
    int i, j;

    ctx8k = (GfCtx8k *) TCalloc (sizeof (GfCtx8k));
    if (!ctx8k)
        return FALSE;

    /* build the LSB-first table for the bit-reversed operand */
    memcpy (am, a, 16);
    MirrorBits128 (am);
    compile_8k_table (am, ctx8k);

    /* Convert 8k LSB-first table to 4k MSB-first */
    for (i = 16; i < 32; i++)
    {
        for (j = 0; j < 16; j++)
        {
            /* jm = j with its 4 bits reversed */
            int jm = 0;
            jm |= (j & 0x1) << 3;
            jm |= (j & 0x2) << 1;
            jm |= (j & 0x4) >> 1;
            jm |= (j & 0x8) >> 3;

            /* rows are taken in reverse order (31-i) and each entry's
               bits are mirrored back to MSB-first */
            memcpy (&ctx->gf_t128[i-16][jm], (unsigned char *)&ctx8k->gf_t8k[31-i][j], 16);
            MirrorBits128 ((unsigned char *)&ctx->gf_t128[i-16][jm]);
        }
    }

    burn (ctx8k ,sizeof (*ctx8k));
    burn (am, sizeof (am));
    TCfree (ctx8k);
    return TRUE;
}
  653 
  654 
  655 #define xor_8kt64(i)   \
  656     xor_block_aligned(r, ctx->gf_t128[i + i][a[i] & 15]); \
  657     xor_block_aligned(r, ctx->gf_t128[i + i + 1][a[i] >> 4])
  658 
  659 /* Multiply a 128-bit number by a 64-bit number in the finite field GF(2^128) */
/* Multiply a 128-bit number by a 64-bit number in the finite field
   GF(2^128): p = a * h, where h is the 128-bit element compiled into
   ctx by Gf128Tab64Init and a is the 64-bit operand (8 bytes).  Each
   nibble of a selects one 16-byte table entry; the product is the xor
   of all selected entries (xor_8kt64 macro above). */
void Gf128MulBy64Tab (unsigned __int8 a[8], unsigned __int8 p[16], GfCtx *ctx)
{
    unsigned __int32 r[CBLK_LEN >> 2];

    /* byte 7 initialises the accumulator; the rest xor into it */
    move_block_aligned(r, ctx->gf_t128[7*2][a[7] & 15]);
    xor_block_aligned(r,  ctx->gf_t128[7*2+1][a[7] >> 4]);

    /* a 16-bit load of a[0..1] is nonzero iff either byte is nonzero,
       whatever the byte order; NOTE(review): this is a type-punned,
       possibly unaligned 2-byte read -- assumed acceptable for the
       compilers this file targets, verify if porting */
    if (*(unsigned __int16 *)a)
    {
        xor_8kt64(0);
        xor_8kt64(1);
    }
    if (a[2])
    {
        xor_8kt64(2);
    }
    /* bytes 3..6 are always processed */
    xor_8kt64(3);
    xor_8kt64(4);
    xor_8kt64(5);
    xor_8kt64(6);

    move_block_aligned(p, r);
}
  683 
  684 
  685 
  686 /* Basic algorithms for testing of optimized algorithms */
  687 
  688 static void xor128 (uint64 *a, uint64 *b)
  689 {
  690     *a++ ^= *b++;
  691     *a ^= *b;
  692 }
  693 
  694 static void shl128 (unsigned __int8 *a)
  695 {
  696     int i, x = 0, xx;
  697     for (i = 15; i >= 0; i--)
  698     {
  699         xx = (a[i] & 0x80) >> 7;
  700         a[i] = (char) ((a[i] << 1) | x);
  701         x = xx;
  702     }
  703 }
  704 
/* Reference (slow) shift-and-add multiplication in GF(2^128),
   MSB-first: p = a * b.  Used only to validate the optimized table
   routines.  la holds successive values a * x^i; whenever bit i of b
   is set, the current la is xored into p.  When the top bit of la is
   about to shift out, the result is reduced by xoring 0x87 into the
   low byte (the low byte of the field polynomial used in XTS mode). */
static void GfMul128Basic (unsigned __int8 *a, unsigned __int8 *b, unsigned __int8 *p)
{
    int i;
    unsigned __int8 la[16];
    memcpy (la, a, 16);
    memset (p, 0, 16);

    for (i = 0; i < 128; i++)
    {
        if (IsBitSet128 (i, b))
            xor128 ((uint64 *)p, (uint64 *)la);

        if (la[0] & 0x80)
        {
            /* overflow: shift and reduce modulo the field polynomial */
            shl128 (la);
            la[15] ^= 0x87;
        }
        else
        {
            shl128 (la);
        }
    }
}
  728 
  729 
  730 BOOL GfMulSelfTest ()
  731 {
  732     BOOL result = TRUE;
  733     unsigned __int8 a[16];
  734     unsigned __int8 b[16];
  735     unsigned __int8 p1[16];
  736     unsigned __int8 p2[16];
  737     GfCtx *gfCtx = (GfCtx *) TCalloc (sizeof (GfCtx));
  738     int i, j;
  739 
  740     if (!gfCtx)
  741         return FALSE;
  742 
  743 
  744     /* GF(2^128) */
  745     for (i = 0; i < 0x100; i++)
  746     {
  747         for (j = 0; j < 16; j++)
  748         {
  749             a[j] = (unsigned __int8) i;
  750             b[j] = j < 8 ? 0 : a[j] ^ 0xff;
  751         }
  752 
  753         GfMul128Basic (a, b, p1);
  754 
  755         Gf128Tab64Init (a, gfCtx);
  756         Gf128MulBy64Tab (b + 8, p2, gfCtx);
  757 
  758         if (memcmp (p1, p2, 16) != 0)
  759             result = FALSE;
  760     }
  761 
  762     TCfree (gfCtx);
  763     return result;
  764 }
  765 
  766 #if defined(__cplusplus)
  767 }
  768 #endif