"Fossies" - the Fresh Open Source Software Archive

Member "memcached-1.6.9/crc32c.c" (21 Nov 2020, 17285 Bytes) of package /linux/www/memcached-1.6.9.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "crc32c.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.6.8_vs_1.6.9.

    1 /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
    2  * Copyright (C) 2013, 2015 Mark Adler
    3  * Version 1.3  31 Dec 2015  Mark Adler
    4  */
    5 
    6 /*
    7   This software is provided 'as-is', without any express or implied
    8   warranty.  In no event will the author be held liable for any damages
    9   arising from the use of this software.
   10 
   11   Permission is granted to anyone to use this software for any purpose,
   12   including commercial applications, and to alter it and redistribute it
   13   freely, subject to the following restrictions:
   14 
   15   1. The origin of this software must not be misrepresented; you must not
   16      claim that you wrote the original software. If you use this software
   17      in a product, an acknowledgment in the product documentation would be
   18      appreciated but is not required.
   19   2. Altered source versions must be plainly marked as such, and must not be
   20      misrepresented as being the original software.
   21   3. This notice may not be removed or altered from any source distribution.
   22 
   23   Mark Adler
   24   madler@alumni.caltech.edu
   25  */
   26 
   27 /* Use hardware CRC instruction on Intel SSE 4.2 processors.  This computes a
   28    CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc.  A software
   29    version is provided as a fall-back, as well as for speed comparisons. */
   30 
   31 /* Version history:
   32    1.0  10 Feb 2013  First version
   33    1.1   1 Aug 2013  Correct comments on why three crc instructions in parallel
   34    1.2   1 Nov 2015  Add const qualifier to avoid compiler warning
   35                      Load entire input into memory (test code)
   36                      Argument gives number of times to repeat (test code)
   37                      Argument < 0 forces software implementation (test code)
   38    1.3  31 Dec 2015  Check for Intel architecture using compiler macro
   39                      Support big-endian processors in software calculation
   40                      Add header for external use
   41  */
   42 
   43 #include <pthread.h>
   44 #include "crc32c.h"
   45 
   46 crc_func crc32c;
   47 
   48 /* CRC-32C (iSCSI) polynomial in reversed bit order. */
   49 #define POLY 0x82f63b78
   50 
   51 uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len);
   52 uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len);
   53 #ifdef __x86_64__
   54 
   55 /* Hardware CRC-32C for Intel and compatible processors. */
   56 
   57 /* Multiply a matrix times a vector over the Galois field of two elements,
   58    GF(2).  Each element is a bit in an unsigned integer.  mat must have at
   59    least as many entries as the power of two for most significant one bit in
   60    vec. */
   61 static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec) {
   62     uint32_t sum = 0;
   63     while (vec) {
   64         if (vec & 1)
   65             sum ^= *mat;
   66         vec >>= 1;
   67         mat++;
   68     }
   69     return sum;
   70 }
   71 
   72 /* Multiply a matrix by itself over GF(2).  Both mat and square must have 32
   73    rows. */
   74 static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat) {
   75     for (unsigned n = 0; n < 32; n++)
   76         square[n] = gf2_matrix_times(mat, mat[n]);
   77 }
   78 
   79 /* Construct an operator to apply len zeros to a crc.  len must be a power of
   80    two.  If len is not a power of two, then the result is the same as for the
   81    largest power of two less than len.  The result for len == 0 is the same as
   82    for len == 1.  A version of this routine could be easily written for any
   83    len, but that is not needed for this application. */
   84 static void crc32c_zeros_op(uint32_t *even, size_t len) {
   85     uint32_t odd[32];       /* odd-power-of-two zeros operator */
   86 
   87     /* put operator for one zero bit in odd */
   88     odd[0] = POLY;              /* CRC-32C polynomial */
   89     uint32_t row = 1;
   90     for (unsigned n = 1; n < 32; n++) {
   91         odd[n] = row;
   92         row <<= 1;
   93     }
   94 
   95     /* put operator for two zero bits in even */
   96     gf2_matrix_square(even, odd);
   97 
   98     /* put operator for four zero bits in odd */
   99     gf2_matrix_square(odd, even);
  100 
  101     /* first square will put the operator for one zero byte (eight zero bits),
  102        in even -- next square puts operator for two zero bytes in odd, and so
  103        on, until len has been rotated down to zero */
  104     do {
  105         gf2_matrix_square(even, odd);
  106         len >>= 1;
  107         if (len == 0)
  108             return;
  109         gf2_matrix_square(odd, even);
  110         len >>= 1;
  111     } while (len);
  112 
  113     /* answer ended up in odd -- copy to even */
  114     for (unsigned n = 0; n < 32; n++)
  115         even[n] = odd[n];
  116 }
  117 
  118 /* Take a length and build four lookup tables for applying the zeros operator
  119    for that length, byte-by-byte on the operand. */
  120 static void crc32c_zeros(uint32_t zeros[][256], size_t len) {
  121     uint32_t op[32];
  122 
  123     crc32c_zeros_op(op, len);
  124     for (unsigned n = 0; n < 256; n++) {
  125         zeros[0][n] = gf2_matrix_times(op, n);
  126         zeros[1][n] = gf2_matrix_times(op, n << 8);
  127         zeros[2][n] = gf2_matrix_times(op, n << 16);
  128         zeros[3][n] = gf2_matrix_times(op, n << 24);
  129     }
  130 }
  131 
  132 /* Apply the zeros operator table to crc. */
  133 static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) {
  134     return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
  135            zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
  136 }
  137 
  138 /* Block sizes for three-way parallel crc computation.  LONG and SHORT must
  139    both be powers of two.  The associated string constants must be set
  140    accordingly, for use in constructing the assembler instructions. */
  141 #define LONG 8192
  142 #define LONGx1 "8192"
  143 #define LONGx2 "16384"
  144 #define SHORT 256
  145 #define SHORTx1 "256"
  146 #define SHORTx2 "512"
  147 
  148 /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
  149 static pthread_once_t crc32c_once_hw = PTHREAD_ONCE_INIT;
  150 static uint32_t crc32c_long[4][256];
  151 static uint32_t crc32c_short[4][256];
  152 
  153 /* Initialize tables for shifting crcs. */
  154 static void crc32c_init_hw(void) {
  155     crc32c_zeros(crc32c_long, LONG);
  156     crc32c_zeros(crc32c_short, SHORT);
  157 }
  158 
  159 /* Compute CRC-32C using the Intel hardware instruction. */
  160 static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) {
  161     /* populate shift tables the first time through */
  162     pthread_once(&crc32c_once_hw, crc32c_init_hw);
  163 
  164     /* pre-process the crc */
  165     crc = ~crc;
  166     uint64_t crc0 = crc;            /* 64-bits for crc32q instruction */
  167 
  168     /* compute the crc for up to seven leading bytes to bring the data pointer
  169        to an eight-byte boundary */
  170     unsigned char const *next = buf;
  171     while (len && ((uintptr_t)next & 7) != 0) {
  172         __asm__("crc32b\t" "(%1), %0"
  173                 : "=r"(crc0)
  174                 : "r"(next), "0"(crc0));
  175         next++;
  176         len--;
  177     }
  178 
  179     /* compute the crc on sets of LONG*3 bytes, executing three independent crc
  180        instructions, each on LONG bytes -- this is optimized for the Nehalem,
  181        Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
  182        throughput of one crc per cycle, but a latency of three cycles */
  183     while (len >= LONG*3) {
  184         uint64_t crc1 = 0;
  185         uint64_t crc2 = 0;
  186         unsigned char const * const end = next + LONG;
  187         do {
  188             __asm__("crc32q\t" "(%3), %0\n\t"
  189                     "crc32q\t" LONGx1 "(%3), %1\n\t"
  190                     "crc32q\t" LONGx2 "(%3), %2"
  191                     : "=r"(crc0), "=r"(crc1), "=r"(crc2)
  192                     : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
  193             next += 8;
  194         } while (next < end);
  195         crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
  196         crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
  197         next += LONG*2;
  198         len -= LONG*3;
  199     }
  200 
  201     /* do the same thing, but now on SHORT*3 blocks for the remaining data less
  202        than a LONG*3 block */
  203     while (len >= SHORT*3) {
  204         uint64_t crc1 = 0;
  205         uint64_t crc2 = 0;
  206         unsigned char const * const end = next + SHORT;
  207         do {
  208             __asm__("crc32q\t" "(%3), %0\n\t"
  209                     "crc32q\t" SHORTx1 "(%3), %1\n\t"
  210                     "crc32q\t" SHORTx2 "(%3), %2"
  211                     : "=r"(crc0), "=r"(crc1), "=r"(crc2)
  212                     : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
  213             next += 8;
  214         } while (next < end);
  215         crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
  216         crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
  217         next += SHORT*2;
  218         len -= SHORT*3;
  219     }
  220 
  221     /* compute the crc on the remaining eight-byte units less than a SHORT*3
  222        block */
  223     {
  224         unsigned char const * const end = next + (len - (len & 7));
  225         while (next < end) {
  226             __asm__("crc32q\t" "(%1), %0"
  227                     : "=r"(crc0)
  228                     : "r"(next), "0"(crc0));
  229             next += 8;
  230         }
  231         len &= 7;
  232     }
  233 
  234     /* compute the crc for up to seven trailing bytes */
  235     while (len) {
  236         __asm__("crc32b\t" "(%1), %0"
  237                 : "=r"(crc0)
  238                 : "r"(next), "0"(crc0));
  239         next++;
  240         len--;
  241     }
  242 
  243     /* return a post-processed crc */
  244     return ~crc0;
  245 }
  246 
  247 /* Check for SSE 4.2.  SSE 4.2 was first supported in Nehalem processors
  248    introduced in November, 2008.  This does not check for the existence of the
  249    cpuid instruction itself, which was introduced on the 486SL in 1992, so this
  250    will fail on earlier x86 processors.  cpuid works on all Pentium and later
  251    processors. */
  252 #define SSE42(have) \
  253     do { \
  254         uint32_t eax, ecx; \
  255         eax = 1; \
  256         __asm__("cpuid" \
  257                 : "=c"(ecx) \
  258                 : "a"(eax) \
  259                 : "%ebx", "%edx"); \
  260         (have) = (ecx >> 20) & 1; \
  261     } while (0)
  262 
  263 /* Compute a CRC-32C.  If the crc32 instruction is available, use the hardware
  264    version.  Otherwise, use the software version. */
  265 void crc32c_init(void) {
  266     int sse42;
  267 
  268     SSE42(sse42);
  269     if (sse42) {
  270         crc32c = crc32c_hw;
  271     } else {
  272         crc32c = crc32c_sw;
  273     }
  274 }
  275 
  276 #elif defined(__aarch64__) && defined(__linux__)
  277 #include <sys/auxv.h>
  278 
  279 static inline uint32_t crc32cx(uint32_t crc, const uint64_t data)
  280 {
  281         asm(".arch_extension crc\n"
  282         "crc32cx %w0, %w0, %x1" : "+r" (crc) : "r" (data));
  283         return crc;
  284 }
  285 
  286 static inline uint32_t crc32cb(uint32_t crc, const uint8_t data)
  287 {
  288         asm(".arch_extension crc\n"
  289             "crc32cb %w0, %w0, %w1" : "+r" (crc) : "r" (data));
  290         return crc;
  291 }
  292 
  293 static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) {
  294     crc = ~crc;
  295     unsigned char const *next = buf;
  296 
  297     while (((uintptr_t)next & 7) && len > 0) {
  298         crc = crc32cb(crc, *(uint8_t *)next);
  299         next++;
  300         len--;
  301     }
  302 
  303     while (len >= 64) {
  304         uint64_t *next8 = (uint64_t *)next;
  305         crc = crc32cx(crc, next8[0]);
  306         crc = crc32cx(crc, next8[1]);
  307         crc = crc32cx(crc, next8[2]);
  308         crc = crc32cx(crc, next8[3]);
  309         crc = crc32cx(crc, next8[4]);
  310         crc = crc32cx(crc, next8[5]);
  311         crc = crc32cx(crc, next8[6]);
  312         crc = crc32cx(crc, next8[7]);
  313         next += 64;
  314         len -= 64;
  315     }
  316 
  317     while (len >= 8) {
  318         crc = crc32cx(crc, *(uint64_t *)next);
  319         next += 8;
  320         len -= 8;
  321     }
  322 
  323     while (len > 0) {
  324         crc = crc32cb(crc, *(uint8_t *)next);
  325         next++;
  326         len--;
  327     }
  328 
  329     return ~crc;
  330 }
  331 
  332 void crc32c_init(void) {
  333     uint64_t auxv = getauxval(AT_HWCAP);
  334 
  335     crc32c = crc32c_sw;
  336     if (auxv & HWCAP_CRC32)
  337         crc32c = crc32c_hw;
  338 }
  339 #else /* !__x86_64__i && !__aarch64__ */
  340 void crc32c_init(void) {
  341     crc32c = crc32c_sw;
  342 }
  343 
  344 #endif
  345 
  346 /* Construct table for software CRC-32C little-endian calculation. */
  347 static pthread_once_t crc32c_once_little = PTHREAD_ONCE_INIT;
  348 static uint32_t crc32c_table_little[8][256];
  349 static void crc32c_init_sw_little(void) {
  350     for (unsigned n = 0; n < 256; n++) {
  351         uint32_t crc = n;
  352         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  353         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  354         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  355         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  356         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  357         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  358         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  359         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  360         crc32c_table_little[0][n] = crc;
  361     }
  362     for (unsigned n = 0; n < 256; n++) {
  363         uint32_t crc = crc32c_table_little[0][n];
  364         for (unsigned k = 1; k < 8; k++) {
  365             crc = crc32c_table_little[0][crc & 0xff] ^ (crc >> 8);
  366             crc32c_table_little[k][n] = crc;
  367         }
  368     }
  369 }
  370 
  371 /* Compute a CRC-32C in software assuming a little-endian architecture,
  372    constructing the required table if that hasn't already been done. */
  373 uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len) {
  374     unsigned char const *next = buf;
  375 
  376     pthread_once(&crc32c_once_little, crc32c_init_sw_little);
  377     crc = ~crc;
  378     while (len && ((uintptr_t)next & 7) != 0) {
  379         crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
  380         len--;
  381     }
  382     if (len >= 8) {
  383         uint64_t crcw = crc;
  384         do {
  385             crcw ^= *(uint64_t const *)next;
  386             crcw = crc32c_table_little[7][crcw & 0xff] ^
  387                    crc32c_table_little[6][(crcw >> 8) & 0xff] ^
  388                    crc32c_table_little[5][(crcw >> 16) & 0xff] ^
  389                    crc32c_table_little[4][(crcw >> 24) & 0xff] ^
  390                    crc32c_table_little[3][(crcw >> 32) & 0xff] ^
  391                    crc32c_table_little[2][(crcw >> 40) & 0xff] ^
  392                    crc32c_table_little[1][(crcw >> 48) & 0xff] ^
  393                    crc32c_table_little[0][crcw >> 56];
  394             next += 8;
  395             len -= 8;
  396         } while (len >= 8);
  397         crc = crcw;
  398     }
  399     while (len) {
  400         crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
  401         len--;
  402     }
  403     return ~crc;
  404 }
  405 
  406 /* Swap the bytes in a uint64_t.  (Only for big-endian.) */
  407 #if defined(__has_builtin) || (defined(__GNUC__) && \
  408     (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
  409 #  define swap __builtin_bswap64
  410 #else
  411 static inline uint64_t swap(uint64_t x) {
  412     x = ((x << 8) & 0xff00ff00ff00ff00) | ((x >> 8) & 0xff00ff00ff00ff);
  413     x = ((x << 16) & 0xffff0000ffff0000) | ((x >> 16) & 0xffff0000ffff);
  414     return (x << 32) | (x >> 32);
  415 }
  416 #endif
  417 
  418 /* Construct tables for software CRC-32C big-endian calculation. */
  419 static pthread_once_t crc32c_once_big = PTHREAD_ONCE_INIT;
  420 static uint32_t crc32c_table_big_byte[256];
  421 static uint64_t crc32c_table_big[8][256];
  422 static void crc32c_init_sw_big(void) {
  423     for (unsigned n = 0; n < 256; n++) {
  424         uint32_t crc = n;
  425         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  426         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  427         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  428         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  429         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  430         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  431         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  432         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
  433         crc32c_table_big_byte[n] = crc;
  434     }
  435     for (unsigned n = 0; n < 256; n++) {
  436         uint32_t crc = crc32c_table_big_byte[n];
  437         crc32c_table_big[0][n] = swap(crc);
  438         for (unsigned k = 1; k < 8; k++) {
  439             crc = crc32c_table_big_byte[crc & 0xff] ^ (crc >> 8);
  440             crc32c_table_big[k][n] = swap(crc);
  441         }
  442     }
  443 }
  444 
  445 /* Compute a CRC-32C in software assuming a big-endian architecture,
  446    constructing the required tables if that hasn't already been done. */
  447 uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len) {
  448     unsigned char const *next = buf;
  449 
  450     pthread_once(&crc32c_once_big, crc32c_init_sw_big);
  451     crc = ~crc;
  452     while (len && ((uintptr_t)next & 7) != 0) {
  453         crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
  454         len--;
  455     }
  456     if (len >= 8) {
  457         uint64_t crcw = swap(crc);
  458         do {
  459             crcw ^= *(uint64_t const *)next;
  460             crcw = crc32c_table_big[0][crcw & 0xff] ^
  461                    crc32c_table_big[1][(crcw >> 8) & 0xff] ^
  462                    crc32c_table_big[2][(crcw >> 16) & 0xff] ^
  463                    crc32c_table_big[3][(crcw >> 24) & 0xff] ^
  464                    crc32c_table_big[4][(crcw >> 32) & 0xff] ^
  465                    crc32c_table_big[5][(crcw >> 40) & 0xff] ^
  466                    crc32c_table_big[6][(crcw >> 48) & 0xff] ^
  467                    crc32c_table_big[7][(crcw >> 56)];
  468             next += 8;
  469             len -= 8;
  470         } while (len >= 8);
  471         crc = swap(crcw);
  472     }
  473     while (len) {
  474         crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
  475         len--;
  476     }
  477     return ~crc;
  478 }
  479 
  480 /* Table-driven software CRC-32C.  This is about 15 times slower than using the
  481    hardware instructions.  Determine the endianess of the processor and proceed
  482    accordingly.  Ideally the endianess will be determined at compile time, in
  483    which case the unused functions and tables for the other endianess will be
  484    removed by the optimizer.  If not, then the proper routines and tables will
  485    be used, even if the endianess is changed mid-stream.  (Yes, there are
  486    processors that permit that -- go figure.) */
  487 uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len) {
  488     static int const little = 1;
  489     if (*(char const *)&little)
  490         return crc32c_sw_little(crc, buf, len);
  491     else
  492         return crc32c_sw_big(crc, buf, len);
  493 }