"Fossies" - the Fresh Open Source Software Archive 
Member "memcached-1.6.15/crc32c.c" (21 Feb 2022, 17445 Bytes) of package /linux/www/memcached-1.6.15.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
For more information about "crc32c.c" see the
Fossies "Dox" file reference documentation.
1 /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
2 * Copyright (C) 2013, 2015 Mark Adler
3 * Version 1.3 31 Dec 2015 Mark Adler
4 */
5
6 /*
7 This software is provided 'as-is', without any express or implied
8 warranty. In no event will the author be held liable for any damages
9 arising from the use of this software.
10
11 Permission is granted to anyone to use this software for any purpose,
12 including commercial applications, and to alter it and redistribute it
13 freely, subject to the following restrictions:
14
15 1. The origin of this software must not be misrepresented; you must not
16 claim that you wrote the original software. If you use this software
17 in a product, an acknowledgment in the product documentation would be
18 appreciated but is not required.
19 2. Altered source versions must be plainly marked as such, and must not be
20 misrepresented as being the original software.
21 3. This notice may not be removed or altered from any source distribution.
22
23 Mark Adler
24 madler@alumni.caltech.edu
25 */
26
27 /* Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a
28 CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. A software
29 version is provided as a fall-back, as well as for speed comparisons. */
30
31 /* Version history:
32 1.0 10 Feb 2013 First version
33 1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel
34 1.2 1 Nov 2015 Add const qualifier to avoid compiler warning
35 Load entire input into memory (test code)
36 Argument gives number of times to repeat (test code)
37 Argument < 0 forces software implementation (test code)
38 1.3 31 Dec 2015 Check for Intel architecture using compiler macro
39 Support big-endian processors in software calculation
40 Add header for external use
41 */
42
43 #include <pthread.h>
44 #include "crc32c.h"
45
46 crc_func crc32c;
47
48 /* CRC-32C (iSCSI) polynomial in reversed bit order. */
49 #define POLY 0x82f63b78
50
51 uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len);
52 uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len);
53 #ifdef __x86_64__
54
55 /* Hardware CRC-32C for Intel and compatible processors. */
56
57 /* Multiply a matrix times a vector over the Galois field of two elements,
58 GF(2). Each element is a bit in an unsigned integer. mat must have at
59 least as many entries as the power of two for most significant one bit in
60 vec. */
61 static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec) {
62 uint32_t sum = 0;
63 while (vec) {
64 if (vec & 1)
65 sum ^= *mat;
66 vec >>= 1;
67 mat++;
68 }
69 return sum;
70 }
71
72 /* Multiply a matrix by itself over GF(2). Both mat and square must have 32
73 rows. */
74 static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat) {
75 for (unsigned n = 0; n < 32; n++)
76 square[n] = gf2_matrix_times(mat, mat[n]);
77 }
78
79 /* Construct an operator to apply len zeros to a crc. len must be a power of
80 two. If len is not a power of two, then the result is the same as for the
81 largest power of two less than len. The result for len == 0 is the same as
82 for len == 1. A version of this routine could be easily written for any
83 len, but that is not needed for this application. */
84 static void crc32c_zeros_op(uint32_t *even, size_t len) {
85 uint32_t odd[32]; /* odd-power-of-two zeros operator */
86
87 /* put operator for one zero bit in odd */
88 odd[0] = POLY; /* CRC-32C polynomial */
89 uint32_t row = 1;
90 for (unsigned n = 1; n < 32; n++) {
91 odd[n] = row;
92 row <<= 1;
93 }
94
95 /* put operator for two zero bits in even */
96 gf2_matrix_square(even, odd);
97
98 /* put operator for four zero bits in odd */
99 gf2_matrix_square(odd, even);
100
101 /* first square will put the operator for one zero byte (eight zero bits),
102 in even -- next square puts operator for two zero bytes in odd, and so
103 on, until len has been rotated down to zero */
104 do {
105 gf2_matrix_square(even, odd);
106 len >>= 1;
107 if (len == 0)
108 return;
109 gf2_matrix_square(odd, even);
110 len >>= 1;
111 } while (len);
112
113 /* answer ended up in odd -- copy to even */
114 for (unsigned n = 0; n < 32; n++)
115 even[n] = odd[n];
116 }
117
118 /* Take a length and build four lookup tables for applying the zeros operator
119 for that length, byte-by-byte on the operand. */
120 static void crc32c_zeros(uint32_t zeros[][256], size_t len) {
121 uint32_t op[32];
122
123 crc32c_zeros_op(op, len);
124 for (unsigned n = 0; n < 256; n++) {
125 zeros[0][n] = gf2_matrix_times(op, n);
126 zeros[1][n] = gf2_matrix_times(op, n << 8);
127 zeros[2][n] = gf2_matrix_times(op, n << 16);
128 zeros[3][n] = gf2_matrix_times(op, n << 24);
129 }
130 }
131
132 /* Apply the zeros operator table to crc. */
133 static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) {
134 return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
135 zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
136 }
137
138 /* Block sizes for three-way parallel crc computation. LONG and SHORT must
139 both be powers of two. The associated string constants must be set
140 accordingly, for use in constructing the assembler instructions. */
141 #define LONG 8192
142 #define LONGx1 "8192"
143 #define LONGx2 "16384"
144 #define SHORT 256
145 #define SHORTx1 "256"
146 #define SHORTx2 "512"
147
148 /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
149 static pthread_once_t crc32c_once_hw = PTHREAD_ONCE_INIT;
150 static uint32_t crc32c_long[4][256];
151 static uint32_t crc32c_short[4][256];
152
153 /* Initialize tables for shifting crcs. */
154 static void crc32c_init_hw(void) {
155 crc32c_zeros(crc32c_long, LONG);
156 crc32c_zeros(crc32c_short, SHORT);
157 }
158
159 /* Compute CRC-32C using the Intel hardware instruction. */
160 static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) {
161 /* populate shift tables the first time through */
162 pthread_once(&crc32c_once_hw, crc32c_init_hw);
163
164 /* pre-process the crc */
165 crc = ~crc;
166 uint64_t crc0 = crc; /* 64-bits for crc32q instruction */
167
168 /* compute the crc for up to seven leading bytes to bring the data pointer
169 to an eight-byte boundary */
170 unsigned char const *next = buf;
171 while (len && ((uintptr_t)next & 7) != 0) {
172 __asm__("crc32b\t" "(%1), %0"
173 : "=r"(crc0)
174 : "r"(next), "0"(crc0));
175 next++;
176 len--;
177 }
178
179 /* compute the crc on sets of LONG*3 bytes, executing three independent crc
180 instructions, each on LONG bytes -- this is optimized for the Nehalem,
181 Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
182 throughput of one crc per cycle, but a latency of three cycles */
183 while (len >= LONG*3) {
184 uint64_t crc1 = 0;
185 uint64_t crc2 = 0;
186 unsigned char const * const end = next + LONG;
187 do {
188 __asm__("crc32q\t" "(%3), %0\n\t"
189 "crc32q\t" LONGx1 "(%3), %1\n\t"
190 "crc32q\t" LONGx2 "(%3), %2"
191 : "=r"(crc0), "=r"(crc1), "=r"(crc2)
192 : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
193 next += 8;
194 } while (next < end);
195 crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
196 crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
197 next += LONG*2;
198 len -= LONG*3;
199 }
200
201 /* do the same thing, but now on SHORT*3 blocks for the remaining data less
202 than a LONG*3 block */
203 while (len >= SHORT*3) {
204 uint64_t crc1 = 0;
205 uint64_t crc2 = 0;
206 unsigned char const * const end = next + SHORT;
207 do {
208 __asm__("crc32q\t" "(%3), %0\n\t"
209 "crc32q\t" SHORTx1 "(%3), %1\n\t"
210 "crc32q\t" SHORTx2 "(%3), %2"
211 : "=r"(crc0), "=r"(crc1), "=r"(crc2)
212 : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
213 next += 8;
214 } while (next < end);
215 crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
216 crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
217 next += SHORT*2;
218 len -= SHORT*3;
219 }
220
221 /* compute the crc on the remaining eight-byte units less than a SHORT*3
222 block */
223 {
224 unsigned char const * const end = next + (len - (len & 7));
225 while (next < end) {
226 __asm__("crc32q\t" "(%1), %0"
227 : "=r"(crc0)
228 : "r"(next), "0"(crc0));
229 next += 8;
230 }
231 len &= 7;
232 }
233
234 /* compute the crc for up to seven trailing bytes */
235 while (len) {
236 __asm__("crc32b\t" "(%1), %0"
237 : "=r"(crc0)
238 : "r"(next), "0"(crc0));
239 next++;
240 len--;
241 }
242
243 /* return a post-processed crc */
244 return ~crc0;
245 }
246
247 /* Check for SSE 4.2. SSE 4.2 was first supported in Nehalem processors
248 introduced in November, 2008. This does not check for the existence of the
249 cpuid instruction itself, which was introduced on the 486SL in 1992, so this
250 will fail on earlier x86 processors. cpuid works on all Pentium and later
251 processors. */
252 #define SSE42(have) \
253 do { \
254 uint32_t eax, ecx; \
255 eax = 1; \
256 __asm__("cpuid" \
257 : "=c"(ecx) \
258 : "a"(eax) \
259 : "%ebx", "%edx"); \
260 (have) = (ecx >> 20) & 1; \
261 } while (0)
262
263 /* Compute a CRC-32C. If the crc32 instruction is available, use the hardware
264 version. Otherwise, use the software version. */
265 void crc32c_init(void) {
266 int sse42;
267
268 SSE42(sse42);
269 if (sse42) {
270 crc32c = crc32c_hw;
271 } else {
272 crc32c = crc32c_sw;
273 }
274 }
275
276 #elif defined(__aarch64__) && defined(__linux__)
277 #include <sys/auxv.h>
278
279 #if defined(HWCAP_CRC32)
280 static inline uint32_t crc32cx(uint32_t crc, const uint64_t data)
281 {
282 asm(".arch_extension crc\n"
283 "crc32cx %w0, %w0, %x1" : "+r" (crc) : "r" (data));
284 return crc;
285 }
286
287 static inline uint32_t crc32cb(uint32_t crc, const uint8_t data)
288 {
289 asm(".arch_extension crc\n"
290 "crc32cb %w0, %w0, %w1" : "+r" (crc) : "r" (data));
291 return crc;
292 }
293
294 static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) {
295 crc = ~crc;
296 unsigned char const *next = buf;
297
298 while (((uintptr_t)next & 7) && len > 0) {
299 crc = crc32cb(crc, *(uint8_t *)next);
300 next++;
301 len--;
302 }
303
304 while (len >= 64) {
305 uint64_t *next8 = (uint64_t *)next;
306 crc = crc32cx(crc, next8[0]);
307 crc = crc32cx(crc, next8[1]);
308 crc = crc32cx(crc, next8[2]);
309 crc = crc32cx(crc, next8[3]);
310 crc = crc32cx(crc, next8[4]);
311 crc = crc32cx(crc, next8[5]);
312 crc = crc32cx(crc, next8[6]);
313 crc = crc32cx(crc, next8[7]);
314 next += 64;
315 len -= 64;
316 }
317
318 while (len >= 8) {
319 crc = crc32cx(crc, *(uint64_t *)next);
320 next += 8;
321 len -= 8;
322 }
323
324 while (len > 0) {
325 crc = crc32cb(crc, *(uint8_t *)next);
326 next++;
327 len--;
328 }
329
330 return ~crc;
331 }
332
333 void crc32c_init(void) {
334 uint64_t auxv = getauxval(AT_HWCAP);
335
336 crc32c = crc32c_sw;
337 if (auxv & HWCAP_CRC32)
338 crc32c = crc32c_hw;
339 }
340 #else /* no hw crc32 on arm64 system supported? old compiler/libc/kernel? */
341 void crc32c_init(void) {
342 crc32c = crc32c_sw;
343 }
344 #endif
345 #else /* !__x86_64__i && !__aarch64__ */
346 void crc32c_init(void) {
347 crc32c = crc32c_sw;
348 }
349
350 #endif
351
352 /* Construct table for software CRC-32C little-endian calculation. */
353 static pthread_once_t crc32c_once_little = PTHREAD_ONCE_INIT;
354 static uint32_t crc32c_table_little[8][256];
355 static void crc32c_init_sw_little(void) {
356 for (unsigned n = 0; n < 256; n++) {
357 uint32_t crc = n;
358 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
359 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
360 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
361 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
362 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
363 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
364 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
365 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
366 crc32c_table_little[0][n] = crc;
367 }
368 for (unsigned n = 0; n < 256; n++) {
369 uint32_t crc = crc32c_table_little[0][n];
370 for (unsigned k = 1; k < 8; k++) {
371 crc = crc32c_table_little[0][crc & 0xff] ^ (crc >> 8);
372 crc32c_table_little[k][n] = crc;
373 }
374 }
375 }
376
377 /* Compute a CRC-32C in software assuming a little-endian architecture,
378 constructing the required table if that hasn't already been done. */
379 uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len) {
380 unsigned char const *next = buf;
381
382 pthread_once(&crc32c_once_little, crc32c_init_sw_little);
383 crc = ~crc;
384 while (len && ((uintptr_t)next & 7) != 0) {
385 crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
386 len--;
387 }
388 if (len >= 8) {
389 uint64_t crcw = crc;
390 do {
391 crcw ^= *(uint64_t const *)next;
392 crcw = crc32c_table_little[7][crcw & 0xff] ^
393 crc32c_table_little[6][(crcw >> 8) & 0xff] ^
394 crc32c_table_little[5][(crcw >> 16) & 0xff] ^
395 crc32c_table_little[4][(crcw >> 24) & 0xff] ^
396 crc32c_table_little[3][(crcw >> 32) & 0xff] ^
397 crc32c_table_little[2][(crcw >> 40) & 0xff] ^
398 crc32c_table_little[1][(crcw >> 48) & 0xff] ^
399 crc32c_table_little[0][crcw >> 56];
400 next += 8;
401 len -= 8;
402 } while (len >= 8);
403 crc = crcw;
404 }
405 while (len) {
406 crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
407 len--;
408 }
409 return ~crc;
410 }
411
412 /* Swap the bytes in a uint64_t. (Only for big-endian.) */
413 #if defined(__has_builtin) || (defined(__GNUC__) && \
414 (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
415 # define swap __builtin_bswap64
416 #else
417 static inline uint64_t swap(uint64_t x) {
418 x = ((x << 8) & 0xff00ff00ff00ff00) | ((x >> 8) & 0xff00ff00ff00ff);
419 x = ((x << 16) & 0xffff0000ffff0000) | ((x >> 16) & 0xffff0000ffff);
420 return (x << 32) | (x >> 32);
421 }
422 #endif
423
424 /* Construct tables for software CRC-32C big-endian calculation. */
425 static pthread_once_t crc32c_once_big = PTHREAD_ONCE_INIT;
426 static uint32_t crc32c_table_big_byte[256];
427 static uint64_t crc32c_table_big[8][256];
428 static void crc32c_init_sw_big(void) {
429 for (unsigned n = 0; n < 256; n++) {
430 uint32_t crc = n;
431 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
432 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
433 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
434 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
435 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
436 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
437 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
438 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
439 crc32c_table_big_byte[n] = crc;
440 }
441 for (unsigned n = 0; n < 256; n++) {
442 uint32_t crc = crc32c_table_big_byte[n];
443 crc32c_table_big[0][n] = swap(crc);
444 for (unsigned k = 1; k < 8; k++) {
445 crc = crc32c_table_big_byte[crc & 0xff] ^ (crc >> 8);
446 crc32c_table_big[k][n] = swap(crc);
447 }
448 }
449 }
450
451 /* Compute a CRC-32C in software assuming a big-endian architecture,
452 constructing the required tables if that hasn't already been done. */
453 uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len) {
454 unsigned char const *next = buf;
455
456 pthread_once(&crc32c_once_big, crc32c_init_sw_big);
457 crc = ~crc;
458 while (len && ((uintptr_t)next & 7) != 0) {
459 crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
460 len--;
461 }
462 if (len >= 8) {
463 uint64_t crcw = swap(crc);
464 do {
465 crcw ^= *(uint64_t const *)next;
466 crcw = crc32c_table_big[0][crcw & 0xff] ^
467 crc32c_table_big[1][(crcw >> 8) & 0xff] ^
468 crc32c_table_big[2][(crcw >> 16) & 0xff] ^
469 crc32c_table_big[3][(crcw >> 24) & 0xff] ^
470 crc32c_table_big[4][(crcw >> 32) & 0xff] ^
471 crc32c_table_big[5][(crcw >> 40) & 0xff] ^
472 crc32c_table_big[6][(crcw >> 48) & 0xff] ^
473 crc32c_table_big[7][(crcw >> 56)];
474 next += 8;
475 len -= 8;
476 } while (len >= 8);
477 crc = swap(crcw);
478 }
479 while (len) {
480 crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
481 len--;
482 }
483 return ~crc;
484 }
485
486 /* Table-driven software CRC-32C. This is about 15 times slower than using the
487 hardware instructions. Determine the endianess of the processor and proceed
488 accordingly. Ideally the endianess will be determined at compile time, in
489 which case the unused functions and tables for the other endianess will be
490 removed by the optimizer. If not, then the proper routines and tables will
491 be used, even if the endianess is changed mid-stream. (Yes, there are
492 processors that permit that -- go figure.) */
493 uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len) {
494 static int const little = 1;
495 if (*(char const *)&little)
496 return crc32c_sw_little(crc, buf, len);
497 else
498 return crc32c_sw_big(crc, buf, len);
499 }