"Fossies" - the Fresh Open Source Software Archive

Member "xxHash-0.8.0/tests/collisions/main.c" (27 Jul 2020, 37014 Bytes) of package /linux/misc/xxHash-0.8.0.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. See also the last Fossies "Diffs" side-by-side code changes report for "main.c": 0.7.3_vs_0.7.4.

    1 /*
    2  * Brute force collision tester for 64-bit hashes
    3  * Part of the xxHash project
    4  * Copyright (C) 2019-2020 Yann Collet
    5  *
    6  * GPL v2 License
    7  *
    8  * This program is free software; you can redistribute it and/or modify
    9  * it under the terms of the GNU General Public License as published by
   10  * the Free Software Foundation; either version 2 of the License, or
   11  * (at your option) any later version.
   12  *
   13  * This program is distributed in the hope that it will be useful,
   14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   16  * GNU General Public License for more details.
   17  *
   18  * You should have received a copy of the GNU General Public License along
   19  * with this program; if not, write to the Free Software Foundation, Inc.,
   20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
   21  *
   22  * You can contact the author at:
   23  *   - xxHash homepage: https://www.xxhash.com
   24  *   - xxHash source repository: https://github.com/Cyan4973/xxHash
   25  */
   26 
   27 /*
   28  * The collision tester will generate 24 billion hashes (by default),
   29  * and count how many collisions were produced by the 64-bit hash algorithm.
   30  * The optimal amount of collisions for 64-bit is ~18 collisions.
   31  * A good hash should be close to this figure.
   32  *
   33  * This program requires a lot of memory:
   34  * - Either store hash values directly => 192 GB
   35  * - Or use a filter:
   36  *   -  32 GB (by default) for the filter itself
   37  *   -  + ~14 GB for the list of hashes (depending on the filter's outcome)
   38  * Due to these memory constraints, it requires a 64-bit system.
   39  */
   40 
   41 
   42  /* ===  Dependencies  === */
   43 
   44 #include <stdint.h>   /* uint64_t */
   45 #include <stdlib.h>   /* malloc, free, qsort, exit */
   46 #include <string.h>   /* memset */
   47 #include <stdio.h>    /* printf, fflush */
   48 
   49 #undef NDEBUG   /* ensure assert is _not_ disabled */
   50 #include <assert.h>
   51 
   52 #include "hashes.h"   /* UniHash, hashfn, hashfnTable */
   53 
   54 #include "sort.hh"    /* sort64 */
   55 
   56 
   57 
   58 typedef enum { ht32, ht64, ht128 } Htype_e;
   59 
   60 /* ===  Debug  === */
   61 
   62 #define EXIT(...) { printf(__VA_ARGS__); printf("\n"); exit(1); }
   63 
   64 static void hexRaw(const void* buffer, size_t size)
   65 {
   66     const unsigned char* p = (const unsigned char*)buffer;
   67     for (size_t i=0; i<size; i++) {
   68         printf("%02X", p[i]);
   69     }
   70 }
   71 
   72 void hexDisp(const void* buffer, size_t size)
   73 {
   74     hexRaw(buffer, size);
   75     printf("\n");
   76 }
   77 
   78 static void printHash(const void* table, size_t n, Htype_e htype)
   79 {
   80     if ((htype == ht64) || (htype == ht32)){
   81         uint64_t const h64 = ((const uint64_t*)table)[n];
   82         hexRaw(&h64, sizeof(h64));
   83     } else {
   84         assert(htype == ht128);
   85         XXH128_hash_t const h128 = ((const XXH128_hash_t*)table)[n];
   86         hexRaw(&h128, sizeof(h128));
   87     }
   88 }
   89 
   90 /* ===  Generate Random unique Samples to hash  === */
   91 
   92 /*
   93  * These functions will generate and update a sample to hash.
   94  * initSample() will fill a buffer with random bytes,
   95  * updateSample() will modify one slab in the input buffer.
   96  * updateSample() guarantees it will produce unique samples,
   97  * but it needs to know the total number of samples.
   98  */
   99 
  100 
  101 static const uint64_t prime64_1 = 11400714785074694791ULL;   /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
  102 static const uint64_t prime64_2 = 14029467366897019727ULL;   /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
  103 static const uint64_t prime64_3 =  1609587929392839161ULL;   /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
  104 
  105 static uint64_t avalanche64(uint64_t h64)
  106 {
  107     h64 ^= h64 >> 33;
  108     h64 *= prime64_2;
  109     h64 ^= h64 >> 29;
  110     h64 *= prime64_3;
  111     h64 ^= h64 >> 32;
  112     return h64;
  113 }
  114 
  115 static unsigned char randomByte(size_t n)
  116 {
  117     uint64_t n64 = avalanche64(n+1);
  118     n64 *= prime64_1;
  119     return (unsigned char)(n64 >> 56);
  120 }
  121 
  122 typedef enum { sf_slab5, sf_sparse } sf_genMode;
  123 
  124 
  125 #ifdef SLAB5
  126 
  127 /*
  128  * Slab5 sample generation.
  129  * This algorithm generates unique inputs flipping on average 16 bits per candidate.
  130  * It is generally much more friendly for most hash algorithms, especially
  131  * weaker ones, as it shuffles more the input.
  132  * The algorithm also avoids overfitting the per4 or per8 ingestion patterns.
  133  */
  134 
  135 #define SLAB_SIZE 5
  136 
  137 typedef struct {
  138     void* buffer;
  139     size_t size;
  140     sf_genMode mode;
  141     size_t prngSeed;
  142     uint64_t hnb;
  143 } sampleFactory;
  144 
  145 static void init_sampleFactory(sampleFactory* sf, uint64_t htotal)
  146 {
  147     uint64_t const minNbSlabs = ((htotal-1) >> 32) + 1;
  148     uint64_t const minSize = minNbSlabs * SLAB_SIZE;
  149     if (sf->size < minSize)
  150         EXIT("sample size must be >= %i bytes for this amount of hashes",
  151             (int)minSize);
  152 
  153     unsigned char* const p = (unsigned char*)sf->buffer;
  154     for (size_t n=0; n < sf->size; n++)
  155         p[n] = randomByte(n);
  156     sf->hnb = 0;
  157 }
  158 
  159 static sampleFactory*
  160 create_sampleFactory(size_t size, uint64_t htotal, uint64_t seed)
  161 {
  162     sampleFactory* const sf = malloc(sizeof(sampleFactory));
  163     if (!sf) EXIT("not enough memory");
  164     void* const buffer = malloc(size);
  165     if (!buffer) EXIT("not enough memory");
  166     sf->buffer = buffer;
  167     sf->size = size;
  168     sf->mode = sf_slab5;
  169     sf->prngSeed = seed;
  170     init_sampleFactory(sf, htotal);
  171     return sf;
  172 }
  173 
  174 static void free_sampleFactory(sampleFactory* sf)
  175 {
  176     if (!sf) return;
  177     free(sf->buffer);
  178     free(sf);
  179 }
  180 
  181 static inline void update_sampleFactory(sampleFactory* sf)
  182 {
  183     size_t const nbSlabs = sf->size / SLAB_SIZE;
  184     size_t const SlabNb = sf->hnb % nbSlabs;
  185     sf->hnb++;
  186 
  187     char* const ptr = (char*)sf->buffer;
  188     size_t const start = (SlabNb * SLAB_SIZE) + 1;
  189     uint32_t val32;
  190     memcpy(&val32, ptr+start, sizeof(val32));
  191     static const uint32_t prime32_5 = 374761393U;
  192     val32 += prime32_5;
  193     memcpy(ptr+start, &val32, sizeof(val32));
  194 }
  195 
  196 #else
  197 
  198 /*
  199  * Sparse sample generation.
  200  * This is the default pattern generator.
  201  * It only flips one bit at a time (mostly).
  202  * Low hamming distance scenario is more difficult for weak hash algorithms.
  203  * Note that CRC is immune to this scenario, since they are specifically
  204  * designed to detect low hamming distances.
  205  * Prefer the Slab5 pattern generator for collisions on CRC algorithms.
  206  */
  207 
  208 #define SPARSE_LEVEL_MAX 15
  209 
  210 /* Nb of combinations of m bits in a register of n bits */
  211 static double Cnm(int n, int m)
  212 {
  213     assert(n > 0);
  214     assert(m > 0);
  215     assert(m <= m);
  216     double acc = 1;
  217     for (int i=0; i<m; i++) {
  218         acc *= n - i;
  219         acc /= 1 + i;
  220     }
  221     return acc;
  222 }
  223 
  224 static int enoughCombos(size_t size, uint64_t htotal)
  225 {
  226     if (size < 2) return 0;   /* ensure no multiplication by negative */
  227     uint64_t acc = 0;
  228     uint64_t const srcBits = size * 8; assert(srcBits < INT_MAX);
  229     int nbBitsSet = 0;
  230     while (acc < htotal) {
  231         nbBitsSet++;
  232         if (nbBitsSet >= SPARSE_LEVEL_MAX) return 0;
  233         acc += (uint64_t)Cnm((int)srcBits, nbBitsSet);
  234     }
  235     return 1;
  236 }
  237 
  238 typedef struct {
  239     void* buffer;
  240     size_t size;
  241     sf_genMode mode;
  242     /* sparse */
  243     size_t bitIdx[SPARSE_LEVEL_MAX];
  244     int level;
  245     size_t maxBitIdx;
  246     /* slab5 */
  247     size_t nbSlabs;
  248     size_t current;
  249     size_t prngSeed;
  250 } sampleFactory;
  251 
  252 static void init_sampleFactory(sampleFactory* sf, uint64_t htotal)
  253 {
  254     if (!enoughCombos(sf->size, htotal)) {
  255         EXIT("sample size must be larger for this amount of hashes");
  256     }
  257 
  258     memset(sf->bitIdx, 0, sizeof(sf->bitIdx));
  259     sf->level = 0;
  260 
  261     unsigned char* const p = (unsigned char*)sf->buffer;
  262     for (size_t n=0; n<sf->size; n++)
  263         p[n] = randomByte(sf->prngSeed + n);
  264 }
  265 
  266 static sampleFactory*
  267 create_sampleFactory(size_t size, uint64_t htotal, uint64_t seed)
  268 {
  269     sampleFactory* const sf = malloc(sizeof(sampleFactory));
  270     if (!sf) EXIT("not enough memory");
  271     void* const buffer = malloc(size);
  272     if (!buffer) EXIT("not enough memory");
  273     sf->buffer = buffer;
  274     sf->size = size;
  275     sf->mode = sf_sparse;
  276     sf->maxBitIdx = size * 8;
  277     sf->prngSeed = seed;
  278     init_sampleFactory(sf, htotal);
  279     return sf;
  280 }
  281 
  282 static void free_sampleFactory(sampleFactory* sf)
  283 {
  284     if (!sf) return;
  285     free(sf->buffer);
  286     free(sf);
  287 }
  288 
  289 static void flipbit(void* buffer, uint64_t bitID)
  290 {
  291     size_t const pos = bitID >> 3;
  292     unsigned char const mask = (unsigned char)(1 << (bitID & 7));
  293     unsigned char* const p = (unsigned char*)buffer;
  294     p[pos] ^= mask;
  295 }
  296 
  297 static int updateBit(void* buffer, size_t* bitIdx, int level, size_t max)
  298 {
  299     if (level==0) return 0;   /* can't progress further */
  300 
  301     flipbit(buffer, bitIdx[level]); /* erase previous bits */
  302 
  303     if (bitIdx[level] < max-1) { /* simple case: go to next bit */
  304         bitIdx[level]++;
  305         flipbit(buffer, bitIdx[level]); /* set new bit */
  306         return 1;
  307     }
  308 
  309     /* reached last bit: need to update a bit from lower level */
  310     if (!updateBit(buffer, bitIdx, level-1, max-1)) return 0;
  311     bitIdx[level] = bitIdx[level-1] + 1;
  312     flipbit(buffer, bitIdx[level]); /* set new bit */
  313     return 1;
  314 }
  315 
  316 static inline void update_sampleFactory(sampleFactory* sf)
  317 {
  318     if (!updateBit(sf->buffer, sf->bitIdx, sf->level, sf->maxBitIdx)) {
  319         /* no more room => move to next level */
  320         sf->level++;
  321         assert(sf->level < SPARSE_LEVEL_MAX);
  322 
  323         /* set new bits */
  324         for (int i=1; i <= sf->level; i++) {
  325             sf->bitIdx[i] = (size_t)(i-1);
  326             flipbit(sf->buffer, sf->bitIdx[i]);
  327         }
  328     }
  329 }
  330 
  331 #endif /* pattern generator selection */
  332 
  333 
  334 /* ===  Candidate Filter  === */
  335 
  336 typedef unsigned char Filter;
  337 
  338 Filter* create_Filter(int bflog)
  339 {
  340     assert(bflog < 64 && bflog > 1);
  341     size_t bfsize = (size_t)1 << bflog;
  342     Filter* bf = malloc(bfsize);
  343     assert(((void)"Filter creation failed", bf));
  344     memset(bf, 0, bfsize);
  345     return bf;
  346 }
  347 
  348 void free_Filter(Filter* bf)
  349 {
  350     free(bf);
  351 }
  352 
  353 #ifdef FILTER_1_PROBE
  354 
  355 /*
  356  * Attach hash to a slot
  357  * return: Nb of potential collision candidates detected
  358  *          0: position not yet occupied
  359  *          2: position previously occupied by a single candidate
  360  *          1: position already occupied by multiple candidates
  361  */
  362 inline int Filter_insert(Filter* bf, int bflog, uint64_t hash)
  363 {
  364     int const slotNb = hash & 3;
  365     int const shift = slotNb * 2 ;
  366 
  367     size_t const bfmask = ((size_t)1 << bflog) - 1;
  368     size_t const pos = (hash >> 2) & bfmask;
  369 
  370     int const existingCandidates = ((((unsigned char*)bf)[pos]) >> shift) & 3;
  371 
  372     static const int addCandidates[4] = { 0, 2, 1, 1 };
  373     static const int nextValue[4] = { 1, 2, 3, 3 };
  374 
  375     ((unsigned char*)bf)[pos] |= (unsigned char)(nextValue[existingCandidates] << shift);
  376     return addCandidates[existingCandidates];
  377 }
  378 
  379 /*
  380  * Check if provided 64-bit hash is a collision candidate
  381  * Requires the slot to be occupied by at least 2 candidates.
  382  * return >0 if hash is a collision candidate
  383  *         0 otherwise (slot unoccupied, or only one candidate)
  384  * note: unoccupied slots should not happen in this algorithm,
  385  *       since all hashes are supposed to have been inserted at least once.
  386  */
  387 inline int Filter_check(const Filter* bf, int bflog, uint64_t hash)
  388 {
  389     int const slotNb = hash & 3;
  390     int const shift = slotNb * 2;
  391 
  392     size_t const bfmask = ((size_t)1 << bflog) - 1;
  393     size_t const pos = (hash >> 2) & bfmask;
  394 
  395     return (((const unsigned char*)bf)[pos]) >> (shift+1) & 1;
  396 }
  397 
  398 #else
  399 
  400 /*
  401  * 2-probes strategy,
  402  * more efficient at filtering candidates,
  403  * requires filter size to be > nb of hashes
  404  */
  405 
  406 #define MIN(a,b)   ((a) < (b) ? (a) : (b))
  407 #define MAX(a,b)   ((a) > (b) ? (a) : (b))
  408 
  409 /*
  410  * Attach hash to 2 slots
  411  * return: Nb of potential candidates detected
  412  *          0: position not yet occupied
  413  *          2: position previously occupied by a single candidate (at most)
  414  *          1: position already occupied by multiple candidates
  415  */
  416 static inline int Filter_insert(Filter* bf, int bflog, uint64_t hash)
  417  {
  418      hash = avalanche64(hash);
  419      unsigned const slot1 = hash & 255;
  420      hash >>= 8;
  421      unsigned const slot2 = hash & 255;
  422      hash >>= 8;
  423 
  424      size_t const fclmask = ((size_t)1 << (bflog-6)) - 1;
  425      size_t const cacheLineNb = hash & fclmask;
  426 
  427      size_t const pos1 = (cacheLineNb << 6) + (slot1 >> 2);
  428      unsigned const shift1 = (slot1 & 3) * 2;
  429      unsigned const ex1 = (bf[pos1] >> shift1) & 3;
  430 
  431      size_t const pos2 = (cacheLineNb << 6) + (slot2 >> 2);
  432      unsigned const shift2 = (slot2 & 3) * 2;
  433      unsigned const ex2 = (bf[pos2] >> shift2) & 3;
  434 
  435      unsigned const existing = MIN(ex1, ex2);
  436 
  437      static const int addCandidates[4] = { 0, 2, 1, 1 };
  438      static const unsigned nextValue[4] = { 1, 2, 3, 3 };
  439 
  440      bf[pos1] &= (Filter)(~(3 << shift1)); /* erase previous value */
  441      bf[pos1] |= (Filter)(MAX(ex1, nextValue[existing]) << shift1);
  442      bf[pos2] |= (Filter)(MAX(ex2, nextValue[existing]) << shift2);
  443 
  444      return addCandidates[existing];
  445  }
  446 
  447 
  448 /*
  449  * Check if provided 64-bit hash is a collision candidate
  450  * Requires the slot to be occupied by at least 2 candidates.
  451  * return >0 if hash is a collision candidate
  452  *         0 otherwise (slot unoccupied, or only one candidate)
  453  * note: unoccupied slots should not happen in this algorithm,
  454  *       since all hashes are supposed to have been inserted at least once.
  455  */
  456 static inline int Filter_check(const Filter* bf, int bflog, uint64_t hash)
  457  {
  458      hash = avalanche64(hash);
  459      unsigned const slot1 = hash & 255;
  460      hash >>= 8;
  461      unsigned const slot2 = hash & 255;
  462      hash >>= 8;
  463 
  464      size_t const fclmask = ((size_t)1 << (bflog-6)) - 1;
  465      size_t const cacheLineNb = hash & fclmask;
  466 
  467      size_t const pos1 = (cacheLineNb << 6) + (slot1 >> 2);
  468      unsigned const shift1 = (slot1 & 3) * 2;
  469      unsigned const ex1 = (bf[pos1] >> shift1) & 3;
  470 
  471      size_t const pos2 = (cacheLineNb << 6) + (slot2 >> 2);
  472      unsigned const shift2 = (slot2 & 3) * 2;
  473      unsigned const ex2 = (bf[pos2] >> shift2) & 3;
  474 
  475      return (ex1 >= 2) && (ex2 >= 2);
  476  }
  477 
  478 #endif // FILTER_1_PROBE
  479 
  480 
  481 /* ===  Display  === */
  482 
  483 #include <time.h>   /* clock_t, clock, time_t, time, difftime */
  484 
  485 void update_indicator(uint64_t v, uint64_t total)
  486 {
  487     static clock_t start = 0;
  488     if (start==0) start = clock();
  489     clock_t const updateRate = CLOCKS_PER_SEC / 2;
  490 
  491     clock_t const clockSpan = (clock_t)(clock() - start);
  492     if (clockSpan > updateRate) {
  493         start = clock();
  494         assert(v <= total);
  495         assert(total > 0);
  496         double share = ((double)v / (double)total) * 100;
  497         printf("%6.2f%% (%llu)  \r", share, (unsigned long long)v);
  498         fflush(NULL);
  499     }
  500 }
  501 
  502 /* note: not thread safe */
  503 const char* displayDelay(double delay_s)
  504 {
  505     static char delayString[50];
  506     memset(delayString, 0, sizeof(delayString));
  507 
  508     int const mn = ((int)delay_s / 60) % 60;
  509     int const h = (int)delay_s / 3600;
  510     int const sec = (int)delay_s % 60;
  511 
  512     char* p = delayString;
  513     if (h) sprintf(p, "%i h ", h);
  514     if (mn || h) {
  515         p = delayString + strlen(delayString);
  516         sprintf(p, "%i mn ", mn);
  517     }
  518     p = delayString + strlen(delayString);
  519     sprintf(p, "%is ", sec);
  520 
  521     return delayString;
  522 }
  523 
  524 
  525 /* ===  Math  === */
  526 
  527 static double power(uint64_t base, int p)
  528 {
  529     double value = 1;
  530     assert(p>=0);
  531     for (int i=0; i<p; i++) {
  532         value *= (double)base;
  533     }
  534     return value;
  535 }
  536 
  537 static double estimateNbCollisions(uint64_t nbH, int nbBits)
  538 {
  539     return ((double)nbH * (double)(nbH-1)) / power(2, nbBits+1);
  540 }
  541 
  542 static int highestBitSet(uint64_t v)
  543 {
  544     assert(v!=0);
  545     int bitId = 0;
  546     while (v >>= 1) bitId++;
  547     return bitId;
  548 }
  549 
  550 
  551 /* ===  Filter and search collisions  === */
  552 
  553 #undef NDEBUG   /* ensure assert is not disabled */
  554 #include <assert.h>
  555 
  556 /* will recommend 24 billion samples for 64-bit hashes,
  557  * expecting 18 collisions for a good 64-bit hash */
  558 #define NB_BITS_MAX 64   /* can't store nor analyze hash wider than 64-bits for the time being */
  559 uint64_t select_nbh(int nbBits)
  560 {
  561     assert(nbBits > 0);
  562     if (nbBits > NB_BITS_MAX) nbBits = NB_BITS_MAX;
  563     double targetColls = (double)((128 + 17) - (nbBits * 2));
  564     uint64_t nbH = 24;
  565     while (estimateNbCollisions(nbH, nbBits) < targetColls) nbH *= 2;
  566     return nbH;
  567 }
  568 
  569 
  570 typedef struct {
  571     uint64_t nbCollisions;
  572 } searchCollisions_results;
  573 
  574 typedef struct {
  575     uint64_t nbH;
  576     uint64_t mask;
  577     uint64_t maskSelector;
  578     size_t sampleSize;
  579     uint64_t prngSeed;
  580     int filterLog;      /* <0 = disable filter;  0 = auto-size; */
  581     int hashID;
  582     int display;
  583     int nbThreads;
  584     searchCollisions_results* resultPtr;
  585 } searchCollisions_parameters;
  586 
  587 #define DISPLAY(...) { if (display) printf(__VA_ARGS__); }
  588 
  589 static int isEqual(void* hTablePtr, size_t index1, size_t index2, Htype_e htype)
  590 {
  591     if ((htype == ht64) || (htype == ht32)) {
  592         uint64_t const h1 = ((const uint64_t*)hTablePtr)[index1];
  593         uint64_t const h2 = ((const uint64_t*)hTablePtr)[index2];
  594         return (h1 == h2);
  595     } else {
  596         assert(htype == ht128);
  597         XXH128_hash_t const h1 = ((const XXH128_hash_t*)hTablePtr)[index1];
  598         XXH128_hash_t const h2 = ((const XXH128_hash_t*)hTablePtr)[index2];
  599         return XXH128_isEqual(h1, h2);
  600     }
  601 }
  602 
  603 static int isHighEqual(void* hTablePtr, size_t index1, size_t index2, Htype_e htype, int rShift)
  604 {
  605     uint64_t h1, h2;
  606     if ((htype == ht64) || (htype == ht32)) {
  607         h1 = ((const uint64_t*)hTablePtr)[index1];
  608         h2 = ((const uint64_t*)hTablePtr)[index2];
  609     } else {
  610         assert(htype == ht128);
  611         h1 = ((const XXH128_hash_t*)hTablePtr)[index1].high64;
  612         h2 = ((const XXH128_hash_t*)hTablePtr)[index2].high64;
  613         assert(rShift >= 64);
  614         rShift -= 64;
  615     }
  616     assert(0 <= rShift && rShift < 64);
  617     return (h1 >> rShift) == (h2 >> rShift);
  618 }
  619 
  620 /* assumption: (htype*)hTablePtr[index] is valid */
  621 static void addHashCandidate(void* hTablePtr, UniHash h, Htype_e htype, size_t index)
  622 {
  623     if ((htype == ht64) || (htype == ht32)) {
  624         ((uint64_t*)hTablePtr)[index] = h.h64;
  625     } else {
  626         assert(htype == ht128);
  627         ((XXH128_hash_t*)hTablePtr)[index] = h.h128;
  628     }
  629 }
  630 
  631 static int getNbBits_fromHtype(Htype_e htype) {
  632     switch(htype) {
  633         case ht32: return 32;
  634         case ht64: return 64;
  635         case ht128:return 128;
  636         default: EXIT("hash size not supported");
  637     }
  638 }
  639 
  640 static Htype_e getHtype_fromHbits(int nbBits) {
  641     switch(nbBits) {
  642         case 32 : return ht32;
  643         case 64 : return ht64;
  644         case 128: return ht128;
  645         default: EXIT("hash size not supported");
  646     }
  647 }
  648 
  649 static size_t search_collisions(
  650     searchCollisions_parameters param)
  651 {
  652     uint64_t totalH = param.nbH;
  653     const uint64_t hMask = param.mask;
  654     const uint64_t hSelector = param.maskSelector;
  655     int bflog = param.filterLog;
  656     const int filter = (param.filterLog >= 0);
  657     const size_t sampleSize = param.sampleSize;
  658     const int hashID = param.hashID;
  659     const Htype_e htype = getHtype_fromHbits(hashfnTable[hashID].bits);
  660     const int display = param.display;
  661     /* init */
  662     sampleFactory* const sf = create_sampleFactory(sampleSize, totalH, param.prngSeed);
  663     if (!sf) EXIT("not enough memory");
  664 
  665     //const char* const hname = hashfnTable[hashID].name;
  666     hashfn const hfunction = hashfnTable[hashID].fn;
  667     int const hwidth = hashfnTable[hashID].bits;
  668     if (totalH == 0) totalH = select_nbh(hwidth);
  669     if (bflog == 0) bflog = highestBitSet(totalH) + 1;   /* auto-size filter */
  670     uint64_t const bfsize = (1ULL << bflog);
  671 
  672 
  673     /* ===  filter hashes (optional)  === */
  674 
  675     Filter* bf = NULL;
  676     uint64_t nbPresents = totalH;
  677 
  678     if (filter) {
  679         time_t const filterTBegin = time(NULL);
  680         DISPLAY(" Creating filter (%i GB) \n", (int)(bfsize >> 30));
  681         bf = create_Filter(bflog);
  682         if (!bf) EXIT("not enough memory for filter");
  683 
  684 
  685         DISPLAY(" Generate %llu hashes from samples of %u bytes \n",
  686                 (unsigned long long)totalH, (unsigned)sampleSize);
  687         nbPresents = 0;
  688 
  689         for (uint64_t n=0; n < totalH; n++) {
  690             if (display && ((n&0xFFFFF) == 1) )
  691                 update_indicator(n, totalH);
  692             update_sampleFactory(sf);
  693 
  694             UniHash const h = hfunction(sf->buffer, sampleSize);
  695             if ((h.h64 & hMask) != hSelector) continue;
  696 
  697             nbPresents += (uint64_t)Filter_insert(bf, bflog, h.h64);
  698         }
  699 
  700         if (nbPresents==0) {
  701             DISPLAY(" Analysis completed: No collision detected \n");
  702             if (param.resultPtr) param.resultPtr->nbCollisions = 0;
  703             free_Filter(bf);
  704             free_sampleFactory(sf);
  705             return 0;
  706         }
  707 
  708         {   double const filterDelay = difftime(time(NULL), filterTBegin);
  709             DISPLAY(" Generation and filter completed in %s, detected up to %llu candidates \n",
  710                     displayDelay(filterDelay), (unsigned long long) nbPresents);
  711     }   }
  712 
  713 
  714     /* === store hash candidates: duplicates will be present here === */
  715 
  716     time_t const storeTBegin = time(NULL);
  717     size_t const hashByteSize = (htype == ht128) ? 16 : 8;
  718     size_t const tableSize = (nbPresents+1) * hashByteSize;
  719     assert(tableSize > nbPresents);  /* check tableSize calculation overflow */
  720     DISPLAY(" Storing hash candidates (%i MB) \n", (int)(tableSize >> 20));
  721 
  722     /* Generate and store hashes */
  723     void* const hashCandidates = malloc(tableSize);
  724     if (!hashCandidates) EXIT("not enough memory to store candidates");
  725     init_sampleFactory(sf, totalH);
  726     size_t nbCandidates = 0;
  727     for (uint64_t n=0; n < totalH; n++) {
  728         if (display && ((n&0xFFFFF) == 1) ) update_indicator(n, totalH);
  729         update_sampleFactory(sf);
  730 
  731         UniHash const h = hfunction(sf->buffer, sampleSize);
  732         if ((h.h64 & hMask) != hSelector) continue;
  733 
  734         if (filter) {
  735             if (Filter_check(bf, bflog, h.h64)) {
  736                 assert(nbCandidates < nbPresents);
  737                 addHashCandidate(hashCandidates, h, htype, nbCandidates++);
  738             }
  739         } else {
  740             assert(nbCandidates < nbPresents);
  741             addHashCandidate(hashCandidates, h, htype, nbCandidates++);
  742         }
  743     }
  744     if (nbCandidates < nbPresents) {
  745         /* Try to mitigate gnuc_quicksort behavior, by reducing allocated memory,
  746          * since gnuc_quicksort uses a lot of additional memory for mergesort */
  747         void* const checkPtr = realloc(hashCandidates, nbCandidates * hashByteSize);
  748         assert(checkPtr != NULL);
  749         assert(checkPtr == hashCandidates);  /* simplification: since we are reducing the size,
  750                                               * we hope to keep the same ptr position.
  751                                               * Otherwise, hashCandidates must be mutable. */
  752         DISPLAY(" List of hashes reduced to %u MB from %u MB (saved %u MB) \n",
  753                 (unsigned)((nbCandidates * hashByteSize) >> 20),
  754                 (unsigned)(tableSize >> 20),
  755                 (unsigned)((tableSize - (nbCandidates * hashByteSize)) >> 20) );
  756     }
  757     double const storeTDelay = difftime(time(NULL), storeTBegin);
  758     DISPLAY(" Stored %llu hash candidates in %s \n",
  759             (unsigned long long) nbCandidates, displayDelay(storeTDelay));
  760     free_Filter(bf);
  761     free_sampleFactory(sf);
  762 
  763 
  764     /* === step 3: look for duplicates === */
  765     time_t const sortTBegin = time(NULL);
  766     DISPLAY(" Sorting candidates... ");
  767     fflush(NULL);
  768     if ((htype == ht64) || (htype == ht32)) {
  769         /*
  770          * Use C++'s std::sort, as it's faster than C stdlib's qsort, and
  771          * doesn't suffer from gnuc_libsort's memory expansion
  772          */
  773         sort64(hashCandidates, nbCandidates);
  774     } else {
  775         assert(htype == ht128);
  776         sort128(hashCandidates, nbCandidates); /* sort with custom comparator */
  777     }
  778     double const sortTDelay = difftime(time(NULL), sortTBegin);
  779     DISPLAY(" Completed in %s \n", displayDelay(sortTDelay));
  780 
  781     /* scan and count duplicates */
  782     time_t const countBegin = time(NULL);
  783     DISPLAY(" Looking for duplicates: ");
  784     fflush(NULL);
  785     size_t collisions = 0;
  786     for (size_t n=1; n<nbCandidates; n++) {
  787         if (isEqual(hashCandidates, n, n-1, htype)) {
  788             printf("collision: ");
  789             printHash(hashCandidates, n, htype);
  790             printf(" / ");
  791             printHash(hashCandidates, n-1, htype);
  792             printf(" \n");
  793             collisions++;
  794     }   }
  795 
  796     if (!filter /* all candidates */ && display /*single thead*/ ) {
  797         /* check partial bitfields (high bits) */
  798         DISPLAY(" \n");
  799         int const hashBits = getNbBits_fromHtype(htype);
  800         double worstRatio = 0.;
  801         int worstNbHBits = 0;
  802         for (int nbHBits = 1; nbHBits < hashBits; nbHBits++) {
  803             uint64_t const nbSlots = (uint64_t)1 << nbHBits;
  804             double const expectedCollisions = estimateNbCollisions(nbCandidates, nbHBits);
  805             if ( (nbSlots > nbCandidates * 100)  /* within range for meaningfull collision analysis results */
  806               && (expectedCollisions > 18.0) ) {
  807                 int const rShift = hashBits - nbHBits;
  808                 size_t HBits_collisions = 0;
  809                 for (size_t n=1; n<nbCandidates; n++) {
  810                     if (isHighEqual(hashCandidates, n, n-1, htype, rShift)) {
  811                         HBits_collisions++;
  812                 }   }
  813                 double const collisionRatio = (double)HBits_collisions / expectedCollisions;
  814                 if (collisionRatio > 2.0) DISPLAY("WARNING !!!  ===> ");
  815                 DISPLAY(" high %i bits: %zu collision (%.1f expected): x%.2f \n",
  816                         nbHBits, HBits_collisions, expectedCollisions, collisionRatio);
  817                 if (collisionRatio > worstRatio) {
  818                     worstNbHBits = nbHBits;
  819                     worstRatio = collisionRatio;
  820         }   }   }
  821         DISPLAY("Worst collision ratio at %i high bits: x%.2f \n",
  822                 worstNbHBits, worstRatio);
  823     }
  824     double const countDelay = difftime(time(NULL), countBegin);
  825     DISPLAY(" Completed in %s \n", displayDelay(countDelay));
  826 
  827     /* clean and exit */
  828     free (hashCandidates);
  829 
  830 #if 0  /* debug */
  831     for (size_t n=0; n<nbCandidates; n++)
  832         printf("0x%016llx \n", (unsigned long long)hashCandidates[n]);
  833 #endif
  834 
  835     if (param.resultPtr) param.resultPtr->nbCollisions = collisions;
  836     return collisions;
  837 }
  838 
  839 
  840 
  841 #if defined(__MACH__) || defined(__linux__)
  842 #include <sys/resource.h>
  843 static size_t getProcessMemUsage(int children)
  844 {
  845     struct rusage stats;
  846     if (getrusage(children ? RUSAGE_CHILDREN : RUSAGE_SELF, &stats) == 0)
  847       return (size_t)stats.ru_maxrss;
  848     return 0;
  849 }
  850 #else
  851 static size_t getProcessMemUsage(int ignore) { return 0; }
  852 #endif
  853 
  854 void time_collisions(searchCollisions_parameters param)
  855 {
  856     uint64_t totalH = param.nbH;
  857     int hashID = param.hashID;
  858     int display = param.display;
  859 
  860     /* init */
  861     assert(0 <= hashID && hashID < HASH_FN_TOTAL);
  862     //const char* const hname = hashfnTable[hashID].name;
  863     int const hwidth = hashfnTable[hashID].bits;
  864     if (totalH == 0) totalH = select_nbh(hwidth);
  865     double const targetColls = estimateNbCollisions(totalH, hwidth);
  866 
  867     /* Start the timer to measure start/end of hashing + collision detection. */
  868     time_t const programTBegin = time(NULL);
  869 
  870     /* Generate hashes, and count collisions */
  871     size_t const collisions = search_collisions(param);
  872 
  873     /* display results */
  874     double const programTDelay = difftime(time(NULL), programTBegin);
  875     size_t const programBytesSelf = getProcessMemUsage(0);
  876     size_t const programBytesChildren = getProcessMemUsage(1);
  877     DISPLAY("\n\n");
  878     DISPLAY("===>   Found %llu collisions (x%.2f, %.1f expected) in %s\n",
  879             (unsigned long long)collisions,
  880             (double)collisions / targetColls,
  881             targetColls,
  882             displayDelay(programTDelay));
  883     if (programBytesSelf)
  884       DISPLAY("===>   MaxRSS(self) %zuMB, MaxRSS(children) %zuMB\n",
  885               programBytesSelf>>20,
  886               programBytesChildren>>20);
  887     DISPLAY("------------------------------------------ \n");
  888 }
  889 
  890 // wrapper for pthread interface
  891 void MT_searchCollisions(void* payload)
  892 {
  893     search_collisions(*(searchCollisions_parameters*)payload);
  894 }
  895 
  896 /* ===  Command Line  === */
  897 
  898 /*!
  899  * readU64FromChar():
  900  * Allows and interprets K, KB, KiB, M, MB and MiB suffix.
  901  * Will also modify `*stringPtr`, advancing it to the position where it stopped reading.
  902  */
  903 static uint64_t readU64FromChar(const char** stringPtr)
  904 {
  905     static uint64_t const max = (((uint64_t)(-1)) / 10) - 1;
  906     uint64_t result = 0;
  907     while ((**stringPtr >='0') && (**stringPtr <='9')) {
  908         assert(result < max);
  909         result *= 10;
  910         result += (unsigned)(**stringPtr - '0');
  911         (*stringPtr)++ ;
  912     }
  913     if ((**stringPtr=='K') || (**stringPtr=='M') || (**stringPtr=='G')) {
  914         uint64_t const maxK = ((uint64_t)(-1)) >> 10;
  915         assert(result < maxK);
  916         result <<= 10;
  917         if ((**stringPtr=='M') || (**stringPtr=='G')) {
  918             assert(result < maxK);
  919             result <<= 10;
  920             if (**stringPtr=='G') {
  921                 assert(result < maxK);
  922                 result <<= 10;
  923             }
  924         }
  925         (*stringPtr)++;  /* skip `K` or `M` */
  926         if (**stringPtr=='i') (*stringPtr)++;
  927         if (**stringPtr=='B') (*stringPtr)++;
  928     }
  929     return result;
  930 }
  931 
  932 
  933 /**
  934  * longCommandWArg():
  935  * Checks if *stringPtr is the same as longCommand.
  936  * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
  937  * @return 0 and doesn't modify *stringPtr otherwise.
  938  */
  939 static int longCommandWArg(const char** stringPtr, const char* longCommand)
  940 {
  941     assert(longCommand); assert(stringPtr); assert(*stringPtr);
  942     size_t const comSize = strlen(longCommand);
  943     int const result = !strncmp(*stringPtr, longCommand, comSize);
  944     if (result) *stringPtr += comSize;
  945     return result;
  946 }
  947 
  948 
  949 #include "pool.h"
  950 
  951 /*
  952  * As some hashes use different algorithms depending on input size,
  953  * it can be necessary to test multiple input sizes
  954  * to paint an accurate picture of collision performance
  955  */
  956 #define SAMPLE_SIZE_DEFAULT 256
  957 #define HASHFN_ID_DEFAULT 0
  958 
  959 void help(const char* exeName)
  960 {
  961     printf("usage: %s [hashName] [opt] \n\n", exeName);
  962     printf("list of hashNames:");
  963     printf("%s ", hashfnTable[0].name);
  964     for (int i=1; i < HASH_FN_TOTAL; i++) {
  965         printf(", %s ", hashfnTable[i].name);
  966     }
  967     printf(" \n");
  968     printf("Default hashName is %s\n", hashfnTable[HASHFN_ID_DEFAULT].name);
  969 
  970     printf(" \n");
  971     printf("Optional parameters: \n");
  972     printf("  --nbh=NB       Select nb of hashes to generate (%llu by default) \n", (unsigned long long)select_nbh(64));
  973     printf("  --filter       Activates the filter. Slower, but reduces memory usage for the same nb of hashes.\n");
  974     printf("  --threadlog=NB Use 2^NB threads.\n");
  975     printf("  --len=MB       Set length of the input (%i bytes by default) \n", SAMPLE_SIZE_DEFAULT);
  976 }
  977 
  978 int bad_argument(const char* exeName)
  979 {
  980     printf("incorrect command: \n");
  981     help(exeName);
  982     return 1;
  983 }
  984 
  985 
  986 int main(int argc, const char** argv)
  987 {
  988     if (sizeof(size_t) < 8) return 1;  // cannot work on systems without ability to allocate objects >= 4 GB
  989 
  990     assert(argc > 0);
  991     const char* const exeName = argv[0];
  992     uint64_t totalH = 0;  /* auto, based on nbBits */
  993     int bflog = 0;    /* auto */
  994     int filter = 0;   /* disabled */
  995     size_t sampleSize = SAMPLE_SIZE_DEFAULT;
  996     int hashID = HASHFN_ID_DEFAULT;
  997     int threadlog = 0;
  998     uint64_t prngSeed = 0;
  999 
 1000     int arg_nb;
 1001     for (arg_nb = 1; arg_nb < argc; arg_nb++) {
 1002         const char** arg = argv + arg_nb;
 1003 
 1004         if (!strcmp(*arg, "-h")) { help(exeName); return 0; }
 1005         if (longCommandWArg(arg, "-T")) { threadlog = (int)readU64FromChar(arg); continue; }
 1006 
 1007         if (!strcmp(*arg, "--filter"))    { filter=1; continue; }
 1008         if (!strcmp(*arg, "--no-filter")) { filter=0; continue; }
 1009 
 1010         if (longCommandWArg(arg, "--seed")) { prngSeed = readU64FromChar(arg); continue; }
 1011         if (longCommandWArg(arg, "--nbh=")) { totalH = readU64FromChar(arg); continue; }
 1012         if (longCommandWArg(arg, "--filter=")) { filter=1; bflog = (int)readU64FromChar(arg); assert(bflog < 64); continue; }
 1013         if (longCommandWArg(arg, "--filterlog=")) { filter=1; bflog = (int)readU64FromChar(arg); assert(bflog < 64); continue; }
 1014         if (longCommandWArg(arg, "--size=")) { sampleSize = (size_t)readU64FromChar(arg); continue; }
 1015         if (longCommandWArg(arg, "--len=")) { sampleSize = (size_t)readU64FromChar(arg); continue; }
 1016         if (longCommandWArg(arg, "--threadlog=")) { threadlog = (int)readU64FromChar(arg); continue; }
 1017 
 1018         /* argument understood as hash name (must be correct) */
 1019         int hnb;
 1020         for (hnb=0; hnb < HASH_FN_TOTAL; hnb++) {
 1021             if (!strcmp(*arg, hashfnTable[hnb].name)) { hashID = hnb; break; }
 1022         }
 1023         if (hnb == HASH_FN_TOTAL) return bad_argument(exeName);
 1024     }
 1025 
 1026     /* init */
 1027     const char* const hname = hashfnTable[hashID].name;
 1028     int const hwidth = hashfnTable[hashID].bits;
 1029     if (totalH == 0) totalH = select_nbh(hwidth);
 1030     double const targetColls = estimateNbCollisions(totalH, hwidth);
 1031     if (bflog == 0) bflog = highestBitSet(totalH) + 1;   /* auto-size filter */
 1032     if (!filter) bflog = -1; // disable filter
 1033 
 1034     if (sizeof(size_t) < 8)
 1035       EXIT("This program has not been validated on architectures other than "
 1036            "64bit \n");
 1037 
 1038     printf(" *** Collision tester for 64+ bit hashes ***  \n\n");
 1039     printf("Testing %s algorithm (%i-bit) \n", hname, hwidth);
 1040     printf("This program will allocate a lot of memory,\n");
 1041     printf("generate %llu %i-bit hashes from samples of %u bytes, \n",
 1042             (unsigned long long)totalH, hwidth, (unsigned)sampleSize);
 1043     printf("and attempt to produce %.0f collisions. \n\n", targetColls);
 1044 
 1045     int const nbThreads = 1 << threadlog;
 1046     if (nbThreads <= 0) EXIT("Invalid --threadlog value.");
 1047 
 1048     if (nbThreads == 1) {
 1049 
 1050         searchCollisions_parameters params;
 1051         params.nbH = totalH;
 1052         params.mask = 0;
 1053         params.maskSelector = 0;
 1054         params.sampleSize = sampleSize;
 1055         params.filterLog = bflog;
 1056         params.hashID = hashID;
 1057         params.display = 1;
 1058         params.resultPtr = NULL;
 1059         params.prngSeed = prngSeed;
 1060         params.nbThreads = 1;
 1061         time_collisions(params);
 1062 
 1063     } else { /*  nbThreads > 1 */
 1064 
 1065         /* use multithreading */
 1066         if (threadlog >= 30) EXIT("too many threads requested");
 1067         if ((uint64_t)nbThreads > (totalH >> 16))
 1068             EXIT("too many threads requested");
 1069         if (bflog > 0 && threadlog > (bflog-10))
 1070             EXIT("too many threads requested");
 1071         printf("using %i threads ... \n", nbThreads);
 1072 
 1073         /* allocation */
 1074         time_t const programTBegin = time(NULL);
 1075         POOL_ctx* const pt = POOL_create((size_t)nbThreads, 1);
 1076         if (!pt) EXIT("not enough memory for threads");
 1077         searchCollisions_results* const MTresults = calloc (sizeof(searchCollisions_results), (size_t)nbThreads);
 1078         if (!MTresults) EXIT("not enough memory");
 1079         searchCollisions_parameters* const MTparams = calloc (sizeof(searchCollisions_parameters), (size_t)nbThreads);
 1080         if (!MTparams) EXIT("not enough memory");
 1081 
 1082         /* distribute jobs */
 1083         for (int tnb=0; tnb<nbThreads; tnb++) {
 1084             MTparams[tnb].nbH = totalH;
 1085             MTparams[tnb].mask = (uint64_t)nbThreads - 1;
 1086             MTparams[tnb].sampleSize = sampleSize;
 1087             MTparams[tnb].filterLog = bflog ? bflog - threadlog : 0;
 1088             MTparams[tnb].hashID = hashID;
 1089             MTparams[tnb].display = 0;
 1090             MTparams[tnb].resultPtr = MTresults+tnb;
 1091             MTparams[tnb].prngSeed = prngSeed;
 1092             MTparams[tnb].maskSelector = (uint64_t)tnb;
 1093             POOL_add(pt, MT_searchCollisions, MTparams + tnb);
 1094         }
 1095         POOL_free(pt);  /* actually joins and free */
 1096 
 1097         /* Gather results */
 1098         uint64_t nbCollisions=0;
 1099         for (int tnb=0; tnb<nbThreads; tnb++) {
 1100             nbCollisions += MTresults[tnb].nbCollisions;
 1101         }
 1102 
 1103         double const programTDelay = difftime(time(NULL), programTBegin);
 1104         size_t const programBytesSelf = getProcessMemUsage(0);
 1105         size_t const programBytesChildren = getProcessMemUsage(1);
 1106         printf("\n\n");
 1107         printf("===>   Found %llu collisions (x%.2f, %.1f expected) in %s\n",
 1108                 (unsigned long long)nbCollisions,
 1109                 (double)nbCollisions / targetColls,
 1110                 targetColls,
 1111                 displayDelay(programTDelay));
 1112         if (programBytesSelf)
 1113           printf("===>   MaxRSS(self) %zuMB, MaxRSS(children) %zuMB\n",
 1114                  programBytesSelf>>20,
 1115                  programBytesChildren>>20);
 1116         printf("------------------------------------------ \n");
 1117 
 1118         /* Clean up */
 1119         free(MTparams);
 1120         free(MTresults);
 1121     }
 1122 
 1123     return 0;
 1124 }