"Fossies" - the Fresh Open Source Software Archive

Member "cb2bib-2.0.1/src/c2b/txtmatcher.cpp" (12 Feb 2021, 15492 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "txtmatcher.cpp" see the Fossies "Dox" file reference documentation.

    1 /***************************************************************************
    2  *   Copyright (C) 2004-2021 by Pere Constans
    3  *   constans@molspaces.com
    4  *   cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
    5  *   See the LICENSE file that comes with this distribution.
    6  ***************************************************************************/
    7 #include "txtmatcher.h"
    8 
    9 #ifdef C2B_USE_TXTMATCHER_AVX2
   10 #include <immintrin.h>
   11 #endif
   12 #ifdef C2B_USE_TXTMATCHER_SSE2
   13 #include <emmintrin.h>
   14 #endif
   15 
   16 static const double _upper_case_letter_frequency[] =
   17 {
   18     //  A          B          C          D          E          F          G          H          I
   19     0.0461215, 0.0219791, 0.0542205, 0.0247118, 0.0243137, 0.0249272, 0.0155418, 0.0318637, 0.0319445,
   20     //  J          K          L          M          N          O          P          Q          R
   21     0.0185178, 0.0110203, 0.0211695, 0.0345829, 0.0251332, 0.0222969, 0.0335058, 0.0034723, 0.0278396,
   22     //  S          T          U          V          W          X          Y          Z
   23     0.0433791, 0.0483445, 0.0070459, 0.0108463, 0.0114125, 0.0053894, 0.0057301, 0.0039877
   24 };
   25 static const double _lower_case_letter_frequency[] =
   26 {
   27     //  a          b          c          d          e          f          g          h          i
   28     0.6139480, 0.1124940, 0.3052070, 0.2845240, 0.9756860, 0.1816290, 0.1361380, 0.3223960, 0.6192860,
   29     //  j          k          l          m          n          o          p          q          r
   30     0.0084664, 0.0315632, 0.3386540, 0.2145900, 0.5768830, 0.5937770, 0.1803630, 0.0162243, 0.5046140,
   31     //  s          t          u          v          w          x          y          z
   32     0.4905200, 0.6968060, 0.2104950, 0.0785456, 0.0858753, 0.0380061, 0.1229070, 0.0193867
   33 };
   34 
   35 
   36 /**
   37    Specialized substring search for English texts
   38 
   39 */
   40 txtmatcher::txtmatcher() : _case_sensitive(true), _chook(-1), _frequency(10000) {}
   41 
   42 txtmatcher::txtmatcher(const QString& spattern, const Qt::CaseSensitivity cs, const int chook)
   43 {
   44     setPattern(spattern, cs, chook);
   45 }
   46 
   47 
   48 void txtmatcher::setPattern(const QString& spattern, const Qt::CaseSensitivity cs, const int chook)
   49 {
   50     _pattern = spattern;
   51     _case_sensitive = cs == Qt::CaseSensitive;
   52     const int m(_pattern.size());
   53 #ifdef C2B_USE_TXTMATCHER_SCALAR
   54     _padded_pattern = _pattern;
   55 #else
   56 #ifdef C2B_USE_TXTMATCHER_SSE2
   57     _padded_pattern.fill(QChar(65535), std::max(m, 8));
   58 #endif
   59 #ifdef C2B_USE_TXTMATCHER_AVX2
   60     _padded_pattern.fill(QChar(65535), std::max(m, 16));
   61 #endif
   62     for (int i = 0; i < m; ++i)
   63         _padded_pattern[i] = _pattern.at(i);
   64 #endif
   65     if (_case_sensitive)
   66     {
   67         _lcp.clear();
   68         _ucp.clear();
   69     }
   70     else
   71     {
   72         // Safe character case folding
   73         _lcp = _padded_pattern;
   74         for (int i = 0; i < m; ++i)
   75             _lcp[i] = _padded_pattern.at(i).toLower();
   76         _ucp = _padded_pattern;
   77         for (int i = 0; i < m; ++i)
   78             _ucp[i] = _padded_pattern.at(i).toUpper();
   79     }
   80     if (chook < 0 || chook >= m)
   81         _set_lowest_frequency_character(_pattern, _case_sensitive);
   82     else
   83     {
   84         _chook = chook;
   85         _frequency = 0;
   86     }
   87 }
   88 
   89 int txtmatcher::indexIn(const QChar* const text, const int length, const int from) const
   90 {
   91     if (from + _pattern.size() > length)
   92         return -1;
   93     if (_pattern.size() == 0) // Convention
   94         return from;
   95 #ifdef C2B_USE_TXTMATCHER_SCALAR
   96     if (_case_sensitive)
   97         return _find_case_sensitive(reinterpret_cast<const ushort*>(text), length, from);
   98     else
   99         return _find_case_insensitive(reinterpret_cast<const ushort*>(text), length, from);
  100 #endif
  101 #ifdef C2B_USE_TXTMATCHER_SSE2
  102     if (_pattern.size() > 8 || length - from < 5 * 8)
  103     {
  104         if (_case_sensitive)
  105             return _find_case_sensitive(reinterpret_cast<const ushort*>(text), length, from);
  106         else
  107             return _find_case_insensitive(reinterpret_cast<const ushort*>(text), length, from);
  108     }
  109     else
  110     {
  111         if (_case_sensitive)
  112             return _find_case_sensitive_sse2(reinterpret_cast<const ushort*>(text), length, from);
  113         else
  114             return _find_case_insensitive_sse2(reinterpret_cast<const ushort*>(text), length, from);
  115     }
  116 #endif
  117 #ifdef C2B_USE_TXTMATCHER_AVX2
  118     if (_pattern.size() > 16 || length - from < 5 * 16)
  119     {
  120         if (_case_sensitive)
  121             return _find_case_sensitive(reinterpret_cast<const ushort*>(text), length, from);
  122         else
  123             return _find_case_insensitive(reinterpret_cast<const ushort*>(text), length, from);
  124     }
  125     else
  126     {
  127         if (_case_sensitive)
  128             return _find_case_sensitive_avx2(reinterpret_cast<const ushort*>(text), length, from);
  129         else
  130             return _find_case_insensitive_avx2(reinterpret_cast<const ushort*>(text), length, from);
  131     }
  132 #endif
  133 }
  134 
  135 void txtmatcher::_set_lowest_frequency_character(const QString& spattern, const bool cs)
  136 {
  137     const int n(spattern.length());
  138     int lfc(-1);
  139     double lf(2);
  140     if (cs)
  141         for (int l = 0; l < n; ++l)
  142         {
  143             const ushort lcs(spattern.at(l).unicode());
  144             double cf(0);
  145             if (lcs > 96 && lcs < 123)
  146                 cf = _lower_case_letter_frequency[lcs - 97];
  147             else if (lcs > 64 && lcs < 91)
  148                 cf = _upper_case_letter_frequency[lcs - 65];
  149             if (cf < lf)
  150             {
  151                 lf = cf;
  152                 lfc = l;
  153             }
  154         }
  155     else
  156         for (int l = 0; l < n; ++l)
  157         {
  158             const ushort lcu(spattern.at(l).toLower().unicode());
  159             double cf(0);
  160             if (lcu > 96 && lcu < 123)
  161                 cf = _lower_case_letter_frequency[lcu - 97] + _upper_case_letter_frequency[lcu - 97];
  162             if (cf < lf)
  163             {
  164                 lf = cf;
  165                 lfc = l;
  166             }
  167         }
  168     _chook = lfc;
  169     _frequency = 1 + int(1000 * lf);
  170 }
  171 
  172 int txtmatcher::_find_case_sensitive(const ushort* const t, const int length, const int from) const
  173 {
  174     const int sp(_pattern.size());
  175     const int ch(_chook);
  176     const ushort* const p(reinterpret_cast<const ushort*>(_pattern.unicode()));
  177     const ushort pch(p[ch]);
  178     const int a(from + ch), b(length - sp + ch + 1);
  179 
  180     for (int i = a; i < b; ++i)
  181         if (t[i] == pch)
  182         {
  183             for (int j = 0; j < sp; ++j)
  184                 if (t[i - ch + j] != p[j])
  185                     goto next;
  186             return i - ch;
  187 next:
  188             continue;
  189         }
  190     return -1;
  191 }
  192 
  193 int txtmatcher::_find_case_insensitive(const ushort* const t, const int length, const int from) const
  194 {
  195     const int sp(_pattern.size());
  196     const int ch(_chook);
  197     const ushort* const lp(reinterpret_cast<const ushort*>(_lcp.unicode()));
  198     const ushort* const up(reinterpret_cast<const ushort*>(_ucp.unicode()));
  199     const ushort lpch(lp[ch]);
  200     const ushort upch(up[ch]);
  201     const int a(from + ch), b(length - sp + ch + 1);
  202 
  203     for (int i = a; i < b; ++i)
  204         if (t[i] == lpch || t[i] == upch)
  205         {
  206             for (int j = 0; j < sp; ++j)
  207                 if (t[i - ch + j] != lp[j] && t[i - ch + j] != up[j])
  208                     goto next;
  209             return i - ch;
  210 next:
  211             continue;
  212         }
  213     return -1;
  214 }
  215 
  216 /*
  217    SIMD implementations for _find_case_sensitive and _find_case_insensitive
  218 
  219    Optimizes short substring pattern matching (words and stemmed words) required
  220    in context and approximate searches
  221 
  222    The _find_case_sensitive and _find_case_insensitive routines use effective
  223    skipping provided by lowest frequency letters
  224 
  225 
  226 
  227    See B Smyth, Computing Patterns in Strings (2003), chapter 8, for a review on
  228    search techniques and letter frequency specialization
  229 
  230    See W Muła, SIMD-friendly algorithms for substring searching (2016), for
  231    generic SIMD implementations
  232 
  233 */
  234 #ifdef C2B_USE_TXTMATCHER_SSE2
  235 static const uint16_t _zpattern_sse2[] = { 0x0, 0x3, 0xf, 0x3f, 0xff, 0x3ff, 0xfff, 0x3fff, 0xffff };
  236 
  237 int txtmatcher::_find_case_sensitive_sse2(const ushort* const t, const int length, const int from) const
  238 {
  239     const int sp(_pattern.size());
  240     const int ch(_chook);
  241     const ushort* const p(reinterpret_cast<const ushort*>(_padded_pattern.unicode()));
  242     const ushort pch(p[ch]);
  243     const uint16_t zpattern(_zpattern_sse2[sp]);
  244 
  245     const __m128i ppattern(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p)));
  246     const __m128i spattern(_mm_set1_epi16(pch));
  247 
  248     const int tposition(length - 8 - 7 + 1);
  249     int cposition(from + ch);
  250 
  251     while (cposition < tposition)
  252     {
  253         __m128i text(_mm_loadu_si128(reinterpret_cast<const __m128i*>(t + cposition)));
  254         const uint16_t zsp(_mm_movemask_epi8(_mm_cmpeq_epi16(text, spattern)));
  255         if (zsp == uint16_t(0))
  256             cposition += 8;
  257         else
  258         {
  259             const int ctz(__builtin_ctz(zsp) / 2);
  260             cposition += ctz;
  261             text = _mm_loadu_si128(reinterpret_cast<const __m128i*>(t + cposition - ch));
  262             const uint16_t zp(_mm_movemask_epi8(_mm_cmpeq_epi16(text, ppattern)));
  263             if (zp == zpattern)
  264                 return cposition - ch;
  265             ++cposition;
  266         }
  267     }
  268     for (int i = cposition; i < length - sp + ch + 1; ++i)
  269         if (t[i] == pch)
  270         {
  271             for (int j = 0; j < sp; ++j)
  272                 if (t[i - ch + j] != p[j])
  273                     goto next;
  274             return i - ch;
  275 next:
  276             continue;
  277         }
  278     return -1;
  279 }
  280 
  281 int txtmatcher::_find_case_insensitive_sse2(const ushort* const t, const int length, const int from) const
  282 {
  283     const int sp(_pattern.size());
  284     const int ch(_chook);
  285     const ushort* const lp(reinterpret_cast<const ushort*>(_lcp.unicode()));
  286     const ushort* const up(reinterpret_cast<const ushort*>(_ucp.unicode()));
  287     const ushort lpch(lp[ch]);
  288     const ushort upch(up[ch]);
  289     const uint16_t zpattern(_zpattern_sse2[sp]);
  290 
  291     const __m128i lcpattern(_mm_loadu_si128(reinterpret_cast<const __m128i*>(lp)));
  292     const __m128i ucpattern(_mm_loadu_si128(reinterpret_cast<const __m128i*>(up)));
  293     const __m128i lspattern(_mm_set1_epi16(lpch));
  294     const __m128i uspattern(_mm_set1_epi16(upch));
  295 
  296     const int tposition(length - 8 - 7 + 1);
  297     int cposition(from + ch);
  298 
  299     while (cposition < tposition)
  300     {
  301         __m128i text(_mm_loadu_si128(reinterpret_cast<const __m128i*>(t + cposition)));
  302         const uint16_t zlsp(_mm_movemask_epi8(_mm_cmpeq_epi16(text, lspattern)));
  303         const uint16_t zusp(_mm_movemask_epi8(_mm_cmpeq_epi16(text, uspattern)));
  304         const uint16_t zsp(zlsp | zusp);
  305         if (zsp == uint16_t(0))
  306             cposition += 8;
  307         else
  308         {
  309             const int ctz(__builtin_ctz(zsp) / 2);
  310             cposition += ctz;
  311             text = _mm_loadu_si128(reinterpret_cast<const __m128i*>(t + cposition - ch));
  312             const uint16_t zlp(_mm_movemask_epi8(_mm_cmpeq_epi16(text, lcpattern)));
  313             const uint16_t zup(_mm_movemask_epi8(_mm_cmpeq_epi16(text, ucpattern)));
  314             const uint16_t zp(zlp | zup);
  315             if (zp == zpattern)
  316                 return cposition - ch;
  317             ++cposition;
  318         }
  319     }
  320     for (int i = cposition; i < length - sp + ch + 1; ++i)
  321         if (t[i] == lpch || t[i] == upch)
  322         {
  323             for (int j = 0; j < sp; ++j)
  324                 if (t[i - ch + j] != lp[j] && t[i - ch + j] != up[j])
  325                     goto next;
  326             return i - ch;
  327 next:
  328             continue;
  329         }
  330     return -1;
  331 }
  332 #endif
  333 
  334 #ifdef C2B_USE_TXTMATCHER_AVX2
  335 static const uint32_t _zpattern_avx2[] = { 0x0,      0x3,       0xf,       0x3f,       0xff,      0x3ff,
  336                                            0xfff,    0x3fff,    0xffff,    0x3ffff,    0xfffff,   0x3fffff,
  337                                            0xffffff, 0x3ffffff, 0xfffffff, 0x3fffffff, 0xffffffff
  338                                          };
  339 
  340 int txtmatcher::_find_case_sensitive_avx2(const ushort* const t, const int length, const int from) const
  341 {
  342     const int sp(_pattern.size());
  343     const int ch(_chook);
  344     const ushort* const p(reinterpret_cast<const ushort*>(_padded_pattern.unicode()));
  345     const ushort pch(p[ch]);
  346     const uint32_t zpattern(_zpattern_avx2[sp]);
  347 
  348     const __m256i ppattern(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p)));
  349     const __m256i spattern(_mm256_set1_epi16(pch));
  350 
  351     const int tposition(length - 16 - 15 + 1);
  352     int cposition(from + ch);
  353 
  354     while (cposition < tposition)
  355     {
  356         __m256i text(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(t + cposition)));
  357         const uint32_t zsp(_mm256_movemask_epi8(_mm256_cmpeq_epi16(text, spattern)));
  358         if (zsp == uint32_t(0))
  359             cposition += 16;
  360         else
  361         {
  362             const int ctz(__builtin_ctz(zsp) / 2);
  363             cposition += ctz;
  364             text = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(t + cposition - ch));
  365             const uint32_t zp(_mm256_movemask_epi8(_mm256_cmpeq_epi16(text, ppattern)));
  366             if (zp == zpattern)
  367                 return cposition - ch;
  368             ++cposition;
  369         }
  370     }
  371     for (int i = cposition; i < length - sp + ch + 1; ++i)
  372         if (t[i] == pch)
  373         {
  374             for (int j = 0; j < sp; ++j)
  375                 if (t[i - ch + j] != p[j])
  376                     goto next;
  377             return i - ch;
  378 next:
  379             continue;
  380         }
  381     return -1;
  382 }
  383 
  384 int txtmatcher::_find_case_insensitive_avx2(const ushort* const t, const int length, const int from) const
  385 {
  386     const int sp(_pattern.size());
  387     const int ch(_chook);
  388     const ushort* const lp(reinterpret_cast<const ushort*>(_lcp.unicode()));
  389     const ushort* const up(reinterpret_cast<const ushort*>(_ucp.unicode()));
  390     const ushort lpch(lp[ch]);
  391     const ushort upch(up[ch]);
  392     const uint32_t zpattern(_zpattern_avx2[sp]);
  393 
  394     const __m256i lcpattern(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(lp)));
  395     const __m256i ucpattern(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(up)));
  396     const __m256i lspattern(_mm256_set1_epi16(lpch));
  397     const __m256i uspattern(_mm256_set1_epi16(upch));
  398 
  399     const int tposition(length - 16 - 15 + 1);
  400     int cposition(from + ch);
  401 
  402     while (cposition < tposition)
  403     {
  404         __m256i text(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(t + cposition)));
  405         const uint32_t zlsp(_mm256_movemask_epi8(_mm256_cmpeq_epi16(text, lspattern)));
  406         const uint32_t zusp(_mm256_movemask_epi8(_mm256_cmpeq_epi16(text, uspattern)));
  407         const uint32_t zsp(zlsp | zusp);
  408         if (zsp == uint32_t(0))
  409             cposition += 16;
  410         else
  411         {
  412             const int ctz(__builtin_ctz(zsp) / 2);
  413             cposition += ctz;
  414             text = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(t + cposition - ch));
  415             const uint32_t zlp(_mm256_movemask_epi8(_mm256_cmpeq_epi16(text, lcpattern)));
  416             const uint32_t zup(_mm256_movemask_epi8(_mm256_cmpeq_epi16(text, ucpattern)));
  417             const uint32_t zp(zlp | zup);
  418             if (zp == zpattern)
  419                 return cposition - ch;
  420             ++cposition;
  421         }
  422     }
  423     for (int i = cposition; i < length - sp + ch + 1; ++i)
  424         if (t[i] == lpch || t[i] == upch)
  425         {
  426             for (int j = 0; j < sp; ++j)
  427                 if (t[i - ch + j] != lp[j] && t[i - ch + j] != up[j])
  428                     goto next;
  429             return i - ch;
  430 next:
  431             continue;
  432         }
  433     return -1;
  434 }
  435 #endif