"Fossies" - the Fresh Open Source Software Archive

Member "cb2bib-2.0.1/src/c2b/stemMatcher.cpp" (12 Feb 2021, 6441 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "stemMatcher.cpp", see the Fossies "Dox" file reference documentation.

    1 /***************************************************************************
    2  *   Copyright (C) 2004-2021 by Pere Constans
    3  *   constans@molspaces.com
    4  *   cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
    5  *   See the LICENSE file that comes with this distribution.
    6  ***************************************************************************/
    7 #include "stemMatcher.h"
    8 
    9 #include "cb2bib_utilities.h"
   10 #include "stemmer.h"
   11 #include "triads.h"
   12 
   13 #include <limits>
   14 
   15 
   16 stemMatcher::stemMatcher()
   17     : _hook(-1),
   18       _stretch(0),
   19       _substring_count(0),
   20       _substring_rex_count(0),
   21       _substring_txt_count(0),
   22       _matched_length(-1),
   23       _p0(0),
   24       _pn(0) {}
   25 
   26 stemMatcher::stemMatcher(const QString& pattern, const Qt::CaseSensitivity cs) : _p0(0), _pn(0)
   27 {
   28     setPattern(pattern, cs);
   29 }
   30 
   31 
   32 void stemMatcher::setPattern(const QString& pattern, const Qt::CaseSensitivity cs)
   33 {
   34     _hook = -1;
   35     _matched_length = -1;
   36     _stretch = 0;
   37 
   38     QStringList substrings;
   39     {
   40         const QRegExp rmnww("[\\W_]");
   41         const QString boundedt("\\b%1\\b");
   42         const QString ort("(:?%1)");
   43         const QString stretcht(".{0,%1}");
   44         const int minimum_word_length(4);
   45         stemmer ls;
   46 
   47         const QStringList segments(pattern.split(' ', QString::SkipEmptyParts));
   48         for (int s = 0; s < segments.count(); ++s)
   49         {
   50             const QStringList qors(segments.at(s).split('|', QString::SkipEmptyParts));
   51             QStringList so;
   52             int slength(0);
   53             for (int o = 0; o < qors.count(); ++o)
   54             {
   55                 slength = std::max(slength, qors.at(o).length());
   56                 const QStringList words(qors.at(o).split('_', QString::SkipEmptyParts));
   57                 QStringList sw;
   58                 for (int w = 0; w < words.count(); ++w)
   59                     if (words.at(w).startsWith('+'))
   60                     {
   61                         QString bw(words.at(w).right(words.at(w).length() - 1));
   62                         bw.replace(c2bUtils::nonLetter, stretcht.arg(1));
   63                         sw.append(boundedt.arg(bw));
   64                     }
   65                     else
   66                     {
   67                         QStringList wl(ls.stems(words.at(w)));
   68                         for (int l = 0; l < wl.count(); ++l)
   69                         {
   70                             const int nnl(wl.at(l).count(c2bUtils::nonLetter));
   71                             if (nnl > 0 && nnl < wl.at(l).length())
   72                                 wl[l].replace(c2bUtils::nonLetter, stretcht.arg(1));
   73                         }
   74                         sw.append(wl.count() == 1 ? wl.at(0) : ort.arg(wl.join('|')));
   75                     }
   76                 so.append(sw.join(stretcht.arg(25)));
   77             }
   78             substrings.append(so.count() == 1 ? so.at(0) : ort.arg(so.join('|')));
   79             _stretch += slength > minimum_word_length ? 35 : 10;
   80         }
   81 
   82         _subpatterns = pattern.split(rmnww, QString::SkipEmptyParts);
   83     }
   84 
   85     _substring_count = substrings.count();
   86     _substring_rex_count = 0;
   87     for (int i = 0; i < _substring_count; ++i)
   88         if (substrings.at(i).contains(c2bUtils::nonLetter))
   89             ++_substring_rex_count;
   90     _substring_txt_count = _substring_count - _substring_rex_count;
   91     _substrings_rex.resize(_substring_rex_count);
   92     _substrings_txt.resize(_substring_txt_count);
   93     _matched_lengths.fill(0, _substring_count);
   94     _sp0.resize(_substring_count);
   95     _signature_string.clear();
   96 
   97     for (int i = 0, ir = 0, it = 0; i < _substring_count; ++i)
   98         if (substrings.at(i).contains(c2bUtils::nonLetter))
   99         {
  100             QRegularExpression& re(_substrings_rex[ir++]);
  101 #if (QT_VERSION >= QT_VERSION_CHECK(5, 4, 0)) && (QT_VERSION < QT_VERSION_CHECK(5, 12, 0))
  102             const QRegularExpression::PatternOptions po(QRegularExpression::DontCaptureOption |
  103                     QRegularExpression::UseUnicodePropertiesOption |
  104                     QRegularExpression::OptimizeOnFirstUsageOption);
  105 #else
  106             const QRegularExpression::PatternOptions po(QRegularExpression::DontCaptureOption |
  107                     QRegularExpression::UseUnicodePropertiesOption);
  108 #endif
  109             re.setPattern(substrings.at(i));
  110             re.setPatternOptions(cs == Qt::CaseSensitive ? po : po | QRegularExpression::CaseInsensitiveOption);
  111         }
  112         else
  113         {
  114             txtmatcher& tm(_substrings_txt[it++]);
  115             tm.setPattern(substrings.at(i), cs);
  116             _signature_string += substrings.at(i) + ' ';
  117         }
  118     _signature_string.chop(1);
  119 
  120     for (int i = 0, lf = std::numeric_limits<int>::max(); i < _substring_txt_count; ++i)
  121     {
  122         const int f(_substrings_txt.at(i).frequency() * triads::textFrequency(_substrings_txt.at(i).pattern()));
  123         if (f < lf)
  124         {
  125             lf = f;
  126             _hook = i;
  127         }
  128     }
  129     if (_hook == -1)
  130         for (int i = 0, lf = std::numeric_limits<int>::max(); i < _substring_rex_count; ++i)
  131         {
  132             const int f(triads::textFrequency(_substrings_rex.at(i).pattern()));
  133             if (f < lf)
  134             {
  135                 lf = f;
  136                 _hook = _substring_txt_count + i;
  137             }
  138         }
  139 #ifdef C2B_DEBUG_SEARCHING
  140     for (int i = 0; i < _substring_txt_count; ++i)
  141         qDebug() << "subpattern txt     " << _substrings_txt.at(i).pattern();
  142     for (int i = 0; i < _substring_rex_count; ++i)
  143         qDebug() << "subpattern rex     " << _substrings_rex.at(i).pattern();
  144     if (_hook < _substring_txt_count)
  145         qDebug() << "hook               " << _substrings_txt.at(_hook).pattern() << _hook;
  146     else
  147         qDebug() << "hook               " << _substrings_rex.at(_hook - _substring_txt_count).pattern() << _hook;
  148     qDebug() << "_stretch           " << _stretch;
  149     qDebug() << "subpatternstrings  " << subpatternStrings();
  150     qDebug() << "signaturestring    " << signatureString();
  151 #endif
  152 }
  153 
  154 int stemMatcher::indexIn(const QString& text, const int from) const
  155 {
  156     _matched_length = -1;
  157     if (_hook == -1) // Uninitialized
  158         return -1;
  159     _p0 = from;
  160     if (_p0 < 0)
  161         _p0 = 0;
  162     _pn = text.length();
  163     if (_pn == 0)
  164         return -1;
  165 
  166     int hp(_p0);
  167     int p(-1);
  168     while (p == -1)
  169     {
  170         hp = _index_in(_hook, text, hp);
  171         if (hp == -1)
  172             return -1;
  173         p = _index_around(text, hp);
  174         hp += _matched_lengths.at(_hook);
  175     }
  176     return p;
  177 }