"Fossies" - the Fresh Open Source Software Archive

Member "cb2bib-2.0.1/src/c2b/stemmer.h" (12 Feb 2021, 8797 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "stemmer.h", see the Fossies "Dox" file reference documentation.

    1 /***************************************************************************
    2  *   Copyright (C) 2004-2021 by Pere Constans
    3  *   constans@molspaces.com
    4  *   cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
    5  *   See the LICENSE file that comes with this distribution.
    6  ***************************************************************************/
    7 #ifndef STEMMER_H
    8 #define STEMMER_H
    9 
   10 #include <QRegExp>
   11 #include <QString>
   12 
   13 
   14 struct stemmer
   15 {
   16     stemmer()
   17         : digits("\\d"),
   18           suffix_1_letter("(:?a|e|f|i|o|r|s|t|x|y)$", Qt::CaseInsensitive),
   19           suffix_2_letter("(:?as|ce|er|es|fs|ic|id|ie|is|on|os|se|sm|st|um|us|ys)$", Qt::CaseInsensitive),
   20           suffix_2_letter_inflected("(:?ed|en|er)$", Qt::CaseInsensitive),
   21           suffix_2_letter_length_5("(:?ed|er)$", Qt::CaseInsensitive),
   22           suffix_3_letter("(:?ers|ics|ies|ion|ish|ism|ist|ium|oes|ons|ora|sms|sts|ums|ves|xes)$", Qt::CaseInsensitive),
   23           suffix_3_letter_length_7("(:?ers|ies|ish|ora|sms|ums|ves|xes)$", Qt::CaseInsensitive),
   24           suffix_4_letter("(:?ical|ions|isms|ists)$", Qt::CaseInsensitive),
   25           suffix_double_esses("(:?sses|ssis|ssus)$", Qt::CaseInsensitive),
   26           suffix_isz("(:?ise|ised|iser|isers|ises|ising|isis|ize|ized|izer|izers|izes|izing)$", Qt::CaseInsensitive),
   27           suffix_man("(:?man|men)$", Qt::CaseInsensitive),
   28           suffix_misc("(:?mme|mmes|lty|lties|lity|lities)$", Qt::CaseInsensitive),
   29           suffix_ogues("(:?ogue|ogues)$", Qt::CaseInsensitive),
   30           suffix_ours("(:?or|ors|our|ours)$", Qt::CaseInsensitive),
   31           suffix_reer("(:?tre|tres|bre|bres)$", Qt::CaseInsensitive),
   32           suffix_sz("(:?isation|ization|ysation|yzation|isations|izations|ysations|yzations)$", Qt::CaseInsensitive),
   33           suffix_ysz("(:?yse|ysed|yser|ysers|yses|ysing|ysis|yze|yzed|yzer|yzers|yzes|yzing)$", Qt::CaseInsensitive)
   34     {
   35         dictionary.insert("alpha", QChar(913));
   36         dictionary.insert("beta", QChar(914));
   37         dictionary.insert("gamma", QChar(915));
   38         dictionary.insert("delta", QChar(916));
   39         dictionary.insert("epsilon", QChar(917));
   40         dictionary.insert("zeta", QChar(918));
   41         dictionary.insert("eta", QChar(919));
   42         dictionary.insert("theta", QChar(920));
   43         dictionary.insert("iota", QChar(921));
   44         dictionary.insert("kappa", QChar(922));
   45         dictionary.insert("lambda", QChar(923));
   46         dictionary.insert("mu", QChar(924));
   47         dictionary.insert("nu", QChar(925));
   48         dictionary.insert("xi", QChar(926));
   49         dictionary.insert("omicron", QChar(927));
   50         dictionary.insert("pi", QChar(928));
   51         dictionary.insert("rho", QChar(929));
   52         dictionary.insert("sigma", QChar(931));
   53         dictionary.insert("tau", QChar(932));
   54         dictionary.insert("upsilon", QChar(933));
   55         dictionary.insert("phi", QChar(934));
   56         dictionary.insert("chi", QChar(935));
   57         dictionary.insert("psi", QChar(936));
   58         dictionary.insert("omega", QChar(937));
   59 
   60         dictionary.insert("axes", "axis");
   61         dictionary.insert("axis", "axes");
   62         dictionary.insert("bases", "basis");
   63         dictionary.insert("basis", "bases");
   64         dictionary.insert("data", "datum");
   65         dictionary.insert("datum", "data");
   66         dictionary.insert("liked", "like");
   67 
   68         dictionary.insert("infinity", QChar(8734));
   69     }
   70 
   71     QString stem(const QString& word) const
   72     {
   73         const Qt::CaseSensitivity ci(Qt::CaseInsensitive);
   74         const int nl(word.length());
   75         if (nl < 4)
   76             return word;
   77 
   78         if (digits.indexIn(word) >= 0)
   79             return word;
   80 
   81         if (nl == 4)
   82         {
   83             if (word.endsWith(QLatin1Char('s'), ci))
   84                 return word.left(3);
   85             return word;
   86         }
   87 
   88         if (suffix_man.indexIn(word) >= 0)
   89             return word.left(nl - 2);
   90         if (suffix_reer.indexIn(word) >= 0)
   91             return word.left(nl - suffix_reer.matchedLength() + 1);
   92 
   93         if (nl == 5)
   94         {
   95             if (suffix_2_letter_length_5.indexIn(word) >= 0)
   96                 return word.left(3);
   97             if (suffix_1_letter.indexIn(word) >= 0)
   98                 return word.left(4);
   99             return word;
  100         }
  101 
  102         if (suffix_isz.indexIn(word) >= 0)
  103             return word.left(nl - suffix_isz.matchedLength() + 1);
  104         if (suffix_ysz.indexIn(word) >= 0)
  105             return word.left(nl - suffix_ysz.matchedLength() + 1);
  106         if (suffix_2_letter_inflected.indexIn(word) >= 0)
  107             return word.at(nl - 3) == word.at(nl - 4) || word.at(nl - 3).toLower() == QLatin1Char('i')
  108                    ? word.left(nl - 3)
  109                    : word.left(nl - 2);
  110         if (suffix_ours.indexIn(word) >= 0)
  111             return word.left(nl - suffix_ours.matchedLength() + 1);
  112         if (suffix_misc.indexIn(word) >= 0)
  113             return word.left(nl - suffix_misc.matchedLength() + 1);
  114 
  115         if (nl == 6)
  116         {
  117             if (word.endsWith(QLatin1String("ing"), ci))
  118                 return word.left(3);
  119             if (suffix_2_letter.indexIn(word) >= 0)
  120                 return word.left(4);
  121             if (suffix_1_letter.indexIn(word) >= 0)
  122                 return word.left(5);
  123             return word;
  124         }
  125 
  126         if (word.endsWith(QLatin1String("est"), ci))
  127             return word.at(nl - 4) == word.at(nl - 5) || word.at(nl - 4).toLower() == QLatin1Char('i')
  128                    ? word.left(nl - 4)
  129                    : word.left(nl - 3);
  130         if (word.endsWith(QLatin1String("ing"), ci))
  131             return word.at(nl - 4) == word.at(nl - 5) || word.at(nl - 4).toLower() == QLatin1Char('y')
  132                    ? word.left(nl - 4)
  133                    : word.left(nl - 3);
  134         if (word.endsWith(QLatin1String("ices"), ci))
  135             return word.left(nl - 3);
  136         if (word.endsWith(QLatin1String("uses"), ci))
  137             return word.at(nl - 5).toLower() == QLatin1Char('f') ? word.left(nl - 2) : word.left(nl - 4);
  138         if (suffix_ogues.indexIn(word) >= 0)
  139             return word.left(nl - suffix_ogues.matchedLength() + 2);
  140 
  141         if (nl == 7)
  142         {
  143             if (suffix_3_letter_length_7.indexIn(word) >= 0)
  144                 return word.left(4);
  145             if (suffix_2_letter.indexIn(word) >= 0)
  146                 return word.left(5);
  147             if (suffix_1_letter.indexIn(word) >= 0)
  148                 return word.left(6);
  149             return word;
  150         }
  151 
  152         if (word.endsWith(QLatin1String("ings"), ci))
  153             return word.at(nl - 5) == word.at(nl - 6) || word.at(nl - 5).toLower() == QLatin1Char('y')
  154                    ? word.left(nl - 5)
  155                    : word.left(nl - 4);
  156 
  157         if (nl > 9 && suffix_sz.indexIn(word) >= 0)
  158             return word.left(nl - suffix_sz.matchedLength() + 1);
  159         if (nl > 8 && suffix_4_letter.indexIn(word) >= 0)
  160             return word.left(nl - 4);
  161 
  162         if (suffix_3_letter.indexIn(word) >= 0)
  163             return word.left(nl - 3);
  164         if (suffix_double_esses.indexIn(word) >= 0)
  165             return word.left(nl - 3);
  166         if (suffix_2_letter.indexIn(word) >= 0)
  167             return word.left(nl - 2);
  168         if (suffix_1_letter.indexIn(word) >= 0)
  169             return word.left(nl - 1);
  170 
  171         return word;
  172     }
  173     QStringList stems(const QString& word) const
  174     {
  175         QStringList sts;
  176         {
  177             QString stripped(word);
  178             c2bUtils::stripDiacritics(stripped);
  179             if (stripped != word)
  180             {
  181                 sts.append(word);
  182                 sts.append(stripped);
  183                 return sts;
  184             }
  185         }
  186         {
  187             const QString lower(word.toLower());
  188             if (dictionary.contains(lower))
  189             {
  190                 sts.append(word);
  191                 sts.append(cased(word, dictionary.value(lower)));
  192                 return sts;
  193             }
  194         }
  195         sts.append(stem(word));
  196         return sts;
  197     }
  198     static QString cased(const QString& word, const QString& other)
  199     {
  200         const int n(std::min(word.length(), other.length()));
  201         QString wc(other);
  202         for (int i = 0; i < n; ++i)
  203             wc[i] = word.at(i).isUpper() ? wc.at(i).toUpper() : wc.at(i).toLower();
  204         return wc;
  205     }
  206 
  207     QHash<QString, QString> dictionary;
  208     QRegExp digits;
  209     QRegExp suffix_1_letter;
  210     QRegExp suffix_2_letter;
  211     QRegExp suffix_2_letter_inflected;
  212     QRegExp suffix_2_letter_length_5;
  213     QRegExp suffix_3_letter;
  214     QRegExp suffix_3_letter_length_7;
  215     QRegExp suffix_4_letter;
  216     QRegExp suffix_double_esses;
  217     QRegExp suffix_isz;
  218     QRegExp suffix_man;
  219     QRegExp suffix_misc;
  220     QRegExp suffix_ogues;
  221     QRegExp suffix_ours;
  222     QRegExp suffix_reer;
  223     QRegExp suffix_sz;
  224     QRegExp suffix_ysz;
  225 };
  226 
  227 #endif