"Fossies" - the Fresh Open Source Software Archive

Member "cb2bib-2.0.1/src/c2b/posTagger.cpp" (12 Feb 2021, 3641 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "posTagger.cpp", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.0.0_vs_2.0.1.

    1 /***************************************************************************
    2  *   Copyright (C) 2004-2021 by Pere Constans
    3  *   constans@molspaces.com
    4  *   cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
    5  *   See the LICENSE file that comes with this distribution.
    6  ***************************************************************************/
    7 #include "posTagger.h"
    8 
    9 #include "cb2bib_utilities.h"
   10 #include "settings.h"
   11 
   12 
   13 posTagger::posTagger()
   14     : _period('.'),
   15       _space(' '),
   16       _pos_a("A"),
   17       _pos_g("G"),
   18       _pos_n("N"),
   19       _pos_o("O"),
   20       _pos_r("R"),
   21       _pos_s("S"),
   22       _suffix_able("able"),
   23       _suffix_ae("ae"),
   24       _suffix_al("al"),
   25       _suffix_apostrophes("'s"),
   26       _suffix_ar("ar"),
   27       _suffix_ed("ed"),
   28       _suffix_ful("ful"),
   29       _suffix_ic("ic"),
   30       _suffix_ics("ics"),
   31       _suffix_ing("ing"),
   32       _suffix_is("is"),
   33       _suffix_ive("ive"),
   34       _suffix_lent("lent"),
   35       _suffix_less("less"),
   36       _suffix_like("like"),
   37       _suffix_ly("ly"),
   38       _suffix_ous("ous"),
   39       _suffix_s("s"),
   40       _suffix_ss("ss"),
   41       _suffix_us("us") {}
   42 
   43 
   44 bool posTagger::loadLexicon()
   45 {
   46     const QString lexfn(settings::instance()->fileName("cb2Bib/PosLexiconFile"));
   47     QFile lexf(lexfn);
   48     if (!lexf.open(QIODevice::ReadOnly | QIODevice::Text))
   49     {
   50         c2bUtils::warn(QObject::tr("Could not open POS lexicon file %1 for reading").arg(lexfn));
   51         return false;
   52     }
   53     QTextStream lex(&lexf);
   54     lex.setCodec("UTF-8");
   55     lex.setAutoDetectUnicode(true);
   56     const QChar comment('#');
   57     QString line;
   58 
   59     while (!lex.atEnd())
   60     {
   61         line = lex.readLine();
   62         if (line.isEmpty() || line.startsWith(comment))
   63             continue;
   64         const int n(line.toInt());
   65         for (int i = 0; i < n; ++i)
   66             _sentence_patterns.insert(lex.readLine(), QString());
   67         break;
   68     }
   69     while (!lex.atEnd())
   70     {
   71         line = lex.readLine();
   72         if (line.isEmpty() || line.startsWith(comment))
   73             continue;
   74         const int n(line.toInt());
   75         const QString tag(lex.readLine());
   76         for (int i = 0; i < n; ++i)
   77             _lexicon.insert(lex.readLine(), tag);
   78     }
   79     lexf.close();
   80     return _sentence_patterns.size() > 0 && _lexicon.size() > 0;
   81 }
   82 
   83 QString posTagger::tagged(const QString& text) const
   84 {
   85     const QStringList words(text.split(_space, QString::SkipEmptyParts));
   86     const int nw(words.count());
   87     QString t(_period);
   88     for (int i = 0; i < nw; ++i)
   89     {
   90         const QString& w(words.at(i));
   91         if (_lexicon.contains(w))
   92             t += _lexicon.value(w) + _period;
   93         else
   94             t += ruletag(w) + _period;
   95     }
   96     return t;
   97 }
   98 
   99 QString posTagger::ruletag(const QString& word) const
  100 {
  101     if (word.length() < 3)
  102         return _pos_s;
  103     if (word.endsWith(_suffix_ed) || word.endsWith(_suffix_ar) || word.endsWith(_suffix_ive) ||
  104         word.endsWith(_suffix_able) || word.endsWith(_suffix_ic) || word.endsWith(_suffix_al) ||
  105         word.endsWith(_suffix_ous) || word.endsWith(_suffix_lent) || word.endsWith(_suffix_like) ||
  106         word.endsWith(_suffix_ful) || word.endsWith(_suffix_less))
  107         return _pos_a;
  108     if (word.endsWith(_suffix_ing))
  109         return _pos_g;
  110     if (word.endsWith(_suffix_ly))
  111         return _pos_r;
  112     if (word.endsWith(_suffix_ss) || word.endsWith(_suffix_is) || word.endsWith(_suffix_ics) ||
  113         word.endsWith(_suffix_apostrophes) || word.endsWith(_suffix_us))
  114         return _pos_n;
  115     if (word.endsWith(_suffix_ae))
  116         return _pos_o;
  117     if (word.endsWith(_suffix_s))
  118         return _pos_o;
  119     return _pos_n;
  120 }