"Fossies" - the Fresh Open Source Software Archive 
Member "cb2bib-2.0.1/src/c2b/posTagger.cpp" (12 Feb 2021, 3641 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
For more information about "posTagger.cpp" see the
Fossies "Dox" file reference documentation and the latest
Fossies "Diffs" side-by-side code changes report:
2.0.0_vs_2.0.1.
1 /***************************************************************************
2 * Copyright (C) 2004-2021 by Pere Constans
3 * constans@molspaces.com
4 * cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
5 * See the LICENSE file that comes with this distribution.
6 ***************************************************************************/
7 #include "posTagger.h"
8
9 #include "cb2bib_utilities.h"
10 #include "settings.h"
11
12
13 posTagger::posTagger()
14 : _period('.'),
15 _space(' '),
16 _pos_a("A"),
17 _pos_g("G"),
18 _pos_n("N"),
19 _pos_o("O"),
20 _pos_r("R"),
21 _pos_s("S"),
22 _suffix_able("able"),
23 _suffix_ae("ae"),
24 _suffix_al("al"),
25 _suffix_apostrophes("'s"),
26 _suffix_ar("ar"),
27 _suffix_ed("ed"),
28 _suffix_ful("ful"),
29 _suffix_ic("ic"),
30 _suffix_ics("ics"),
31 _suffix_ing("ing"),
32 _suffix_is("is"),
33 _suffix_ive("ive"),
34 _suffix_lent("lent"),
35 _suffix_less("less"),
36 _suffix_like("like"),
37 _suffix_ly("ly"),
38 _suffix_ous("ous"),
39 _suffix_s("s"),
40 _suffix_ss("ss"),
41 _suffix_us("us") {}
42
43
44 bool posTagger::loadLexicon()
45 {
46 const QString lexfn(settings::instance()->fileName("cb2Bib/PosLexiconFile"));
47 QFile lexf(lexfn);
48 if (!lexf.open(QIODevice::ReadOnly | QIODevice::Text))
49 {
50 c2bUtils::warn(QObject::tr("Could not open POS lexicon file %1 for reading").arg(lexfn));
51 return false;
52 }
53 QTextStream lex(&lexf);
54 lex.setCodec("UTF-8");
55 lex.setAutoDetectUnicode(true);
56 const QChar comment('#');
57 QString line;
58
59 while (!lex.atEnd())
60 {
61 line = lex.readLine();
62 if (line.isEmpty() || line.startsWith(comment))
63 continue;
64 const int n(line.toInt());
65 for (int i = 0; i < n; ++i)
66 _sentence_patterns.insert(lex.readLine(), QString());
67 break;
68 }
69 while (!lex.atEnd())
70 {
71 line = lex.readLine();
72 if (line.isEmpty() || line.startsWith(comment))
73 continue;
74 const int n(line.toInt());
75 const QString tag(lex.readLine());
76 for (int i = 0; i < n; ++i)
77 _lexicon.insert(lex.readLine(), tag);
78 }
79 lexf.close();
80 return _sentence_patterns.size() > 0 && _lexicon.size() > 0;
81 }
82
83 QString posTagger::tagged(const QString& text) const
84 {
85 const QStringList words(text.split(_space, QString::SkipEmptyParts));
86 const int nw(words.count());
87 QString t(_period);
88 for (int i = 0; i < nw; ++i)
89 {
90 const QString& w(words.at(i));
91 if (_lexicon.contains(w))
92 t += _lexicon.value(w) + _period;
93 else
94 t += ruletag(w) + _period;
95 }
96 return t;
97 }
98
99 QString posTagger::ruletag(const QString& word) const
100 {
101 if (word.length() < 3)
102 return _pos_s;
103 if (word.endsWith(_suffix_ed) || word.endsWith(_suffix_ar) || word.endsWith(_suffix_ive) ||
104 word.endsWith(_suffix_able) || word.endsWith(_suffix_ic) || word.endsWith(_suffix_al) ||
105 word.endsWith(_suffix_ous) || word.endsWith(_suffix_lent) || word.endsWith(_suffix_like) ||
106 word.endsWith(_suffix_ful) || word.endsWith(_suffix_less))
107 return _pos_a;
108 if (word.endsWith(_suffix_ing))
109 return _pos_g;
110 if (word.endsWith(_suffix_ly))
111 return _pos_r;
112 if (word.endsWith(_suffix_ss) || word.endsWith(_suffix_is) || word.endsWith(_suffix_ics) ||
113 word.endsWith(_suffix_apostrophes) || word.endsWith(_suffix_us))
114 return _pos_n;
115 if (word.endsWith(_suffix_ae))
116 return _pos_o;
117 if (word.endsWith(_suffix_s))
118 return _pos_o;
119 return _pos_n;
120 }