"Fossies" - the Fresh Open Source Software Archive 
Member "cb2bib-2.0.1/src/c2b/stemMatcher.cpp" (12 Feb 2021, 6441 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
For more information about "stemMatcher.cpp" see the
Fossies "Dox" file reference documentation.
1 /***************************************************************************
2 * Copyright (C) 2004-2021 by Pere Constans
3 * constans@molspaces.com
4 * cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
5 * See the LICENSE file that comes with this distribution.
6 ***************************************************************************/
7 #include "stemMatcher.h"
8
9 #include "cb2bib_utilities.h"
10 #include "stemmer.h"
11 #include "triads.h"
12
13 #include <limits>
14
15
16 stemMatcher::stemMatcher()
17 : _hook(-1),
18 _stretch(0),
19 _substring_count(0),
20 _substring_rex_count(0),
21 _substring_txt_count(0),
22 _matched_length(-1),
23 _p0(0),
24 _pn(0) {}
25
26 stemMatcher::stemMatcher(const QString& pattern, const Qt::CaseSensitivity cs) : _p0(0), _pn(0)
27 {
28 setPattern(pattern, cs);
29 }
30
31
32 void stemMatcher::setPattern(const QString& pattern, const Qt::CaseSensitivity cs)
33 {
34 _hook = -1;
35 _matched_length = -1;
36 _stretch = 0;
37
38 QStringList substrings;
39 {
40 const QRegExp rmnww("[\\W_]");
41 const QString boundedt("\\b%1\\b");
42 const QString ort("(:?%1)");
43 const QString stretcht(".{0,%1}");
44 const int minimum_word_length(4);
45 stemmer ls;
46
47 const QStringList segments(pattern.split(' ', QString::SkipEmptyParts));
48 for (int s = 0; s < segments.count(); ++s)
49 {
50 const QStringList qors(segments.at(s).split('|', QString::SkipEmptyParts));
51 QStringList so;
52 int slength(0);
53 for (int o = 0; o < qors.count(); ++o)
54 {
55 slength = std::max(slength, qors.at(o).length());
56 const QStringList words(qors.at(o).split('_', QString::SkipEmptyParts));
57 QStringList sw;
58 for (int w = 0; w < words.count(); ++w)
59 if (words.at(w).startsWith('+'))
60 {
61 QString bw(words.at(w).right(words.at(w).length() - 1));
62 bw.replace(c2bUtils::nonLetter, stretcht.arg(1));
63 sw.append(boundedt.arg(bw));
64 }
65 else
66 {
67 QStringList wl(ls.stems(words.at(w)));
68 for (int l = 0; l < wl.count(); ++l)
69 {
70 const int nnl(wl.at(l).count(c2bUtils::nonLetter));
71 if (nnl > 0 && nnl < wl.at(l).length())
72 wl[l].replace(c2bUtils::nonLetter, stretcht.arg(1));
73 }
74 sw.append(wl.count() == 1 ? wl.at(0) : ort.arg(wl.join('|')));
75 }
76 so.append(sw.join(stretcht.arg(25)));
77 }
78 substrings.append(so.count() == 1 ? so.at(0) : ort.arg(so.join('|')));
79 _stretch += slength > minimum_word_length ? 35 : 10;
80 }
81
82 _subpatterns = pattern.split(rmnww, QString::SkipEmptyParts);
83 }
84
85 _substring_count = substrings.count();
86 _substring_rex_count = 0;
87 for (int i = 0; i < _substring_count; ++i)
88 if (substrings.at(i).contains(c2bUtils::nonLetter))
89 ++_substring_rex_count;
90 _substring_txt_count = _substring_count - _substring_rex_count;
91 _substrings_rex.resize(_substring_rex_count);
92 _substrings_txt.resize(_substring_txt_count);
93 _matched_lengths.fill(0, _substring_count);
94 _sp0.resize(_substring_count);
95 _signature_string.clear();
96
97 for (int i = 0, ir = 0, it = 0; i < _substring_count; ++i)
98 if (substrings.at(i).contains(c2bUtils::nonLetter))
99 {
100 QRegularExpression& re(_substrings_rex[ir++]);
101 #if (QT_VERSION >= QT_VERSION_CHECK(5, 4, 0)) && (QT_VERSION < QT_VERSION_CHECK(5, 12, 0))
102 const QRegularExpression::PatternOptions po(QRegularExpression::DontCaptureOption |
103 QRegularExpression::UseUnicodePropertiesOption |
104 QRegularExpression::OptimizeOnFirstUsageOption);
105 #else
106 const QRegularExpression::PatternOptions po(QRegularExpression::DontCaptureOption |
107 QRegularExpression::UseUnicodePropertiesOption);
108 #endif
109 re.setPattern(substrings.at(i));
110 re.setPatternOptions(cs == Qt::CaseSensitive ? po : po | QRegularExpression::CaseInsensitiveOption);
111 }
112 else
113 {
114 txtmatcher& tm(_substrings_txt[it++]);
115 tm.setPattern(substrings.at(i), cs);
116 _signature_string += substrings.at(i) + ' ';
117 }
118 _signature_string.chop(1);
119
120 for (int i = 0, lf = std::numeric_limits<int>::max(); i < _substring_txt_count; ++i)
121 {
122 const int f(_substrings_txt.at(i).frequency() * triads::textFrequency(_substrings_txt.at(i).pattern()));
123 if (f < lf)
124 {
125 lf = f;
126 _hook = i;
127 }
128 }
129 if (_hook == -1)
130 for (int i = 0, lf = std::numeric_limits<int>::max(); i < _substring_rex_count; ++i)
131 {
132 const int f(triads::textFrequency(_substrings_rex.at(i).pattern()));
133 if (f < lf)
134 {
135 lf = f;
136 _hook = _substring_txt_count + i;
137 }
138 }
139 #ifdef C2B_DEBUG_SEARCHING
140 for (int i = 0; i < _substring_txt_count; ++i)
141 qDebug() << "subpattern txt " << _substrings_txt.at(i).pattern();
142 for (int i = 0; i < _substring_rex_count; ++i)
143 qDebug() << "subpattern rex " << _substrings_rex.at(i).pattern();
144 if (_hook < _substring_txt_count)
145 qDebug() << "hook " << _substrings_txt.at(_hook).pattern() << _hook;
146 else
147 qDebug() << "hook " << _substrings_rex.at(_hook - _substring_txt_count).pattern() << _hook;
148 qDebug() << "_stretch " << _stretch;
149 qDebug() << "subpatternstrings " << subpatternStrings();
150 qDebug() << "signaturestring " << signatureString();
151 #endif
152 }
153
154 int stemMatcher::indexIn(const QString& text, const int from) const
155 {
156 _matched_length = -1;
157 if (_hook == -1) // Uninitialized
158 return -1;
159 _p0 = from;
160 if (_p0 < 0)
161 _p0 = 0;
162 _pn = text.length();
163 if (_pn == 0)
164 return -1;
165
166 int hp(_p0);
167 int p(-1);
168 while (p == -1)
169 {
170 hp = _index_in(_hook, text, hp);
171 if (hp == -1)
172 return -1;
173 p = _index_around(text, hp);
174 hp += _matched_lengths.at(_hook);
175 }
176 return p;
177 }