"Fossies" - the Fresh Open Source Software Archive 
Member "cb2bib-2.0.1/src/c2b/approximatePattern.cpp" (12 Feb 2021, 11628 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
For more information about "approximatePattern.cpp" see the
Fossies "Dox" file reference documentation and the latest
Fossies "Diffs" side-by-side code changes report:
2.0.0_vs_2.0.1.
1 /***************************************************************************
2 * Copyright (C) 2004-2021 by Pere Constans
3 * constans@molspaces.com
4 * cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
5 * See the LICENSE file that comes with this distribution.
6 *
7 * Class implementation of the approximate search algorithm
8 * P. Constans. Approximate textual retrieval. arXiv:0705.0751, 2007.
9 ***************************************************************************/
10 #include "approximatePattern.h"
11
12 #include "cb2bib_utilities.h"
13 #include "triads.h"
14
15 namespace
16 {
17 #if (QT_VERSION >= QT_VERSION_CHECK(5, 4, 0)) && (QT_VERSION < QT_VERSION_CHECK(5, 12, 0))
18 static const QRegularExpression::PatternOptions _qre_pattern_options(QRegularExpression::DontCaptureOption |
19 QRegularExpression::UseUnicodePropertiesOption |
20 QRegularExpression::OptimizeOnFirstUsageOption);
21 #else
22 static const QRegularExpression::PatternOptions _qre_pattern_options(QRegularExpression::DontCaptureOption |
23 QRegularExpression::UseUnicodePropertiesOption);
24 #endif
25 } // namespace
26
27
28 approximatePattern::approximatePattern() : compositePattern() {}
29
30 approximatePattern::approximatePattern(const QString& pattern, const Qt::CaseSensitivity cs)
31 : compositePattern(pattern, cs)
32 {
33 setPattern(pattern, cs);
34 }
35
36
37 void approximatePattern::setPattern(const QString& pattern, const Qt::CaseSensitivity cs)
38 {
39 _string = pattern;
40 _subpatterns = pattern.split(c2bUtils::nonLetter, QString::SkipEmptyParts);
41 _case_sensitivity = cs;
42 _is_multipattern = false;
43 _matched_length = -1;
44 _subpattern_count = 0;
45 // Exact match
46 if (_string.length() < 5)
47 {
48 _regexp.setPattern(escape(_string, _case_sensitivity));
49 _regexp.setPatternOptions(_qre_pattern_options);
50 return;
51 }
52 // Single word: Allow 1 error (1 missing / 3 excess), anywhere
53 if (wordCount(_string) == 1)
54 {
55 _regexp.setPattern(wordPattern(_string, _case_sensitivity));
56 _regexp.setPatternOptions(_qre_pattern_options);
57 return;
58 }
59 // Multiple words
60 _prefixes = c2bUtils::fileToString(":/txt/txt/prefixes.txt").split(c2bUtils::nonLetter, QString::SkipEmptyParts);
61 _suffixes = c2bUtils::fileToString(":/txt/txt/suffixes.txt").split(c2bUtils::nonLetter, QString::SkipEmptyParts);
62 splitPattern();
63 if (_string_pieces.count() < 3) // Cases: "qq pp", "qqq pp", etc
64 {
65 _regexp.setPattern(wordPattern(_string, _case_sensitivity));
66 _regexp.setPatternOptions(_qre_pattern_options);
67 // Avoid rare cases triggering 'too large regular expression' by just composing prefix and suffix
68 if (!_regexp.isValid())
69 {
70 _regexp.setPattern("(?:" + _string_pieces.join('|') + ')');
71 _regexp.setPatternOptions(_qre_pattern_options);
72 }
73 return;
74 }
75 _is_multipattern = true;
76
77 // Set parameters
78 const double percent_scan(50.);
79 const double scan_factor(100. / percent_scan);
80 const int max_blocks(_string_pieces.count() / 2);
81 const int requested_blocks(c2bUtils::nearInteger(scan_factor));
82 const int blocks(std::min(max_blocks, 1 + requested_blocks));
83 int pieces_per_block(_string_pieces.count() / blocks); // Ceiling
84 if ((_string_pieces.count() % blocks) > 0)
85 ++pieces_per_block;
86
87 #if C2B_DEBUG_APPROXIMATEPATTERN
88 qDebug() << _string_pieces << max_blocks << scan_factor << requested_blocks << pieces_per_block
89 << _string_pieces.count() % blocks;
90 for (int b = 0; b < blocks; ++b)
91 for (int i = 0; i < pieces_per_block; ++i)
92 if (b + (i * blocks) < _string_pieces.count())
93 qDebug() << "block# " << b << "pick item " << b + (i * blocks) << _string_pieces.at(b + (i * blocks));
94 #endif
95
96 _ranks.clear();
97 QStringList submatcher;
98 QStringList subpattern;
99 QStringList substring;
100 const QString sp_stretch(".{0,%1}%2");
101 for (int b = 0; b < blocks; ++b)
102 {
103 int ii(b);
104 QString sm(_string_pieces.at(ii));
105 QString sp(escape(_string_pieces.at(ii), _case_sensitivity));
106 QString ss(_string_pieces.at(ii));
107 double stretch_product(1.);
108 for (int j = 1; j < pieces_per_block; ++j)
109 {
110 int jj(b + (j * blocks));
111 if (jj < _string_pieces.count())
112 {
113 const int st(stretch(ii, jj));
114 sm += sp_stretch.arg(st).arg(_string_pieces.at(jj));
115 sp += sp_stretch.arg(st).arg(escape(_string_pieces.at(jj), _case_sensitivity));
116 ss += ' ' + _string_pieces.at(jj);
117 stretch_product *= st;
118 ii = jj;
119 }
120 }
121 submatcher.append(sm);
122 subpattern.append(sp);
123 substring.append(ss);
124 // Rank according expectation (arXiv:0705.0751, Eq. 6)
125 _ranks.append(stretch_product * triads::textFrequency(ss));
126 ++_subpattern_count;
127 }
128 set_sort_index();
129 _regexp.setPattern(QString("(?:%1)").arg(subpattern.join("|")));
130 _submatchers.resize(_subpattern_count);
131 _subregexps.resize(_subpattern_count);
132 _substrings.clear();
133 for (int i = 0; i < _subpattern_count; ++i)
134 {
135 const int ii(_index.at(i));
136 _submatchers[i] = substringMatcher(submatcher.at(ii), _case_sensitivity);
137 _subregexps[i].setPattern(subpattern.at(ii));
138 _substrings.append(substring.at(ii));
139 }
140 _p0.resize(_subpattern_count);
141 _pn.resize(_subpattern_count);
142 }
143
144 void approximatePattern::splitPattern()
145 {
146 if (_string.isEmpty())
147 return;
148 QStringList wordList;
149 QList<int> wordStarts;
150 QList<int> wordEnds;
151 splitPattern(_string, &wordList, &wordStarts, &wordEnds);
152 for (int i = 0; i < wordList.count(); ++i)
153 {
154 const QStringList pieces(splitWord(wordList.at(i)));
155 const QString prefix(pieces.at(0));
156 if (prefix.length() > 2)
157 {
158 _string_pieces.append(prefix);
159 _piece_starts.append(wordStarts.at(i));
160 _piece_ends.append(wordStarts.at(i) + prefix.length());
161 }
162 const QString suffix(pieces.at(1));
163 if (suffix.length() > 2)
164 {
165 _string_pieces.append(suffix);
166 _piece_starts.append(wordEnds.at(i) - suffix.length());
167 _piece_ends.append(wordEnds.at(i));
168 }
169 }
170 }
171
172 void approximatePattern::splitPattern(const QString& p, QStringList* w, QList<int>* ws, QList<int>* we)
173 {
174 w->clear();
175 ws->clear();
176 we->clear();
177 QString str(p);
178 str.replace(c2bUtils::nonLetter, " ");
179 str.append(' ');
180 int w_starts(0);
181 int w_ends(0);
182 bool in_word(false);
183 for (int i = 0; i < str.length(); ++i)
184 if (str.at(i) == ' ')
185 {
186 if (in_word)
187 {
188 // Word actually ends at i - 1. However, this convention simplifies things.
189 w_ends = i;
190 w->append(str.mid(w_starts, w_ends - w_starts));
191 ws->append(w_starts);
192 we->append(w_ends);
193 }
194 in_word = false;
195 }
196 else
197 {
198 if (!in_word)
199 w_starts = i;
200 in_word = true;
201 }
202 }
203
204 const QStringList approximatePattern::splitWord(const QString& word) const
205 {
206 // Returns the pair "prefix+root root+suffix"
207 const int minimum_length(5);
208 const int wlen(word.length());
209 if (wlen < minimum_length)
210 return QStringList() << word << word;
211
212 QString w(word.toLower());
213 int plen_min(wlen);
214 int slen_min(wlen);
215 int plen_max(0);
216 int slen_max(0);
217 for (int i = 0; i < _prefixes.count(); ++i)
218 if (w.startsWith(_prefixes.at(i)))
219 {
220 const int plen(_prefixes.at(i).length());
221 if (plen < plen_min)
222 plen_min = plen;
223 if (plen > plen_max)
224 plen_max = plen;
225 }
226 for (int i = 0; i < _suffixes.count(); ++i)
227 if (w.endsWith(_suffixes.at(i)))
228 {
229 const int slen(_suffixes.at(i).length());
230 if (slen < slen_min)
231 slen_min = slen;
232 if (slen > slen_max)
233 slen_max = slen;
234 }
235 if (plen_min == wlen)
236 plen_min = 0;
237 if (slen_min == wlen)
238 slen_min = 0;
239
240 w = word;
241 const int wlenMp_min(std::max(plen_min, wlen - slen_min));
242 const int wlenMs_min(std::max(slen_min, wlen - plen_min));
243 const int wlenMp_max(std::max(plen_max, wlen - slen_max));
244 const int wlenMs_max(std::max(slen_max, wlen - plen_max));
245 #if C2B_DEBUG_APPROXIMATEPATTERN
246 qDebug() << wlenMp_min << wlenMs_min << w.left(wlenMp_min) << w.right(wlenMs_min);
247 qDebug() << wlenMp_max << wlenMs_max << w.left(wlenMp_max) << w.right(wlenMs_max);
248 #endif
249 if (wlenMp_max >= minimum_length && wlenMs_max >= minimum_length)
250 return QStringList() << w.left(wlenMp_max) << w.right(wlenMs_max);
251 else if (wlenMp_min >= minimum_length && wlenMs_min >= minimum_length)
252 return QStringList() << w.left(wlenMp_min) << w.right(wlenMs_min);
253 else
254 return QStringList() << word << word;
255 }
256
257 QString approximatePattern::wordPattern(const QString& word, Qt::CaseSensitivity cs)
258 {
259 const int len(word.length());
260 const QString ord(word.right(len - 1));
261 QStringList possible;
262 possible.append(escape(ord.left(len - 2), cs));
263 for (int i = 1; i < len - 2; ++i)
264 possible.append(QString("%1.{0,2}%2").arg(escape(ord.left(len - i - 3), cs), escape(ord.right(i), cs)));
265 #if C2B_DEBUG_APPROXIMATEPATTERN
266 qDebug() << "WordPattern: "
267 << QString("(?:%1(?:%2)|%3)").arg(escape(word.at(0), cs), possible.join("|"), escape(ord, cs));
268 #endif
269 return QString("(?:%1(?:%2)|%3)").arg(escape(word.at(0), cs), possible.join("|"), escape(ord, cs));
270 }
271
272 int approximatePattern::wordCount(const QString& str)
273 {
274 const QString tstr(QString(str).replace(c2bUtils::nonLetter, " ").simplified());
275 return 1 + tstr.count(' ');
276 }
277
278 int approximatePattern::stretch(int piece_i, int piece_j) const
279 {
280 const int minStretch(3);
281 const int maxStretch(20);
282 if (_string_pieces.at(piece_j).length() > 4)
283 return std::max((maxStretch * (piece_j - piece_i)),
284 minStretch * (_piece_starts.at(piece_j) - _piece_ends.at(piece_i)));
285 else
286 return minStretch * (_piece_starts.at(piece_j) - _piece_ends.at(piece_i)); // Estimated error ratio only
287 }
288
289 void approximatePattern::mergeIndices(int* index_in, const QString& str) const
290 {
291 int index = str.length();
292 for (int i = 0; i < _subpattern_count; ++i)
293 if (_p0.at(i) != -1 && _p0.at(i) < index)
294 index = _p0.at(i);
295 if (index == str.length())
296 index = -1;
297 else
298 {
299 int pn(0);
300 for (int i = 0; i < _subpattern_count; ++i)
301 if (_pn.at(i) > pn && _p0.at(i) == index)
302 pn = _pn.at(i);
303 for (int m = 0; m < _subpattern_count; ++m)
304 for (int i = 0; i < _subpattern_count; ++i)
305 if (_pn.at(i) > pn && _p0.at(i) < pn)
306 pn = _pn.at(i);
307 // Beautify match by including whole words
308 for (int w = index - 1; w > std::max(0, index - 13); --w)
309 if (!str.at(w).isLetter())
310 {
311 index = w + 1;
312 break;
313 }
314 for (int w = pn; w < std::min(pn + 13, str.length()); ++w)
315 if (!str.at(w).isLetter())
316 {
317 pn = w;
318 break;
319 }
320 _matched_length = pn - index;
321 }
322 *index_in = index;
323 }