"Fossies" - the Fresh Open Source Software Archive

Member "cb2bib-2.0.1/src/c2b/approximatePattern.cpp" (12 Feb 2021, 11628 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "approximatePattern.cpp" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.0.0_vs_2.0.1.

    1 /***************************************************************************
    2  *   Copyright (C) 2004-2021 by Pere Constans
    3  *   constans@molspaces.com
    4  *   cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
    5  *   See the LICENSE file that comes with this distribution.
    6  *
    7  *   Class implementation of the approximate search algorithm
    8  *   P. Constans. Approximate textual retrieval. arXiv:0705.0751, 2007.
    9  ***************************************************************************/
   10 #include "approximatePattern.h"
   11 
   12 #include "cb2bib_utilities.h"
   13 #include "triads.h"
   14 
   15 namespace
   16 {
   17 #if (QT_VERSION >= QT_VERSION_CHECK(5, 4, 0)) && (QT_VERSION < QT_VERSION_CHECK(5, 12, 0))
   18 static const QRegularExpression::PatternOptions _qre_pattern_options(QRegularExpression::DontCaptureOption |
   19         QRegularExpression::UseUnicodePropertiesOption |
   20         QRegularExpression::OptimizeOnFirstUsageOption);
   21 #else
   22 static const QRegularExpression::PatternOptions _qre_pattern_options(QRegularExpression::DontCaptureOption |
   23         QRegularExpression::UseUnicodePropertiesOption);
   24 #endif
   25 } // namespace
   26 
   27 
   28 approximatePattern::approximatePattern() : compositePattern() {}
   29 
   30 approximatePattern::approximatePattern(const QString& pattern, const Qt::CaseSensitivity cs)
   31     : compositePattern(pattern, cs)
   32 {
   33     setPattern(pattern, cs);
   34 }
   35 
   36 
   37 void approximatePattern::setPattern(const QString& pattern, const Qt::CaseSensitivity cs)
   38 {
   39     _string = pattern;
   40     _subpatterns = pattern.split(c2bUtils::nonLetter, QString::SkipEmptyParts);
   41     _case_sensitivity = cs;
   42     _is_multipattern = false;
   43     _matched_length = -1;
   44     _subpattern_count = 0;
   45     // Exact match
   46     if (_string.length() < 5)
   47     {
   48         _regexp.setPattern(escape(_string, _case_sensitivity));
   49         _regexp.setPatternOptions(_qre_pattern_options);
   50         return;
   51     }
   52     // Single word: Allow 1 error (1 missing / 3 excess), anywhere
   53     if (wordCount(_string) == 1)
   54     {
   55         _regexp.setPattern(wordPattern(_string, _case_sensitivity));
   56         _regexp.setPatternOptions(_qre_pattern_options);
   57         return;
   58     }
   59     // Multiple words
   60     _prefixes = c2bUtils::fileToString(":/txt/txt/prefixes.txt").split(c2bUtils::nonLetter, QString::SkipEmptyParts);
   61     _suffixes = c2bUtils::fileToString(":/txt/txt/suffixes.txt").split(c2bUtils::nonLetter, QString::SkipEmptyParts);
   62     splitPattern();
   63     if (_string_pieces.count() < 3) // Cases: "qq pp", "qqq pp", etc
   64     {
   65         _regexp.setPattern(wordPattern(_string, _case_sensitivity));
   66         _regexp.setPatternOptions(_qre_pattern_options);
   67         // Avoid rare cases triggering 'too large regular expression' by just composing prefix and suffix
   68         if (!_regexp.isValid())
   69         {
   70             _regexp.setPattern("(?:" + _string_pieces.join('|') + ')');
   71             _regexp.setPatternOptions(_qre_pattern_options);
   72         }
   73         return;
   74     }
   75     _is_multipattern = true;
   76 
   77     // Set parameters
   78     const double percent_scan(50.);
   79     const double scan_factor(100. / percent_scan);
   80     const int max_blocks(_string_pieces.count() / 2);
   81     const int requested_blocks(c2bUtils::nearInteger(scan_factor));
   82     const int blocks(std::min(max_blocks, 1 + requested_blocks));
   83     int pieces_per_block(_string_pieces.count() / blocks); // Ceiling
   84     if ((_string_pieces.count() % blocks) > 0)
   85         ++pieces_per_block;
   86 
   87 #if C2B_DEBUG_APPROXIMATEPATTERN
   88     qDebug() << _string_pieces << max_blocks << scan_factor << requested_blocks << pieces_per_block
   89              << _string_pieces.count() % blocks;
   90     for (int b = 0; b < blocks; ++b)
   91         for (int i = 0; i < pieces_per_block; ++i)
   92             if (b + (i * blocks) < _string_pieces.count())
   93                 qDebug() << "block#   " << b << "pick item " << b + (i * blocks) << _string_pieces.at(b + (i * blocks));
   94 #endif
   95 
   96     _ranks.clear();
   97     QStringList submatcher;
   98     QStringList subpattern;
   99     QStringList substring;
  100     const QString sp_stretch(".{0,%1}%2");
  101     for (int b = 0; b < blocks; ++b)
  102     {
  103         int ii(b);
  104         QString sm(_string_pieces.at(ii));
  105         QString sp(escape(_string_pieces.at(ii), _case_sensitivity));
  106         QString ss(_string_pieces.at(ii));
  107         double stretch_product(1.);
  108         for (int j = 1; j < pieces_per_block; ++j)
  109         {
  110             int jj(b + (j * blocks));
  111             if (jj < _string_pieces.count())
  112             {
  113                 const int st(stretch(ii, jj));
  114                 sm += sp_stretch.arg(st).arg(_string_pieces.at(jj));
  115                 sp += sp_stretch.arg(st).arg(escape(_string_pieces.at(jj), _case_sensitivity));
  116                 ss += ' ' + _string_pieces.at(jj);
  117                 stretch_product *= st;
  118                 ii = jj;
  119             }
  120         }
  121         submatcher.append(sm);
  122         subpattern.append(sp);
  123         substring.append(ss);
  124         // Rank according expectation (arXiv:0705.0751, Eq. 6)
  125         _ranks.append(stretch_product * triads::textFrequency(ss));
  126         ++_subpattern_count;
  127     }
  128     set_sort_index();
  129     _regexp.setPattern(QString("(?:%1)").arg(subpattern.join("|")));
  130     _submatchers.resize(_subpattern_count);
  131     _subregexps.resize(_subpattern_count);
  132     _substrings.clear();
  133     for (int i = 0; i < _subpattern_count; ++i)
  134     {
  135         const int ii(_index.at(i));
  136         _submatchers[i] = substringMatcher(submatcher.at(ii), _case_sensitivity);
  137         _subregexps[i].setPattern(subpattern.at(ii));
  138         _substrings.append(substring.at(ii));
  139     }
  140     _p0.resize(_subpattern_count);
  141     _pn.resize(_subpattern_count);
  142 }
  143 
  144 void approximatePattern::splitPattern()
  145 {
  146     if (_string.isEmpty())
  147         return;
  148     QStringList wordList;
  149     QList<int> wordStarts;
  150     QList<int> wordEnds;
  151     splitPattern(_string, &wordList, &wordStarts, &wordEnds);
  152     for (int i = 0; i < wordList.count(); ++i)
  153     {
  154         const QStringList pieces(splitWord(wordList.at(i)));
  155         const QString prefix(pieces.at(0));
  156         if (prefix.length() > 2)
  157         {
  158             _string_pieces.append(prefix);
  159             _piece_starts.append(wordStarts.at(i));
  160             _piece_ends.append(wordStarts.at(i) + prefix.length());
  161         }
  162         const QString suffix(pieces.at(1));
  163         if (suffix.length() > 2)
  164         {
  165             _string_pieces.append(suffix);
  166             _piece_starts.append(wordEnds.at(i) - suffix.length());
  167             _piece_ends.append(wordEnds.at(i));
  168         }
  169     }
  170 }
  171 
  172 void approximatePattern::splitPattern(const QString& p, QStringList* w, QList<int>* ws, QList<int>* we)
  173 {
  174     w->clear();
  175     ws->clear();
  176     we->clear();
  177     QString str(p);
  178     str.replace(c2bUtils::nonLetter, " ");
  179     str.append(' ');
  180     int w_starts(0);
  181     int w_ends(0);
  182     bool in_word(false);
  183     for (int i = 0; i < str.length(); ++i)
  184         if (str.at(i) == ' ')
  185         {
  186             if (in_word)
  187             {
  188                 // Word actually ends at i - 1. However, this convention simplifies things.
  189                 w_ends = i;
  190                 w->append(str.mid(w_starts, w_ends - w_starts));
  191                 ws->append(w_starts);
  192                 we->append(w_ends);
  193             }
  194             in_word = false;
  195         }
  196         else
  197         {
  198             if (!in_word)
  199                 w_starts = i;
  200             in_word = true;
  201         }
  202 }
  203 
  204 const QStringList approximatePattern::splitWord(const QString& word) const
  205 {
  206     // Returns the pair "prefix+root root+suffix"
  207     const int minimum_length(5);
  208     const int wlen(word.length());
  209     if (wlen < minimum_length)
  210         return QStringList() << word << word;
  211 
  212     QString w(word.toLower());
  213     int plen_min(wlen);
  214     int slen_min(wlen);
  215     int plen_max(0);
  216     int slen_max(0);
  217     for (int i = 0; i < _prefixes.count(); ++i)
  218         if (w.startsWith(_prefixes.at(i)))
  219         {
  220             const int plen(_prefixes.at(i).length());
  221             if (plen < plen_min)
  222                 plen_min = plen;
  223             if (plen > plen_max)
  224                 plen_max = plen;
  225         }
  226     for (int i = 0; i < _suffixes.count(); ++i)
  227         if (w.endsWith(_suffixes.at(i)))
  228         {
  229             const int slen(_suffixes.at(i).length());
  230             if (slen < slen_min)
  231                 slen_min = slen;
  232             if (slen > slen_max)
  233                 slen_max = slen;
  234         }
  235     if (plen_min == wlen)
  236         plen_min = 0;
  237     if (slen_min == wlen)
  238         slen_min = 0;
  239 
  240     w = word;
  241     const int wlenMp_min(std::max(plen_min, wlen - slen_min));
  242     const int wlenMs_min(std::max(slen_min, wlen - plen_min));
  243     const int wlenMp_max(std::max(plen_max, wlen - slen_max));
  244     const int wlenMs_max(std::max(slen_max, wlen - plen_max));
  245 #if C2B_DEBUG_APPROXIMATEPATTERN
  246     qDebug() << wlenMp_min << wlenMs_min << w.left(wlenMp_min) << w.right(wlenMs_min);
  247     qDebug() << wlenMp_max << wlenMs_max << w.left(wlenMp_max) << w.right(wlenMs_max);
  248 #endif
  249     if (wlenMp_max >= minimum_length && wlenMs_max >= minimum_length)
  250         return QStringList() << w.left(wlenMp_max) << w.right(wlenMs_max);
  251     else if (wlenMp_min >= minimum_length && wlenMs_min >= minimum_length)
  252         return QStringList() << w.left(wlenMp_min) << w.right(wlenMs_min);
  253     else
  254         return QStringList() << word << word;
  255 }
  256 
  257 QString approximatePattern::wordPattern(const QString& word, Qt::CaseSensitivity cs)
  258 {
  259     const int len(word.length());
  260     const QString ord(word.right(len - 1));
  261     QStringList possible;
  262     possible.append(escape(ord.left(len - 2), cs));
  263     for (int i = 1; i < len - 2; ++i)
  264         possible.append(QString("%1.{0,2}%2").arg(escape(ord.left(len - i - 3), cs), escape(ord.right(i), cs)));
  265 #if C2B_DEBUG_APPROXIMATEPATTERN
  266     qDebug() << "WordPattern: "
  267              << QString("(?:%1(?:%2)|%3)").arg(escape(word.at(0), cs), possible.join("|"), escape(ord, cs));
  268 #endif
  269     return QString("(?:%1(?:%2)|%3)").arg(escape(word.at(0), cs), possible.join("|"), escape(ord, cs));
  270 }
  271 
  272 int approximatePattern::wordCount(const QString& str)
  273 {
  274     const QString tstr(QString(str).replace(c2bUtils::nonLetter, " ").simplified());
  275     return 1 + tstr.count(' ');
  276 }
  277 
  278 int approximatePattern::stretch(int piece_i, int piece_j) const
  279 {
  280     const int minStretch(3);
  281     const int maxStretch(20);
  282     if (_string_pieces.at(piece_j).length() > 4)
  283         return std::max((maxStretch * (piece_j - piece_i)),
  284                         minStretch * (_piece_starts.at(piece_j) - _piece_ends.at(piece_i)));
  285     else
  286         return minStretch * (_piece_starts.at(piece_j) - _piece_ends.at(piece_i)); // Estimated error ratio only
  287 }
  288 
  289 void approximatePattern::mergeIndices(int* index_in, const QString& str) const
  290 {
  291     int index = str.length();
  292     for (int i = 0; i < _subpattern_count; ++i)
  293         if (_p0.at(i) != -1 && _p0.at(i) < index)
  294             index = _p0.at(i);
  295     if (index == str.length())
  296         index = -1;
  297     else
  298     {
  299         int pn(0);
  300         for (int i = 0; i < _subpattern_count; ++i)
  301             if (_pn.at(i) > pn && _p0.at(i) == index)
  302                 pn = _pn.at(i);
  303         for (int m = 0; m < _subpattern_count; ++m)
  304             for (int i = 0; i < _subpattern_count; ++i)
  305                 if (_pn.at(i) > pn && _p0.at(i) < pn)
  306                     pn = _pn.at(i);
  307         // Beautify match by including whole words
  308         for (int w = index - 1; w > std::max(0, index - 13); --w)
  309             if (!str.at(w).isLetter())
  310             {
  311                 index = w + 1;
  312                 break;
  313             }
  314         for (int w = pn; w < std::min(pn + 13, str.length()); ++w)
  315             if (!str.at(w).isLetter())
  316             {
  317                 pn = w;
  318                 break;
  319             }
  320         _matched_length = pn - index;
  321     }
  322     *index_in = index;
  323 }