"Fossies" - the Fresh Open Source Software Archive

Member "cb2bib-2.0.1/src/c2b/collectionIndex.cpp" (12 Feb 2021, 57634 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "collectionIndex.cpp", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.0.0_vs_2.0.1.

    1 /***************************************************************************
    2  *   Copyright (C) 2004-2021 by Pere Constans
    3  *   constans@molspaces.com
    4  *   cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
    5  *   See the LICENSE file that comes with this distribution.
    6  ***************************************************************************/
    7 #include "collectionIndex.h"
    8 
    9 #include "bibParser.h"
   10 #include "cb2bib_utilities.h"
   11 #include "documentCache.h"
   12 #include "documentParser.h"
   13 #include "posTagger.h"
   14 #include "settings.h"
   15 
   16 #include <QDataStream>
   17 #include <QRegularExpression>
   18 
   19 #include <time.h>
   20 
   21 #define C2B_DEBUG_COLLECTIONINDEX 0
   22 
   23 namespace
   24 {
   25 // Tunable parameters
   26 static const int _document_length_threshold(100000);
   27 static const int _in_long_document_sentence_repetition(3);
   28 static const int _in_short_document_sentence_repetition(2);
   29 static const int _inter_document_sentence_repetition(3);
   30 
   31 #if (QT_VERSION >= QT_VERSION_CHECK(5, 4, 0)) && (QT_VERSION < QT_VERSION_CHECK(5, 12, 0))
   32 static const QRegularExpression::PatternOptions _qre_pattern_options(QRegularExpression::UseUnicodePropertiesOption |
   33         QRegularExpression::OptimizeOnFirstUsageOption);
   34 #else
   35 static const QRegularExpression::PatternOptions _qre_pattern_options(QRegularExpression::UseUnicodePropertiesOption);
   36 #endif
   37 
   38 class reversedsorting
   39 {
   40 public:
   41     inline reversedsorting() {}
   42     inline bool operator()(const QString& si, const QString& sj) const
   43     {
   44         const int li(si.length());
   45         const int lj(sj.length());
   46         const QChar* a = si.unicode() + li - 1;
   47         const QChar* b = sj.unicode() + lj - 1;
   48         int cl(std::min(li, lj));
   49         while (cl-- > 0)
   50         {
   51             if (*a == *b)
   52             {
   53                 --a;
   54                 --b;
   55             }
   56             else
   57                 return *a < *b;
   58         }
   59         return li < lj;
   60     }
   61 };
   62 
   63 class ussorting
   64 {
   65 public:
   66     inline ussorting() : ss(' '), us('_') {}
   67     inline bool operator()(const QString& si, const QString& sj)
   68     {
   69         QString ssi(si);
   70         QString ssj(sj);
   71         return ssi.replace(us, ss) < ssj.replace(us, ss);
   72     }
   73     const QChar ss;
   74     const QChar us;
   75 };
   76 
   77 } // namespace
   78 
   79 
   80 collectionIndex::collectionIndex(bibParser* bp)
   81     : _out(*(new QTextStream(stdout))), _bpP(bp), _space_char(QChar(' ')), _nerrors(0), _settingsP(settings::instance()) {}
   82 
   83 collectionIndex::~collectionIndex()
   84 {
   85     delete &_out;
   86 }
   87 
   88 
   89 int collectionIndex::index(const QString& dir)
   90 {
   91     const unsigned long start_time(clock());
   92     _nerrors = 0;
   93     _keysentences.clear();
   94 
   95     const QString bibtexdir(dir.isEmpty()
   96                             ? QDir::toNativeSeparators(QFileInfo(_settingsP->fileName("cb2Bib/BibTeXFile")).path())
   97                             : QDir::toNativeSeparators(dir));
   98     _out << QObject::tr("Indexing references from directory %1...").arg(bibtexdir) << endl;
   99 
  100     bibReference reference;
  101     documentCache dc;
  102     int documentid(0);
  103     QStringList documents;
  104     QStringList documentfns;
  105 
  106     const QStringList bibtexfns(c2bUtils::filesInDir(bibtexdir, QStringList() << "*.bib"));
  107     if (bibtexfns.count() == 0)
  108     {
  109         ++_nerrors;
  110         _out << QObject::tr("Error: No BibTeX files at %1").arg(bibtexdir) << endl;
  111         return _nerrors;
  112     }
  113 
  114     const QStringList fields(QStringList() << "file"
  115                              << "journal"
  116                              << "title"
  117                              << "booktitle");
  118     int nfailures(0);
  119     for (int i = 0; i < bibtexfns.count(); ++i)
  120     {
  121         const QString bibtexfn(bibtexfns.at(i));
  122         _out << QObject::tr("Indexing %1...").arg(bibtexfn) << endl;
  123         const QString bibtex(c2bUtils::fileToString(bibtexfn));
  124         dc.load(bibtexfn, documentContents::Complete);
  125 
  126         _bpP->initReferenceParsing(bibtexfn, fields, &reference);
  127         while (_bpP->referencesIn(bibtex, &reference))
  128         {
  129             QString docfn(reference.value("file"));
  130             if (docfn.isEmpty())
  131                 continue;
  132             docfn = QDir::cleanPath(docfn);
  133             const QString journal(reference.value("journal"));
  134             QString title(reference.anyTitle());
  135             c2bUtils::cleanTitle(title, true);
  136             if (dc.setCurrent(docfn))
  137                 setKeySentences(documentid, preprocessedText(journal, dc.current().text()));
  138             else
  139                 ++nfailures;
  140             ++documentid;
  141             documents.append(title);
  142             documentfns.append(docfn);
  143         }
  144         dc.unload();
  145     }
  146     if (nfailures > 0)
  147     {
  148         _out << QObject::tr("Warning: %1 documents could not be converted to text. Check search converter is set.")
  149              .arg(nfailures)
  150              << endl;
  151         _out << QObject::tr("         Refresh cache by running a prove search including document contents.") << endl;
  152     }
  153     _out << QObject::tr("done") << endl;
  154 
  155     digestKeySentences();
  156     cleanupKeySentences();
  157     analyzeKeySentences();
  158 
  159 #if C2B_DEBUG_COLLECTIONINDEX
  160     for (int i = 0; i < documents.count(); ++i)
  161     {
  162         QStringList sentences(_document_sentences.values(i));
  163         std::sort(sentences.begin(), sentences.end());
  164         _out << i << "  " << documents.at(i) << ':' << endl;
  165         for (int j = 0; j < sentences.count(); ++j)
  166             _out << "        " << sentences.at(j) << endl;
  167     }
  168 #endif
  169 
  170     const QString cache_dir(_settingsP->fileName("cb2Bib/CacheDirectory"));
  171     const QString documentslfn(QDir::toNativeSeparators(cache_dir + "/documents.lc2b"));
  172     QFile documentslf(QDir::cleanPath(documentslfn));
  173     if (documentslf.open(QIODevice::WriteOnly))
  174     {
  175         _out << QObject::tr("Writing %1...").arg(documentslfn) << endl;
  176         QDataStream stream(&documentslf);
  177         stream << documentfns.count();
  178         for (int i = 0; i < documentfns.count(); ++i)
  179             stream << documentfns.at(i);
  180         documentslf.close();
  181     }
  182     else
  183     {
  184         ++_nerrors;
  185         _out << QObject::tr("Error: %1 can not be written").arg(documentslfn) << endl;
  186     }
  187     const QString documentsifn(QDir::toNativeSeparators(cache_dir + "/documents.ic2b"));
  188     QFile documentsif(documentsifn);
  189     if (documentsif.open(QIODevice::WriteOnly))
  190     {
  191         _out << QObject::tr("Writing %1...").arg(documentsifn) << endl;
  192         QDataStream stream(&documentsif);
  193         for (int i = 0; i < documents.count(); ++i)
  194         {
  195             QList<int> sentenceids(_document_sentenceids.values(i));
  196             std::sort(sentenceids.begin(), sentenceids.end());
  197             stream << sentenceids.count();
  198             for (int j = 0; j < sentenceids.count(); ++j)
  199                 stream << sentenceids.at(j);
  200         }
  201         documentsif.close();
  202     }
  203     else
  204     {
  205         ++_nerrors;
  206         _out << QObject::tr("Error: %1 can not be written").arg(documentsifn) << endl;
  207     }
  208 
  209     _out << QObject::tr("done") << endl;
  210     const double time = double(clock() - start_time) / double(CLOCKS_PER_SEC);
  211     _out << QObject::tr("Indexing CPU time: %1s").arg(time) << endl;
  212 
  213     return _nerrors;
  214 }
  215 
  216 void collectionIndex::setKeySentences(const int documentid, const QString& text)
  217 {
  218     QHash<QStringRef, int> docsentences;
  219     documentParser dp(text, 7);
  220     while (dp.parses())
  221     {
  222         if (dp.nlength() < 4 * dp.nwords())
  223             continue;
  224         const QStringRef s(dp.subsentence());
  225         docsentences.insert(s, 1 + docsentences.value(s));
  226     }
  227     const int dsr(text.length() < _document_length_threshold ? _in_short_document_sentence_repetition
  228                   : _in_long_document_sentence_repetition);
  229     for (QHash<QStringRef, int>::const_iterator si = docsentences.constBegin(); si != docsentences.constEnd(); ++si)
  230     {
  231         if (si.value() < dsr)
  232             continue;
  233         const QString ss(si.key().toString());
  234         if (!isValidKeySentence(ss))
  235             continue;
  236         if (_keysentences.contains(ss))
  237             _keysentences[ss].update(documentid);
  238         else
  239             _keysentences.insert(ss, KeysentenceData(documentid));
  240     }
  241 }
  242 
  243 void collectionIndex::analyzeKeySentences()
  244 {
  245     _out << QObject::tr("Analysing keyword sentences...") << endl;
  246 
  247     posTagger pt;
  248     if (!pt.loadLexicon())
  249     {
  250         ++_nerrors;
  251         _out << QObject::tr("Error: cb2Bib POS lexicon could not be loaded; check Configure Files") << endl;
  252         return;
  253     }
  254     QStringList validlist;
  255 #if C2B_DEBUG_COLLECTIONINDEX
  256     QStringList taggedvalidlist;
  257 #endif
  258 
  259     for (QMap<QString, KeysentenceData>::const_iterator si = _keysentences.constBegin(); si != _keysentences.constEnd();
  260          ++si)
  261     {
  262         const QString& s(si.key());
  263         const KeysentenceData& sdata(si.value());
  264         if (sdata.ndocuments < _inter_document_sentence_repetition)
  265             continue;
  266         const QString pts(pt.tagged(s));
  267         if (pt.contains(pts))
  268         {
  269             validlist.append(s);
  270 #if C2B_DEBUG_COLLECTIONINDEX
  271             taggedvalidlist.append(pts + "   " + s);
  272             for (int j = 0; j < sdata.documents.count(); ++j)
  273                 _document_sentences.insert(sdata.documents.at(j), s);
  274 #endif
  275         }
  276     }
  277     ussorting uss;
  278     std::sort(validlist.begin(), validlist.end(), uss);
  279 
  280     const QString cache_dir(_settingsP->fileName("cb2Bib/CacheDirectory"));
  281     const QString sentenceslfn(QDir::toNativeSeparators(QDir::cleanPath(cache_dir + "/sentences.lc2b")));
  282     QFile sentenceslf(sentenceslfn);
  283     if (sentenceslf.open(QIODevice::WriteOnly))
  284     {
  285         _out << QObject::tr("Writing %1...").arg(sentenceslfn) << endl;
  286         QDataStream stream(&sentenceslf);
  287         stream << validlist.count();
  288         const QChar hs('-');
  289         const QChar us('_');
  290         const QString ss("\\1 ");
  291         QRegularExpression aprx(_bpP->authorPrefixes() + '_', _qre_pattern_options);
  292         for (int i = 0; i < validlist.count(); ++i)
  293         {
  294             QString v(validlist.at(i));
  295             v.replace(aprx, ss);
  296             stream << v.replace(us, hs);
  297         }
  298         sentenceslf.close();
  299     }
  300     else
  301     {
  302         ++_nerrors;
  303         _out << QObject::tr("Error: %1 can not be written").arg(sentenceslfn) << endl;
  304     }
  305     const QString sentencesifn(QDir::toNativeSeparators(QDir::cleanPath(cache_dir + "/sentences.ic2b")));
  306     QFile sentencesif(sentencesifn);
  307     if (sentencesif.open(QIODevice::WriteOnly))
  308     {
  309         _out << QObject::tr("Writing %1...").arg(sentencesifn) << endl;
  310         QDataStream stream(&sentencesif);
  311         for (int i = 0; i < validlist.count(); ++i)
  312         {
  313             const QString& s(validlist.at(i));
  314             const KeysentenceData& sdata(_keysentences.value(s));
  315             QList<int> dl(sdata.documents);
  316             std::sort(dl.begin(), dl.end());
  317             stream << dl.count();
  318             for (int j = 0; j < dl.count(); ++j)
  319                 stream << dl.at(j);
  320         }
  321         sentencesif.close();
  322     }
  323     else
  324     {
  325         ++_nerrors;
  326         _out << QObject::tr("Error: %1 can not be written").arg(sentencesifn) << endl;
  327     }
  328 #ifdef C2B_TRACE_MEMORY
  329     qDebug() << "After writing sentencesifn:";
  330     c2bUtils::print_maximum_resident_set_size();
  331 #endif
  332     const QString documentsifn(QDir::toNativeSeparators(cache_dir + "/documents.ic2b.tmp"));
  333     QFile documentsif(documentsifn);
  334     int document_sentenceids_count(0);
  335     if (documentsif.open(QIODevice::WriteOnly))
  336     {
  337         QDataStream stream(&documentsif);
  338         for (int i = 0; i < validlist.count(); ++i)
  339         {
  340             const QString& s(validlist.at(i));
  341             const KeysentenceData& sdata(_keysentences.value(s));
  342             document_sentenceids_count += sdata.documents.count();
  343             for (int j = 0; j < sdata.documents.count(); ++j)
  344                 stream << sdata.documents.at(j) << i;
  345         }
  346         documentsif.close();
  347     }
  348     else
  349     {
  350         ++_nerrors;
  351         _out << QObject::tr("Error: %1 can not be written").arg(documentsifn) << endl;
  352     }
  353 
  354     _out << QObject::tr("Raw keyword sentences: ") << _keysentences.size() << endl;
  355     _out << QObject::tr("Keyword sentences: ") << validlist.count() << endl;
  356     _out << QObject::tr("done") << endl;
  357 #if !C2B_DEBUG_COLLECTIONINDEX
  358     _out << QObject::tr("Clearing arrays...") << endl;
  359     _keysentences.clear();
  360     _document_sentences.clear();
  361     _out << QObject::tr("done") << endl;
  362 #endif
  363 #ifdef C2B_TRACE_MEMORY
  364     qDebug() << "After clearing arrays:";
  365     c2bUtils::print_maximum_resident_set_size();
  366 #endif
  367     if (documentsif.open(QIODevice::ReadOnly))
  368     {
  369         QDataStream stream(&documentsif);
  370         for (int l = 0, d, i; l < document_sentenceids_count; ++l)
  371         {
  372             stream >> d >> i;
  373             _document_sentenceids.insert(d, i);
  374         }
  375         documentsif.close();
  376         documentsif.remove();
  377     }
  378     else
  379     {
  380         ++_nerrors;
  381         _out << QObject::tr("Error: %1 can not be read").arg(documentsifn) << endl;
  382     }
  383 #ifdef C2B_TRACE_MEMORY
  384     qDebug() << "After inserting _document_sentenceids:";
  385     c2bUtils::print_maximum_resident_set_size();
  386 #endif
  387 #if C2B_DEBUG_COLLECTIONINDEX
  388     std::sort(taggedvalidlist.begin(), taggedvalidlist.end());
  389     for (int i = 0; i < taggedvalidlist.count(); ++i)
  390         _out << taggedvalidlist.at(i) << endl;
  391 #endif
  392 }
  393 
  394 void collectionIndex::cleanupKeySentences()
  395 {
  396     _out << QObject::tr("Cleaning keyword sentences...") << endl;
  397 
  398     QStringList keysentenceskeys;
  399     for (QMap<QString, KeysentenceData>::const_iterator si = _keysentences.constBegin(); si != _keysentences.constEnd();
  400          ++si)
  401     {
  402         if (si.value().ndocuments < _inter_document_sentence_repetition)
  403             continue;
  404         keysentenceskeys.append(si.key());
  405     }
  406     const int nkeysentenceskeys(keysentenceskeys.count());
  407 
  408     std::sort(keysentenceskeys.begin(), keysentenceskeys.end());
  409     for (int i = 0; i < nkeysentenceskeys - 1; ++i)
  410     {
  411         const QString& si(keysentenceskeys.at(i));
  412         const QString& sj(keysentenceskeys.at(i + 1));
  413         if (sj.startsWith(si) && _keysentences.value(si).ndocuments == _keysentences.value(sj).ndocuments)
  414             _keysentences[si].ndocuments = 0;
  415     }
  416 
  417     reversedsorting rs;
  418     std::sort(keysentenceskeys.begin(), keysentenceskeys.end(), rs);
  419     for (int i = 0; i < nkeysentenceskeys - 1; ++i)
  420     {
  421         const QString& si(keysentenceskeys.at(i));
  422         const QString& sj(keysentenceskeys.at(i + 1));
  423         if (sj.endsWith(si) && _keysentences.value(si).ndocuments == _keysentences.value(sj).ndocuments)
  424             _keysentences[si].ndocuments = 0;
  425     }
  426     _out << QObject::tr("done") << endl;
  427 }
  428 
  429 void collectionIndex::digestKeySentences()
  430 {
  431     _out << QObject::tr("Digesting keyword sentences...") << endl;
  432 
  433     const QChar hyphen('_');
  434 
  435     const QString coordination_and(" and ");
  436     const QString coordination_in(" in ");
  437     const QString coordination_of(" of ");
  438     const QString coordination_on(" on ");
  439     const QString coordination_or(" or ");
  440 
  441     const QRegularExpression trailing_ae("ae\\b", _qre_pattern_options);
  442     const QRegularExpression trailing_ches("ches\\b", _qre_pattern_options);
  443     const QRegularExpression trailing_dices("dices\\b", _qre_pattern_options);
  444     const QRegularExpression trailing_ices("ices\\b", _qre_pattern_options);
  445     const QRegularExpression trailing_ies("ies\\b", _qre_pattern_options);
  446     const QRegularExpression trailing_is("is\\b", _qre_pattern_options);
  447     const QRegularExpression trailing_us("us\\b", _qre_pattern_options);
  448     const QRegularExpression trailing_s("s\\b", _qre_pattern_options);
  449     const QRegularExpression trailing_ss("ss\\b", _qre_pattern_options);
  450     const QRegularExpression trailing_sses("sses\\b", _qre_pattern_options);
  451 
  452     const QRegularExpression trailing_ap("'\\b", _qre_pattern_options);
  453     const QRegularExpression trailing_aps("'s\\b", _qre_pattern_options);
  454 
  455     for (QMap<QString, KeysentenceData>::iterator si = _keysentences.begin(); si != _keysentences.end(); ++si)
  456     {
  457         if (si.value().ndocuments == 0)
  458             continue;
  459 
  460         const QString& s(si.key());
  461         KeysentenceData* ksdndmax(&si.value());
  462         int ndmax(ksdndmax->ndocuments);
  463 
  464         // Unify hyphenation
  465         if (s.contains(hyphen))
  466         {
  467             QString sp(s);
  468             sp.remove(hyphen);
  469             if (_keysentences.contains(sp))
  470                 ksdndmax = _digest_sentence(&_keysentences[sp], ksdndmax, &ndmax);
  471             sp = s;
  472             sp.replace(hyphen, _space_char);
  473             if (_keysentences.contains(sp))
  474                 ksdndmax = _digest_sentence(&_keysentences[sp], ksdndmax, &ndmax);
  475         }
  476 
  477         // Unify simple coordinations
  478         if (s.count(coordination_and) == 1)
  479         {
  480             const QStringList p(s.split(coordination_and, QString::SkipEmptyParts));
  481             const QString sp(p.at(1) + coordination_and + p.at(0));
  482             if (_keysentences.contains(sp))
  483                 ksdndmax = _digest_sentence(&_keysentences[sp], ksdndmax, &ndmax);
  484         }
  485         else if (s.count(coordination_or) == 1)
  486         {
  487             const QStringList p(s.split(coordination_or, QString::SkipEmptyParts));
  488             const QString sp(p.at(1) + coordination_or + p.at(0));
  489             if (_keysentences.contains(sp))
  490                 ksdndmax = _digest_sentence(&_keysentences[sp], ksdndmax, &ndmax);
  491         }
  492         else if (s.count(coordination_of) == 1)
  493         {
  494             const QStringList p(s.split(coordination_of, QString::SkipEmptyParts));
  495             const QString sp(p.at(1) + _space_char + p.at(0));
  496             if (_keysentences.contains(sp))
  497                 ksdndmax = _digest_sentence(&_keysentences[sp], ksdndmax, &ndmax);
  498         }
  499         else if (s.count(coordination_on) == 1)
  500         {
  501             const QStringList p(s.split(coordination_on, QString::SkipEmptyParts));
  502             const QString sp(p.at(1) + _space_char + p.at(0));
  503             if (_keysentences.contains(sp))
  504                 ksdndmax = _digest_sentence(&_keysentences[sp], ksdndmax, &ndmax);
  505         }
  506         else if (s.count(coordination_in) == 1)
  507         {
  508             const QStringList p(s.split(coordination_in, QString::SkipEmptyParts));
  509             const QString sp(p.at(1) + _space_char + p.at(0));
  510             if (_keysentences.contains(sp))
  511                 ksdndmax = _digest_sentence(&_keysentences[sp], ksdndmax, &ndmax);
  512         }
  513 
  514         // Unify genitives
  515         if (s.contains(trailing_aps) || s.contains(trailing_ap))
  516         {
  517             QString sp(s);
  518             sp.remove(trailing_aps);
  519             sp.remove(trailing_ap);
  520             if (s.length() != sp.length() && _keysentences.contains(sp))
  521                 ksdndmax = _digest_sentence(&_keysentences[sp], ksdndmax, &ndmax);
  522         }
  523 
  524         // Unify number
  525         if (s.contains(trailing_s) || s.contains(trailing_ae))
  526         {
  527             QString sp(s);
  528             sp.replace(trailing_dices, "dex");
  529             sp.replace(trailing_ices, "ix");
  530             sp.replace(trailing_ae, "a");
  531             sp.replace(trailing_ies, "y");
  532             sp.replace(trailing_ches, "ch");
  533             sp.replace(trailing_ss, "sss");
  534             sp.replace(trailing_us, "uss");
  535             sp.replace(trailing_aps, "'ss");
  536             sp.replace(trailing_sses, "sss");
  537             sp.replace(trailing_is, "iss");
  538             sp.remove(trailing_s);
  539             if (s.length() != sp.length() && _keysentences.contains(sp))
  540                 ksdndmax = _digest_sentence(&_keysentences[sp], ksdndmax, &ndmax);
  541         }
  542 
  543         ksdndmax->ndocuments = ksdndmax->documents.count();
  544     }
  545 
  546     _out << QObject::tr("done") << endl;
  547 }
  548 
  549 collectionIndex::KeysentenceData* collectionIndex::_digest_sentence(KeysentenceData* ksd, KeysentenceData* ksdndmax,
  550         int* ndmax) const
  551 {
  552     if (ksd->ndocuments > *ndmax)
  553     {
  554         // Transfer from ksdndmax
  555         *ndmax = ksd->ndocuments;
  556         const int nd(ksdndmax->documents.count());
  557         for (int i = 0; i < nd; ++i)
  558         {
  559             const int did(ksdndmax->documents.at(i));
  560             if (!ksd->documents.contains(did))
  561                 ksd->documents.append(did);
  562         }
  563         ksdndmax->ndocuments = 0;
  564         return ksd;
  565     }
  566     else
  567     {
  568         // Transfer to ksdndmax
  569         const int nd(ksd->documents.count());
  570         for (int i = 0; i < nd; ++i)
  571         {
  572             const int did(ksd->documents.at(i));
  573             if (!ksdndmax->documents.contains(did))
  574                 ksdndmax->documents.append(did);
  575         }
  576         ksd->ndocuments = 0;
  577         return ksdndmax;
  578     }
  579 }
  580 
  581 bool collectionIndex::isValidKeySentence(const QString& text) const
  582 {
  583     const ushort fc(text.at(0).unicode());
  584     if (fc == 39)
  585         return false;
  586     if (fc == 95)
  587         return false;
  588     if (fc == 97)
  589     {
  590         if (text.startsWith(QLatin1String("a ")))
  591             return false;
  592         if (text.startsWith(QLatin1String("ability ")))
  593             return false;
  594         if (text.startsWith(QLatin1String("able ")))
  595             return false;
  596         if (text.startsWith(QLatin1String("about ")))
  597             return false;
  598         if (text.startsWith(QLatin1String("above ")))
  599             return false;
  600         if (text.startsWith(QLatin1String("academy of ")))
  601             return false;
  602         if (text.startsWith(QLatin1String("according ")))
  603             return false;
  604         if (text.startsWith(QLatin1String("acknowledgment ")))
  605             return false;
  606         if (text.startsWith(QLatin1String("across ")))
  607             return false;
  608         if (text.startsWith(QLatin1String("added ")))
  609             return false;
  610         if (text.startsWith(QLatin1String("advances in ")))
  611             return false;
  612         if (text.startsWith(QLatin1String("after ")))
  613             return false;
  614         if (text.startsWith(QLatin1String("agreement ")))
  615             return false;
  616         if (text.startsWith(QLatin1String("all ")) && text.count(_space_char) < 3)
  617             return false;
  618         if (text.startsWith(QLatin1String("along ")))
  619             return false;
  620         if (text.startsWith(QLatin1String("also ")))
  621             return false;
  622         if (text.startsWith(QLatin1String("although ")))
  623             return false;
  624         if (text.startsWith(QLatin1String("among ")))
  625             return false;
  626         if (text.startsWith(QLatin1String("an ")))
  627             return false;
  628         if (text.startsWith(QLatin1String("and ")))
  629             return false;
  630         if (text.startsWith(QLatin1String("annals of ")))
  631             return false;
  632         if (text.startsWith(QLatin1String("any ")) && text.count(_space_char) < 3)
  633             return false;
  634         if (text.startsWith(QLatin1String("are ")))
  635             return false;
  636         if (text.startsWith(QLatin1String("as ")))
  637             return false;
  638         if (text.startsWith(QLatin1String("at ")))
  639             return false;
  640         if (text.startsWith(QLatin1String("author ")))
  641             return false;
  642         if (text.startsWith(QLatin1String("author's ")))
  643             return false;
  644         if (text.startsWith(QLatin1String("authors ")))
  645             return false;
  646     }
  647     else if (fc == 98)
  648     {
  649         if (text.startsWith(QLatin1String("based on ")))
  650             return false;
  651         if (text.startsWith(QLatin1String("be ")))
  652             return false;
  653         if (text.startsWith(QLatin1String("been ")))
  654             return false;
  655         if (text.startsWith(QLatin1String("before ")))
  656             return false;
  657         if (text.startsWith(QLatin1String("between ")))
  658             return false;
  659         if (text.startsWith(QLatin1String("both ")))
  660             return false;
  661         if (text.startsWith(QLatin1String("but ")))
  662             return false;
  663         if (text.startsWith(QLatin1String("by ")))
  664             return false;
  665     }
  666     else if (fc == 99)
  667     {
  668         if (text.startsWith(QLatin1String("calculated ")) && text.count(_space_char) < 3)
  669             return false;
  670         if (text.startsWith(QLatin1String("calculations ")))
  671             return false;
  672         if (text.startsWith(QLatin1String("can ")))
  673             return false;
  674         if (text.startsWith(QLatin1String("cannot ")))
  675             return false;
  676         if (text.startsWith(QLatin1String("capable ")))
  677             return false;
  678         if (text.startsWith(QLatin1String("carried ")))
  679             return false;
  680         if (text.startsWith(QLatin1String("case ")))
  681             return false;
  682         if (text.startsWith(QLatin1String("chapter ")))
  683             return false;
  684         if (text.startsWith(QLatin1String("chapters ")))
  685             return false;
  686         if (text.startsWith(QLatin1String("college of ")))
  687             return false;
  688         if (text.startsWith(QLatin1String("come ")))
  689             return false;
  690         if (text.startsWith(QLatin1String("conference on ")))
  691             return false;
  692         if (text.startsWith(QLatin1String("contribution from ")))
  693             return false;
  694         if (text.startsWith(QLatin1String("corresponding ")))
  695             return false;
  696         if (text.startsWith(QLatin1String("could ")))
  697             return false;
  698         if (text.startsWith(QLatin1String("current ")))
  699             return false;
  700         if (text.startsWith(QLatin1String("currently ")))
  701             return false;
  702     }
  703     else if (fc == 100)
  704     {
  705         if (text.startsWith(QLatin1String("data ")))
  706             return false;
  707         if (text.startsWith(QLatin1String("de ")))
  708             return false;
  709         if (text.startsWith(QLatin1String("department ")))
  710             return false;
  711         if (text.startsWith(QLatin1String("depicted ")))
  712             return false;
  713         if (text.startsWith(QLatin1String("depicts ")))
  714             return false;
  715         if (text.startsWith(QLatin1String("der ")))
  716             return false;
  717         if (text.startsWith(QLatin1String("described ")))
  718             return false;
  719         if (text.startsWith(QLatin1String("detailed ")))
  720             return false;
  721         if (text.startsWith(QLatin1String("details ")))
  722             return false;
  723         if (text.startsWith(QLatin1String("determined ")))
  724             return false;
  725         if (text.startsWith(QLatin1String("did not ")))
  726             return false;
  727         if (text.startsWith(QLatin1String("different ")) && text.count(_space_char) < 3)
  728             return false;
  729         if (text.startsWith(QLatin1String("difficult ")))
  730             return false;
  731         if (text.startsWith(QLatin1String("discussed ")))
  732             return false;
  733         if (text.startsWith(QLatin1String("dissolved ")))
  734             return false;
  735         if (text.startsWith(QLatin1String("do ")))
  736             return false;
  737         if (text.startsWith(QLatin1String("does ")))
  738             return false;
  739         if (text.startsWith(QLatin1String("due ")) && text.count(_space_char) < 3)
  740             return false;
  741         if (text.startsWith(QLatin1String("during ")))
  742             return false;
  743     }
  744     else if (fc == 101)
  745     {
  746         if (text.startsWith(QLatin1String("each ")))
  747             return false;
  748         if (text.startsWith(QLatin1String("easily ")))
  749             return false;
  750         if (text.startsWith(QLatin1String("easy ")))
  751             return false;
  752         if (text.startsWith(QLatin1String("edited ")))
  753             return false;
  754         if (text.startsWith(QLatin1String("effect ")) && text.count(_space_char) < 3)
  755             return false;
  756         if (text.startsWith(QLatin1String("effects ")) && text.count(_space_char) < 3)
  757             return false;
  758         if (text.startsWith(QLatin1String("either ")))
  759             return false;
  760         if (text.startsWith(QLatin1String("elsevier ")))
  761             return false;
  762     }
  763     else if (fc == 102)
  764     {
  765         if (text.startsWith(QLatin1String("faculty of ")))
  766             return false;
  767         if (text.startsWith(QLatin1String("figure ")))
  768             return false;
  769         if (text.startsWith(QLatin1String("figures ")))
  770             return false;
  771         if (text.startsWith(QLatin1String("find ")))
  772             return false;
  773         if (text.startsWith(QLatin1String("finding ")))
  774             return false;
  775         if (text.startsWith(QLatin1String("findings ")))
  776             return false;
  777         if (text.startsWith(QLatin1String("first ")) && text.count(_space_char) < 3)
  778             return false;
  779         if (text.startsWith(QLatin1String("followed ")))
  780             return false;
  781         if (text.startsWith(QLatin1String("following ")))
  782             return false;
  783         if (text.startsWith(QLatin1String("footnote ")))
  784             return false;
  785         if (text.startsWith(QLatin1String("for ")))
  786             return false;
  787         if (text.startsWith(QLatin1String("form ")))
  788             return false;
  789         if (text.startsWith(QLatin1String("found ")))
  790             return false;
  791         if (text.startsWith(QLatin1String("foundation ")))
  792             return false;
  793         if (text.startsWith(QLatin1String("free of charge ")))
  794             return false;
  795         if (text.startsWith(QLatin1String("from ")))
  796             return false;
  797         if (text.startsWith(QLatin1String("fur ")))
  798             return false;
  799         if (text.startsWith(QLatin1String("further ")))
  800             return false;
  801     }
  802     else if (fc == 103)
  803     {
  804         if (text.startsWith(QLatin1String("gave ")))
  805             return false;
  806         if (text.startsWith(QLatin1String("give ")))
  807             return false;
  808         if (text.startsWith(QLatin1String("given ")))
  809             return false;
  810         if (text.startsWith(QLatin1String("gives ")))
  811             return false;
  812         if (text.startsWith(QLatin1String("go ")))
  813             return false;
  814         if (text.startsWith(QLatin1String("goes ")))
  815             return false;
  816         if (text.startsWith(QLatin1String("going ")))
  817             return false;
  818         if (text.startsWith(QLatin1String("groups ")))
  819             return false;
  820     }
  821     else if (fc == 104)
  822     {
  823         if (text.startsWith(QLatin1String("has ")))
  824             return false;
  825         if (text.startsWith(QLatin1String("have ")))
  826             return false;
  827         if (text.startsWith(QLatin1String("he ")))
  828             return false;
  829         if (text.startsWith(QLatin1String("highlighted ")))
  830             return false;
  831         if (text.startsWith(QLatin1String("his ")))
  832             return false;
  833         if (text.startsWith(QLatin1String("how ")))
  834             return false;
  835     }
  836     else if (fc == 105)
  837     {
  838         if (text.startsWith(QLatin1String("i ")))
  839             return false;
  840         if (text.startsWith(QLatin1String("if ")))
  841             return false;
  842         if (text.startsWith(QLatin1String("ii ")))
  843             return false;
  844         if (text.startsWith(QLatin1String("iii ")))
  845             return false;
  846         if (text.startsWith(QLatin1String("in ")))
  847             return false;
  848         if (text.startsWith(QLatin1String("initially ")))
  849             return false;
  850         if (text.startsWith(QLatin1String("int j ")))
  851             return false;
  852         if (text.startsWith(QLatin1String("international conference on ")))
  853             return false;
  854         if (text.startsWith(QLatin1String("international journal ")))
  855             return false;
  856         if (text.startsWith(QLatin1String("into ")))
  857             return false;
  858         if (text.startsWith(QLatin1String("is ")))
  859             return false;
  860         if (text.startsWith(QLatin1String("it ")))
  861             return false;
  862         if (text.startsWith(QLatin1String("its ")))
  863             return false;
  864     }
  865     else if (fc == 106)
  866     {
  867         if (text.startsWith(QLatin1String("j ")))
  868             return false;
  869         if (text.startsWith(QLatin1String("journal of ")))
  870             return false;
  871         if (text.startsWith(QLatin1String("journal on ")))
  872             return false;
  873         if (text.startsWith(QLatin1String("just ")))
  874             return false;
  875     }
  876     else if (fc == 107)
  877     {
  878         if (text.startsWith(QLatin1String("kept ")))
  879             return false;
  880     }
  881     else if (fc == 108)
  882     {
  883         if (text.startsWith(QLatin1String("laboratory for ")))
  884             return false;
  885         if (text.startsWith(QLatin1String("laboratory of ")))
  886             return false;
  887         if (text.startsWith(QLatin1String("larger ")))
  888             return false;
  889         if (text.startsWith(QLatin1String("less ")))
  890             return false;
  891         if (text.startsWith(QLatin1String("let ")))
  892             return false;
  893         if (text.startsWith(QLatin1String("lower ")))
  894             return false;
  895     }
  896     else if (fc == 109)
  897     {
  898         if (text.startsWith(QLatin1String("may ")))
  899             return false;
  900         if (text.startsWith(QLatin1String("mol ")))
  901             return false;
  902         if (text.startsWith(QLatin1String("more ")))
  903             return false;
  904         if (text.startsWith(QLatin1String("most ")) && text.count(_space_char) < 3)
  905             return false;
  906         if (text.startsWith(QLatin1String("much ")) && text.count(_space_char) < 3)
  907             return false;
  908         if (text.startsWith(QLatin1String("my ")))
  909             return false;
  910     }
  911     else if (fc == 110)
  912     {
  913         if (text.startsWith(QLatin1String("national academy of ")))
  914             return false;
  915         if (text.startsWith(QLatin1String("need ")))
  916             return false;
  917         if (text.startsWith(QLatin1String("new ")))
  918             return false;
  919         if (text.startsWith(QLatin1String("not ")) && text.count(_space_char) < 3)
  920             return false;
  921     }
  922     else if (fc == 111)
  923     {
  924         if (text.startsWith(QLatin1String("observed ")))
  925             return false;
  926         if (text.startsWith(QLatin1String("obtained ")))
  927             return false;
  928         if (text.startsWith(QLatin1String("occur ")))
  929             return false;
  930         if (text.startsWith(QLatin1String("occurs ")))
  931             return false;
  932         if (text.startsWith(QLatin1String("of ")))
  933             return false;
  934         if (text.startsWith(QLatin1String("office ")))
  935             return false;
  936         if (text.startsWith(QLatin1String("on ")))
  937             return false;
  938         if (text.startsWith(QLatin1String("only ")))
  939             return false;
  940         if (text.startsWith(QLatin1String("onto ")))
  941             return false;
  942         if (text.startsWith(QLatin1String("or ")))
  943             return false;
  944         if (text.startsWith(QLatin1String("other ")))
  945             return false;
  946         if (text.startsWith(QLatin1String("otherwise ")))
  947             return false;
  948         if (text.startsWith(QLatin1String("our ")))
  949             return false;
  950         if (text.startsWith(QLatin1String("over ")))
  951             return false;
  952     }
  953     else if (fc == 112)
  954     {
  955         if (text.startsWith(QLatin1String("per ")))
  956             return false;
  957         if (text.startsWith(QLatin1String("permission of ")))
  958             return false;
  959         if (text.startsWith(QLatin1String("play ")))
  960             return false;
  961         if (text.startsWith(QLatin1String("played ")))
  962             return false;
  963         if (text.startsWith(QLatin1String("please ")))
  964             return false;
  965         if (text.startsWith(QLatin1String("possible ")))
  966             return false;
  967         if (text.startsWith(QLatin1String("present")))
  968             return false;
  969         if (text.startsWith(QLatin1String("previous ")))
  970             return false;
  971         if (text.startsWith(QLatin1String("proceedings ")))
  972             return false;
  973         if (text.startsWith(QLatin1String("proof ")))
  974             return false;
  975     }
  976     else if (fc == 113)
  977     {
  978         if (text.startsWith(QLatin1String("quite ")))
  979             return false;
  980     }
  981     else if (fc == 114)
  982     {
  983         if (text.startsWith(QLatin1String("rather ")))
  984             return false;
  985         if (text.startsWith(QLatin1String("rest of ")))
  986             return false;
  987         if (text.startsWith(QLatin1String("result of ")))
  988             return false;
  989         if (text.startsWith(QLatin1String("reviews in ")))
  990             return false;
  991         if (text.startsWith(QLatin1String("run ")))
  992             return false;
  993     }
  994     else if (fc == 115)
  995     {
  996         if (text.startsWith(QLatin1String("same ")))
  997             return false;
  998         if (text.startsWith(QLatin1String("see ")))
  999             return false;
 1000         if (text.startsWith(QLatin1String("several ")))
 1001             return false;
 1002         if (text.startsWith(QLatin1String("shall ")))
 1003             return false;
 1004         if (text.startsWith(QLatin1String("show ")))
 1005             return false;
 1006         if (text.startsWith(QLatin1String("shown ")))
 1007             return false;
 1008         if (text.startsWith(QLatin1String("since ")))
 1009             return false;
 1010         if (text.startsWith(QLatin1String("so ")))
 1011             return false;
 1012         if (text.startsWith(QLatin1String("some ")))
 1013             return false;
 1014         if (text.startsWith(QLatin1String("strongly ")))
 1015             return false;
 1016         if (text.startsWith(QLatin1String("studied ")))
 1017             return false;
 1018         if (text.startsWith(QLatin1String("studies ")))
 1019             return false;
 1020         if (text.startsWith(QLatin1String("study ")))
 1021             return false;
 1022         if (text.startsWith(QLatin1String("such ")))
 1023             return false;
 1024         if (text.startsWith(QLatin1String("supporting ")))
 1025             return false;
 1026         if (text.startsWith(QLatin1String("suppose ")))
 1027             return false;
 1028         if (text.startsWith(QLatin1String("symposium on ")))
 1029             return false;
 1030     }
 1031     else if (fc == 116)
 1032     {
 1033         if (text.startsWith(QLatin1String("than ")))
 1034             return false;
 1035         if (text.startsWith(QLatin1String("thank ")))
 1036             return false;
 1037         if (text.startsWith(QLatin1String("that ")))
 1038             return false;
 1039         if (text.startsWith(QLatin1String("the ")))
 1040             return false;
 1041         if (text.startsWith(QLatin1String("their ")))
 1042             return false;
 1043         if (text.startsWith(QLatin1String("then ")))
 1044             return false;
 1045         if (text.startsWith(QLatin1String("there ")))
 1046             return false;
 1047         if (text.startsWith(QLatin1String("these ")))
 1048             return false;
 1049         if (text.startsWith(QLatin1String("they ")))
 1050             return false;
 1051         if (text.startsWith(QLatin1String("this ")))
 1052             return false;
 1053         if (text.startsWith(QLatin1String("those ")))
 1054             return false;
 1055         if (text.startsWith(QLatin1String("thus ")))
 1056             return false;
 1057         if (text.startsWith(QLatin1String("title ")))
 1058             return false;
 1059         if (text.startsWith(QLatin1String("to ")))
 1060             return false;
 1061         if (text.startsWith(QLatin1String("too ")))
 1062             return false;
 1063         if (text.startsWith(QLatin1String("top ")))
 1064             return false;
 1065         if (text.startsWith(QLatin1String("transactions on ")))
 1066             return false;
 1067     }
 1068     else if (fc == 117)
 1069     {
 1070         if (text.startsWith(QLatin1String("under ")))
 1071             return false;
 1072         if (text.startsWith(QLatin1String("us ")))
 1073             return false;
 1074         if (text.startsWith(QLatin1String("use ")))
 1075             return false;
 1076         if (text.startsWith(QLatin1String("used ")))
 1077             return false;
 1078         if (text.startsWith(QLatin1String("uses ")))
 1079             return false;
 1080         if (text.startsWith(QLatin1String("using ")))
 1081             return false;
 1082     }
 1083     else if (fc == 118)
 1084     {
 1085         if (text.startsWith(QLatin1String("very ")) && text.count(_space_char) < 3)
 1086             return false;
 1087     }
 1088     else if (fc == 119)
 1089     {
 1090         if (text.startsWith(QLatin1String("was ")))
 1091             return false;
 1092         if (text.startsWith(QLatin1String("we ")))
 1093             return false;
 1094         if (text.startsWith(QLatin1String("were ")))
 1095             return false;
 1096         if (text.startsWith(QLatin1String("were ")))
 1097             return false;
 1098         if (text.startsWith(QLatin1String("what ")))
 1099             return false;
 1100         if (text.startsWith(QLatin1String("when ")))
 1101             return false;
 1102         if (text.startsWith(QLatin1String("whenever ")))
 1103             return false;
 1104         if (text.startsWith(QLatin1String("where ")))
 1105             return false;
 1106         if (text.startsWith(QLatin1String("whether ")))
 1107             return false;
 1108         if (text.startsWith(QLatin1String("which ")))
 1109             return false;
 1110         if (text.startsWith(QLatin1String("while ")))
 1111             return false;
 1112         if (text.startsWith(QLatin1String("whose ")))
 1113             return false;
 1114         if (text.startsWith(QLatin1String("wiley ")))
 1115             return false;
 1116         if (text.startsWith(QLatin1String("will ")))
 1117             return false;
 1118         if (text.startsWith(QLatin1String("with ")))
 1119             return false;
 1120         if (text.startsWith(QLatin1String("within ")))
 1121             return false;
 1122         if (text.startsWith(QLatin1String("without ")))
 1123             return false;
 1124         if (text.startsWith(QLatin1String("work ")))
 1125             return false;
 1126         if (text.startsWith(QLatin1String("workshop on ")))
 1127             return false;
 1128         if (text.startsWith(QLatin1String("worth ")))
 1129             return false;
 1130         if (text.startsWith(QLatin1String("would ")))
 1131             return false;
 1132     }
 1133     else if (fc == 120)
 1134     {
 1135         if (text.startsWith(QLatin1String("x ")))
 1136             return false;
 1137     }
 1138     else if (fc == 121)
 1139     {
 1140         if (text.startsWith(QLatin1String("yes ")))
 1141             return false;
 1142         if (text.startsWith(QLatin1String("you ")))
 1143             return false;
 1144         if (text.startsWith(QLatin1String("your ")))
 1145             return false;
 1146     }
 1147 
 1148     const ushort lc(text.at(text.length() - 1).unicode());
 1149     if (lc == 95)
 1150         return false;
 1151     if (lc == 97)
 1152     {
 1153         if (text.endsWith(QLatin1String(" a")))
 1154             return false;
 1155         if (text.endsWith(QLatin1String(" via")))
 1156             return false;
 1157     }
 1158     else if (lc == 100)
 1159     {
 1160         if (text.endsWith(QLatin1String(" and")))
 1161             return false;
 1162         if (text.endsWith(QLatin1String(" applied")))
 1163             return false;
 1164         if (text.endsWith(QLatin1String(" carried")))
 1165             return false;
 1166         if (text.endsWith(QLatin1String(" could")))
 1167             return false;
 1168         if (text.endsWith(QLatin1String(" performed")))
 1169             return false;
 1170         if (text.endsWith(QLatin1String(" second")))
 1171             return false;
 1172         if (text.endsWith(QLatin1String(" should")))
 1173             return false;
 1174         if (text.endsWith(QLatin1String(" showed")))
 1175             return false;
 1176         if (text.endsWith(QLatin1String(" third")))
 1177             return false;
 1178         if (text.endsWith(QLatin1String(" used")))
 1179             return false;
 1180         if (text.endsWith(QLatin1String(" would")))
 1181             return false;
 1182     }
 1183     else if (lc == 101)
 1184     {
 1185         if (text.endsWith(QLatin1String(" above")))
 1186             return false;
 1187         if (text.endsWith(QLatin1String(" are")))
 1188             return false;
 1189         if (text.endsWith(QLatin1String(" available")))
 1190             return false;
 1191         if (text.endsWith(QLatin1String(" be")))
 1192             return false;
 1193         if (text.endsWith(QLatin1String(" because")))
 1194             return false;
 1195         if (text.endsWith(QLatin1String(" due")))
 1196             return false;
 1197         if (text.endsWith(QLatin1String(" gave")))
 1198             return false;
 1199         if (text.endsWith(QLatin1String(" have")))
 1200             return false;
 1201         if (text.endsWith(QLatin1String(" he")))
 1202             return false;
 1203         if (text.endsWith(QLatin1String(" here")))
 1204             return false;
 1205         if (text.endsWith(QLatin1String(" importance")))
 1206             return false;
 1207         if (text.endsWith(QLatin1String(" indicate")))
 1208             return false;
 1209         if (text.endsWith(QLatin1String(" like")))
 1210             return false;
 1211         if (text.endsWith(QLatin1String(" made")))
 1212             return false;
 1213         if (text.endsWith(QLatin1String(" more")))
 1214             return false;
 1215         if (text.endsWith(QLatin1String(" same")))
 1216             return false;
 1217         if (text.endsWith(QLatin1String(" since")))
 1218             return false;
 1219         if (text.endsWith(QLatin1String(" the")))
 1220             return false;
 1221         if (text.endsWith(QLatin1String(" there")))
 1222             return false;
 1223         if (text.endsWith(QLatin1String(" these")))
 1224             return false;
 1225         if (text.endsWith(QLatin1String(" we")))
 1226             return false;
 1227         if (text.endsWith(QLatin1String(" were")))
 1228             return false;
 1229         if (text.endsWith(QLatin1String(" where")))
 1230             return false;
 1231         if (text.endsWith(QLatin1String(" where")))
 1232             return false;
 1233         if (text.endsWith(QLatin1String(" while")))
 1234             return false;
 1235         if (text.endsWith(QLatin1String(" whose")))
 1236             return false;
 1237     }
 1238     else if (lc == 102)
 1239     {
 1240         if (text.endsWith(QLatin1String(" if")))
 1241             return false;
 1242         if (text.endsWith(QLatin1String(" of")))
 1243             return false;
 1244     }
 1245     else if (lc == 103)
 1246     {
 1247         if (text.endsWith(QLatin1String(" according")))
 1248             return false;
 1249         if (text.endsWith(QLatin1String(" along")))
 1250             return false;
 1251         if (text.endsWith(QLatin1String(" belonging")))
 1252             return false;
 1253         if (text.endsWith(QLatin1String(" containing")))
 1254             return false;
 1255         if (text.endsWith(QLatin1String(" corresponding")))
 1256             return false;
 1257         if (text.endsWith(QLatin1String(" during")))
 1258             return false;
 1259         if (text.endsWith(QLatin1String(" fig")))
 1260             return false;
 1261         if (text.endsWith(QLatin1String(" having")))
 1262             return false;
 1263         if (text.endsWith(QLatin1String(" involving")))
 1264             return false;
 1265         if (text.endsWith(QLatin1String(" using")))
 1266             return false;
 1267     }
 1268     else if (lc == 104)
 1269     {
 1270         if (text.endsWith(QLatin1String(" although")))
 1271             return false;
 1272         if (text.endsWith(QLatin1String(" both")))
 1273             return false;
 1274         if (text.endsWith(QLatin1String(" much")))
 1275             return false;
 1276         if (text.endsWith(QLatin1String(" such")))
 1277             return false;
 1278         if (text.endsWith(QLatin1String(" through")))
 1279             return false;
 1280         if (text.endsWith(QLatin1String(" which")))
 1281             return false;
 1282         if (text.endsWith(QLatin1String(" with")))
 1283             return false;
 1284     }
 1285     else if (lc == 105)
 1286     {
 1287         if (text.endsWith(QLatin1String(" i")))
 1288             return false;
 1289     }
 1290     else if (lc == 108)
 1291     {
 1292         if (text.endsWith(QLatin1String(" all")))
 1293             return false;
 1294         if (text.endsWith(QLatin1String(" several")))
 1295             return false;
 1296         if (text.endsWith(QLatin1String(" will")))
 1297             return false;
 1298     }
 1299     else if (lc == 109)
 1300     {
 1301         if (text.endsWith(QLatin1String(" from")))
 1302             return false;
 1303         if (text.endsWith(QLatin1String(" them")))
 1304             return false;
 1305     }
 1306     else if (lc == 110)
 1307     {
 1308         if (text.endsWith(QLatin1String(" an")))
 1309             return false;
 1310         if (text.endsWith(QLatin1String(" been")))
 1311             return false;
 1312         if (text.endsWith(QLatin1String(" between")))
 1313             return false;
 1314         if (text.endsWith(QLatin1String(" can")))
 1315             return false;
 1316         if (text.endsWith(QLatin1String(" in")))
 1317             return false;
 1318         if (text.endsWith(QLatin1String(" known")))
 1319             return false;
 1320         if (text.endsWith(QLatin1String(" on")))
 1321             return false;
 1322         if (text.endsWith(QLatin1String(" shown")))
 1323             return false;
 1324         if (text.endsWith(QLatin1String(" shown")))
 1325             return false;
 1326         if (text.endsWith(QLatin1String(" supporting information")))
 1327             return false;
 1328         if (text.endsWith(QLatin1String(" taken")))
 1329             return false;
 1330         if (text.endsWith(QLatin1String(" than")))
 1331             return false;
 1332         if (text.endsWith(QLatin1String(" then")))
 1333             return false;
 1334         if (text.endsWith(QLatin1String(" upon")))
 1335             return false;
 1336         if (text.endsWith(QLatin1String(" when")))
 1337             return false;
 1338         if (text.endsWith(QLatin1String(" within")))
 1339             return false;
 1340     }
 1341     else if (lc == 111)
 1342     {
 1343         if (text.endsWith(QLatin1String(" also")))
 1344             return false;
 1345         if (text.endsWith(QLatin1String(" do")))
 1346             return false;
 1347         if (text.endsWith(QLatin1String(" into")))
 1348             return false;
 1349         if (text.endsWith(QLatin1String(" to")))
 1350             return false;
 1351         if (text.endsWith(QLatin1String(" two")))
 1352             return false;
 1353         if (text.endsWith(QLatin1String(" who")))
 1354             return false;
 1355     }
 1356     else if (lc == 114)
 1357     {
 1358         if (text.endsWith(QLatin1String(" after")))
 1359             return false;
 1360         if (text.endsWith(QLatin1String(" chapter")))
 1361             return false;
 1362         if (text.endsWith(QLatin1String(" elsevier")))
 1363             return false;
 1364         if (text.endsWith(QLatin1String(" for")))
 1365             return false;
 1366         if (text.endsWith(QLatin1String(" or")))
 1367             return false;
 1368         if (text.endsWith(QLatin1String(" our")))
 1369             return false;
 1370         if (text.endsWith(QLatin1String(" over")))
 1371             return false;
 1372         if (text.endsWith(QLatin1String(" per")))
 1373             return false;
 1374         if (text.endsWith(QLatin1String(" their")))
 1375             return false;
 1376         if (text.endsWith(QLatin1String(" under")))
 1377             return false;
 1378         if (text.endsWith(QLatin1String(" whether")))
 1379             return false;
 1380         if (text.endsWith(QLatin1String(" your")))
 1381             return false;
 1382     }
 1383     else if (lc == 115)
 1384     {
 1385         if (text.endsWith(QLatin1String(" across")))
 1386             return false;
 1387         if (text.endsWith(QLatin1String(" as")))
 1388             return false;
 1389         if (text.endsWith(QLatin1String(" does")))
 1390             return false;
 1391         if (text.endsWith(QLatin1String(" does")))
 1392             return false;
 1393         if (text.endsWith(QLatin1String(" figs")))
 1394             return false;
 1395         if (text.endsWith(QLatin1String(" follows")))
 1396             return false;
 1397         if (text.endsWith(QLatin1String(" gives")))
 1398             return false;
 1399         if (text.endsWith(QLatin1String(" has")))
 1400             return false;
 1401         if (text.endsWith(QLatin1String(" his")))
 1402             return false;
 1403         if (text.endsWith(QLatin1String(" is")))
 1404             return false;
 1405         if (text.endsWith(QLatin1String(" its")))
 1406             return false;
 1407         if (text.endsWith(QLatin1String(" ones")))
 1408             return false;
 1409         if (text.endsWith(QLatin1String(" shows")))
 1410             return false;
 1411         if (text.endsWith(QLatin1String(" this")))
 1412             return false;
 1413         if (text.endsWith(QLatin1String(" us")))
 1414             return false;
 1415         if (text.endsWith(QLatin1String(" was")))
 1416             return false;
 1417         if (text.endsWith(QLatin1String(" yes")))
 1418             return false;
 1419         if (text.endsWith(QLatin1String("acids res")))
 1420             return false;
 1421     }
 1422     else if (lc == 116)
 1423     {
 1424         if (text.endsWith(QLatin1String(" about")))
 1425             return false;
 1426         if (text.endsWith(QLatin1String(" at")))
 1427             return false;
 1428         if (text.endsWith(QLatin1String(" department")))
 1429             return false;
 1430         if (text.endsWith(QLatin1String(" et")))
 1431             return false;
 1432         if (text.endsWith(QLatin1String(" first")))
 1433             return false;
 1434         if (text.endsWith(QLatin1String(" important")))
 1435             return false;
 1436         if (text.endsWith(QLatin1String(" it")))
 1437             return false;
 1438         if (text.endsWith(QLatin1String(" let")))
 1439             return false;
 1440         if (text.endsWith(QLatin1String(" most")))
 1441             return false;
 1442         if (text.endsWith(QLatin1String(" must")))
 1443             return false;
 1444         if (text.endsWith(QLatin1String(" not")))
 1445             return false;
 1446         if (text.endsWith(QLatin1String(" out")))
 1447             return false;
 1448         if (text.endsWith(QLatin1String(" suggest")))
 1449             return false;
 1450         if (text.endsWith(QLatin1String(" that")))
 1451             return false;
 1452         if (text.endsWith(QLatin1String(" without")))
 1453             return false;
 1454     }
 1455     else if (lc == 117)
 1456     {
 1457         if (text.endsWith(QLatin1String(" you")))
 1458             return false;
 1459     }
 1460     else if (lc == 119)
 1461     {
 1462         if (text.endsWith(QLatin1String(" how")))
 1463             return false;
 1464         if (text.endsWith(QLatin1String(" new")))
 1465             return false;
 1466         if (text.endsWith(QLatin1String(" show")))
 1467             return false;
 1468     }
 1469     else if (lc == 121)
 1470     {
 1471         if (text.endsWith(QLatin1String(" any")))
 1472             return false;
 1473         if (text.endsWith(QLatin1String(" by")))
 1474             return false;
 1475         if (text.endsWith(QLatin1String(" may")))
 1476             return false;
 1477         if (text.endsWith(QLatin1String(" my")))
 1478             return false;
 1479         if (text.endsWith(QLatin1String(" only")))
 1480             return false;
 1481         if (text.endsWith(QLatin1String(" society")))
 1482             return false;
 1483         if (text.endsWith(QLatin1String(" they")))
 1484             return false;
 1485         if (text.endsWith(QLatin1String(" wiley")))
 1486             return false;
 1487     }
 1488 
 1489     if (text.contains(QLatin1String("_ ")))
 1490         return false;
 1491     if (text.contains(QLatin1String(" _")))
 1492         return false;
 1493 
 1494     if (_last_equals_first(text))
 1495         return false;
 1496 
 1497     return true;
 1498 }
 1499 
 1500 QString collectionIndex::preprocessedText(const QString& journal, const QString& text) const
 1501 {
 1502     QString pt(text.toLower());
 1503     if (journal.contains(_space_char))
 1504     {
 1505         QStringList jns;
 1506         jns.append(journal.toLower());
 1507         jns.append(_bpP->fullJournal(journal).toLower());
 1508         jns.append(_bpP->abbreviatedJournal(journal).toLower());
 1509         jns.append(QString(jns.at(0)).remove('.'));
 1510         jns.append(QString(jns.at(1)).remove('.'));
 1511         jns.append(QString(jns.at(2)).remove('.'));
 1512         jns.removeDuplicates();
 1513         for (int i = 0; i < jns.count(); ++i)
 1514             c2bUtils::fillString(pt, QStringMatcher(jns.at(i), Qt::CaseSensitive), '.');
 1515     }
 1516 
 1517     c2bUtils::fillString(pt, txtmatcher("\"", Qt::CaseSensitive, 0), ' ');
 1518     c2bUtils::fillString(pt, txtmatcher("-", Qt::CaseSensitive, 0), '_');
 1519     pt.replace(QRegularExpression("_{2,}", _qre_pattern_options), "_");
 1520 
 1521     pt.replace(QRegularExpression("\\bin *situ\\b", _qre_pattern_options), " in_situ ");
 1522     pt.replace(QRegularExpression("\\bin *vivo\\b", _qre_pattern_options), " in_vivo ");
 1523     pt.replace(QRegularExpression("\\bin *vitro\\b", _qre_pattern_options), " in_vitro ");
 1524     pt.replace(QRegularExpression("\\bex *situ\\b", _qre_pattern_options), " ex_situ ");
 1525     pt.replace(QRegularExpression("\\bex *vivo\\b", _qre_pattern_options), " ex_vivo ");
 1526     pt.replace(QRegularExpression("\\bex *vitro\\b", _qre_pattern_options), " ex_vitro ");
 1527 
 1528     // Clear some strings
 1529     c2bUtils::fillString(pt, QRegularExpression("\\bbased (?=on\\b)", _qre_pattern_options), ' ');
 1530     c2bUtils::fillString(pt, QRegularExpression("\\bin order (?=to\\b)", _qre_pattern_options), ' ');
 1531 
 1532     c2bUtils::fillString(pt, txtmatcher(" a ", Qt::CaseSensitive, 0), ' ');
 1533     c2bUtils::fillString(pt, txtmatcher(" an ", Qt::CaseSensitive, 2), ' ');
 1534     c2bUtils::fillString(pt, txtmatcher(" its ", Qt::CaseSensitive, 3), ' ');
 1535     c2bUtils::fillString(pt, txtmatcher(" the ", Qt::CaseSensitive, 2), ' ');
 1536     c2bUtils::fillString(pt, txtmatcher(" their ", Qt::CaseSensitive, 2), ' ');
 1537     c2bUtils::fillString(pt, txtmatcher(" '", Qt::CaseSensitive, 1), ' ');
 1538     c2bUtils::fillString(pt, txtmatcher("' ", Qt::CaseSensitive, 0), ' ');
 1539 
 1540     // Set additional breakpoints
 1541     c2bUtils::fillString(pt, txtmatcher(" are ", Qt::CaseSensitive, 2), '.');
 1542     c2bUtils::fillString(pt, txtmatcher(" be ", Qt::CaseSensitive, 1), '.');
 1543     c2bUtils::fillString(pt, txtmatcher(" is ", Qt::CaseSensitive, 2), '.');
 1544     c2bUtils::fillString(pt, txtmatcher(" was ", Qt::CaseSensitive, 1), '.');
 1545     c2bUtils::fillString(pt, txtmatcher(" were ", Qt::CaseSensitive, 1), '.');
 1546 
 1547     c2bUtils::fillString(pt, QRegularExpression("\\binstitut\\w*", _qre_pattern_options), '.');
 1548     c2bUtils::fillString(pt, QRegularExpression("\\buniversi\\w*", _qre_pattern_options), '.');
 1549     c2bUtils::fillString(pt, QRegularExpression("\\bdoi\\w*", _qre_pattern_options), '.');
 1550     c2bUtils::fillString(pt, QRegularExpression("\\bet al\\w*", _qre_pattern_options), '.');
 1551 
 1552     // Normalize proper names
 1553     c2bUtils::fillString(pt, QRegularExpression("\\bdo not\\b", _qre_pattern_options), '.');
 1554     pt.replace(QRegularExpression("\\b" + _bpP->authorPrefixes() + "\\s", _qre_pattern_options),
 1555                QString("\\1%1").arg(QChar(127)));
 1556     replace(pt, QChar(127), '_');
 1557 
 1558     // Remove noisy elements from PDF articles
 1559     c2bUtils::fillString(pt, QStringMatcher("page intentionally left", Qt::CaseSensitive), '.');
 1560     c2bUtils::fillString(pt, QStringMatcher("journal article", Qt::CaseSensitive), '.');
 1561     c2bUtils::fillString(pt, QRegularExpression(" \\w \\w \\w ", _qre_pattern_options), '.');
 1562 
 1563     c2bUtils::stripDiacritics(pt);
 1564     c2bUtils::simplifyString(pt);
 1565 
 1566     return pt;
 1567 }
 1568 
 1569 QString& collectionIndex::replace(QString& str, const QChar& a, const QChar& b) const
 1570 {
 1571     const int length(str.length());
 1572     QChar* const s(str.data());
 1573     int i(0);
 1574     while (i < length)
 1575     {
 1576         QChar& c(s[i]);
 1577         if (c == a)
 1578             c = b;
 1579         ++i;
 1580     }
 1581     return str;
 1582 }