"Fossies" - the Fresh Open Source Software Archive

Member "cb2bib-2.0.1/src/c2b/heuristicBibParser.cpp" (12 Feb 2021, 35381 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "heuristicBibParser.cpp", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.0.0_vs_2.0.1.

    1 /***************************************************************************
    2  *   Copyright (C) 2004-2021 by Pere Constans
    3  *   constans@molspaces.com
    4  *   cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
    5  *   See the LICENSE file that comes with this distribution.
    6  ***************************************************************************/
    7 #include "heuristicBibParser.h"
    8 
    9 #include "bibParser.h"
   10 #include "journalDB.h"
   11 
   12 #include <QDate>
   13 
   14 #include <cmath>
   15 
   16 
   17 heuristicBibParser::heuristicBibParser(bibParser* bp)
   18     : _bpP(bp),
   19       _current_reference(bp->_current_reference),
   20       _reliable_number(false),
   21       _reliable_pages(false),
   22       _reliable_volume(false),
   23       _abstract("abstract"),
   24       _addauthors("addauthors"),
   25       _author("author"),
   26       _isbn("isbn"),
   27       _keywords("keywords"),
   28       _number("number"),
   29       _pages("pages"),
   30       _title("title"),
   31       _volume("volume"),
   32       _year("year"),
   33       _bibliographic_fields(bp->_bibliographic_fields),
   34       _journal_db(*bp->_journal_dbP)
   35 {
   36     _leading_non_letters = QRegExp("^[^<\\w]+");
   37     // Char 65533 appears many times in pdftotext outputs due to encoding errors
   38     _hyphens = QRegExp("\\s*[-" + QString(QChar(8211)) + QString(QChar(8722)) + QString(QChar(65533)) + "]+\\s*");
   39     _hyphen_nums = "(?:\\d+|\\d+-\\d+)";
   40     _hyphen_pages = "(?:\\d+|\\d+-\\d+p{0,2}|\\d+ \\d+pp)";
   41     _max_year = 3 + QDate::currentDate().year();
   42 
   43     // Setup author recognition
   44     // Implementation of author field extraction
   45     // P. Constans. A Simple Extraction Procedure for Bibliographical Author Field.
   46     // arXiv:0902.0755, 2009.
   47     _word_prefix_lexicon =
   48         c2bUtils::fileToString(":/txt/txt/word_prefix_lexicon.txt").split(c2bUtils::nonLetter, QString::SkipEmptyParts);
   49     for (int i = 0; i < _word_prefix_lexicon.count(); ++i)
   50     {
   51         _word_prefix_lexicon[i].replace('_', ' ');
   52         _word_prefix_lexicon[i].squeeze();
   53     }
   54     const QString author_lc_t("(?:n%1n|n%1%1n|%1n|%1nn|%1%1n|%1%1%1n|n%1nn|nn%1n|%1%1nn|%1n%1n|nn|nnn)");
   55     const QString author_uc_t(
   56         "(?:[nN]%1N|[nN]%1%1N|%1N|%1[nN]N|%1%1N|%1%1%1N|[nN]%1[nN]N|[nN]N%1N|%1%1[nN]N|%1[nN]%1N|[nN]N|[nN][nN]N)");
   57     const QString author_initial("Ip{0,1}");
   58     const QString author_line("L%1(?:[,;&L]+%1)*(?=L)");
   59     const QString author_lc(author_line.arg(author_lc_t.arg(author_initial)));
   60     const QString author_uc(author_line.arg(author_uc_t.arg(author_initial)));
   61     const QString author_address("(?:L[^L]*)");
   62 
   63     _author_sb = QRegExp(QString("L%1(?:[,;&]+L{0,2}%1)*[,;&L]*L").arg(author_lc_t.arg(author_initial)),
   64                          Qt::CaseInsensitive, QRegExp::RegExp);
   65     _author_sb_lc = QRegExp(QString("L%1(?:[,;&L]+%1)*(?=L)").arg(author_lc_t.arg(author_initial)), Qt::CaseSensitive,
   66                             QRegExp::RegExp);
   67     _author_sb_uc = QRegExp(QString("L%1(?:[,;&L]+%1)*(?=L)").arg(author_uc_t.arg(author_initial)), Qt::CaseSensitive,
   68                             QRegExp::RegExp);
   69     _author_sb.setMinimal(false);
   70     _author_sb_lc.setMinimal(false);
   71     _author_sb_uc.setMinimal(false);
   72     // Note: Syntax must be RegExp and not RegExp2.
   73     // If not, in cases as 'LnnL ... Lnn,nn,Lnn,&nnL' it will overextend addresses to include Lnn,nn,L.
   74     // This is not related to minimal/greedy.
   75     // Note also that Lnn,nn,L will not be taken, unless it is followed by another author line. This is a feature.
   76     _author_b2_lc_rx =
   77         new QRegExp(QString("(%1)%2{0,7}(%1)").arg(author_lc, author_address), Qt::CaseSensitive, QRegExp::RegExp);
   78     _author_b2_uc_rx =
   79         new QRegExp(QString("(%1)%2{0,7}(%1)").arg(author_uc, author_address), Qt::CaseSensitive, QRegExp::RegExp);
   80     _author_b3_lc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)").arg(author_lc, author_address),
   81                                    Qt::CaseSensitive, QRegExp::RegExp);
   82     _author_b3_uc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)").arg(author_uc, author_address),
   83                                    Qt::CaseSensitive, QRegExp::RegExp);
   84     _author_b4_lc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1)").arg(author_lc, author_address),
   85                                    Qt::CaseSensitive, QRegExp::RegExp);
   86     _author_b4_uc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1)").arg(author_uc, author_address),
   87                                    Qt::CaseSensitive, QRegExp::RegExp);
   88     _author_b5_lc_rx =
   89         new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1)").arg(author_lc, author_address),
   90                     Qt::CaseSensitive, QRegExp::RegExp);
   91     _author_b5_uc_rx =
   92         new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1)").arg(author_uc, author_address),
   93                     Qt::CaseSensitive, QRegExp::RegExp);
   94     _author_b2_lc_rx->setMinimal(true);
   95     _author_b2_uc_rx->setMinimal(true);
   96     _author_b3_lc_rx->setMinimal(true);
   97     _author_b3_uc_rx->setMinimal(true);
   98     _author_b4_lc_rx->setMinimal(true);
   99     _author_b4_uc_rx->setMinimal(true);
  100     _author_b5_lc_rx->setMinimal(true);
  101     _author_b5_uc_rx->setMinimal(true);
  102 }
  103 
  104 heuristicBibParser::~heuristicBibParser()
  105 {
  106     delete _author_b2_lc_rx;
  107     delete _author_b2_uc_rx;
  108     delete _author_b3_lc_rx;
  109     delete _author_b3_uc_rx;
  110     delete _author_b4_lc_rx;
  111     delete _author_b4_uc_rx;
  112     delete _author_b5_lc_rx;
  113     delete _author_b5_uc_rx;
  114 }
  115 
  116 
  117 void heuristicBibParser::guessFields(const QString& clean_text, const QString& tagged_text)
  118 {
  119     QString clean_num(clean_text);
  120     clean_num.replace(_hyphens, "-");
  121     // Order is important to increase the chances of a proper recognition
  122     guessJournal(clean_text);
  123     guessAbstract(tagged_text);
  124     guessAuthor(tagged_text);
  125     guessKeywords(tagged_text);
  126     guessYear(clean_num);
  127     _debug_guess("guessYear");
  128     guessVolume(clean_num);
  129     _debug_guess("guessVolume");
  130     guessNumber(clean_num);
  131     _debug_guess("guessNumber");
  132     guessPages(clean_num);
  133     _debug_guess("guessPages");
  134     guessTitle(tagged_text);
  135     guessISBN(clean_num);
  136     guessVolumeYearPages(clean_num);
  137     _debug_guess("guessVolumeYearPages");
  138     guessYearVolumePages(clean_num);
  139     _debug_guess("guessYearVolumePages");
  140     guessVolumePagesYear(clean_num);
  141     _debug_guess("guessVolumePagesYear");
  142     guessFromMetadata(clean_text);
  143 }
  144 
  145 void heuristicBibParser::heuristicFields(const QString& text)
  146 {
  147     // Heuristics for reasonably secure extraction
  148     QRegExp rxdoi("(10\\.[\\d\\.]+/\\S+)");
  149     if (rxdoi.indexIn(text) > -1)
  150     {
  151         QString cdoi(rxdoi.cap(1));
  152         // This happens when publishers set doi to title in metadata: <title>doi:10. ... </title>
  153         if (cdoi.endsWith("</title>"))
  154             cdoi.chop(8);
  155         cdoi.remove(QRegExp("[\\,\"\\}\\)]+$"));
  156         _current_reference["doi"] = _bpP->parse("doi", cdoi);
  157     }
  158 
  159     QRegExp rxarxiv("arXiv:([\\w\\./-]+)");
  160     if (rxarxiv.indexIn(text) > -1)
  161     {
  162         // https://arxiv.org/hypertex/bibstyles/
  163         QString aid(rxarxiv.cap(1));
  164         aid.remove(QRegExp("v\\d{1,2}$"));
  165         _current_reference["eprint"] = _bpP->parse("eprint", aid);
  166         _current_reference["journal"] = _bpP->parse("journal", "arXiv:" + aid);
  167         _current_reference["url"] = _bpP->parse("url", c2bUtils::arxivUrl.arg(aid));
  168     }
  169 
  170     QRegExp rxhtml("((http://|https://|ftp://|www\\.|ftp\\.)(www\\.|ftp\\.){0,1}\\S+)");
  171     if (rxhtml.indexIn(text) > -1)
  172         _current_reference["url"] = _bpP->parse("url", rxhtml.cap(1));
  173 }
  174 
  175 /** \page heuristics Field Recognition Rules
  176 
  177 - <b>Abstract</b>
  178   - If <tt>Abstract\b</tt> is found.
  179   - If <tt>Summary\b</tt> is found.
  180 
  181 */
  182 void heuristicBibParser::guessAbstract(const QString& text)
  183 {
  184     // Check whether text might come from a web page or from a PDF with new lines
  185     // Set line length to 93
  186     // Check first for abstract, check later for summary
  187     if (text.contains("Abstract", Qt::CaseInsensitive))
  188     {
  189         const QString crl(QChar(169)); // Avoid abstract overextending to copyright line.
  190         QRegExp rxH("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Abstract[\\s\\.:]*<NewLine\\d+>(.+)(?:" + crl + "|<NewLine|$)",
  191                     Qt::CaseInsensitive);
  192         rxH.setMinimal(true);
  193         int nH(rxH.indexIn(text));
  194         if (nH > -1)
  195             if (rxH.cap(1).length() > 93)
  196             {
  197                 const QString val(rxH.cap(1).remove(_leading_non_letters));
  198                 _current_reference[_abstract] = _bpP->parse(_abstract, val);
  199                 return;
  200             }
  201         rxH = QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Abstract[\\s\\.:]*<NewLine\\d+>(.+)(?:" + crl +
  202                       "|<NewLine\\d+>\\s*<NewLine|$)",
  203                       Qt::CaseInsensitive);
  204         rxH.setMinimal(true);
  205         nH = rxH.indexIn(text);
  206         if (nH > -1)
  207         {
  208             const QString val(rxH.cap(1).remove(_leading_non_letters));
  209             _current_reference[_abstract] = _bpP->parse(_abstract, val);
  210             return;
  211         }
  212         rxH = QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Abstract\\b(.+)(?:" + crl + "|<NewLine|$)", Qt::CaseInsensitive);
  213         rxH.setMinimal(true);
  214         nH = rxH.indexIn(text);
  215         if (nH > -1)
  216             if (rxH.cap(1).length() > 93)
  217             {
  218                 const QString val(rxH.cap(1).remove(_leading_non_letters));
  219                 _current_reference[_abstract] = _bpP->parse(_abstract, val);
  220                 return;
  221             }
  222         rxH = QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Abstract\\b(.+)(?:" + crl + "|<NewLine\\d+>\\s*<NewLine|$)",
  223                       Qt::CaseInsensitive);
  224         rxH.setMinimal(true);
  225         nH = rxH.indexIn(text);
  226         if (nH > -1)
  227         {
  228             const QString val(rxH.cap(1).remove(_leading_non_letters));
  229             _current_reference[_abstract] = _bpP->parse(_abstract, val);
  230             return;
  231         }
  232     }
  233     if (text.contains("Summary", Qt::CaseInsensitive))
  234     {
  235         QRegExp rxH("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Summary[\\s\\.:]*<NewLine\\d+>(.+)(?:<NewLine|$)",
  236                     Qt::CaseInsensitive);
  237         rxH.setMinimal(true);
  238         int nH(rxH.indexIn(text));
  239         if (nH > -1)
  240             if (rxH.cap(1).length() > 93)
  241             {
  242                 const QString val(rxH.cap(1).remove(_leading_non_letters));
  243                 _current_reference[_abstract] = _bpP->parse(_abstract, val);
  244                 return;
  245             }
  246         rxH =
  247             QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Summary[\\s\\.:]*<NewLine\\d+>(.+)(?:<NewLine\\d+>\\s*<NewLine|$)",
  248                     Qt::CaseInsensitive);
  249         rxH.setMinimal(true);
  250         nH = rxH.indexIn(text);
  251         if (nH > -1)
  252         {
  253             const QString val(rxH.cap(1).remove(_leading_non_letters));
  254             _current_reference[_abstract] = _bpP->parse(_abstract, val);
  255             return;
  256         }
  257         rxH = QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Summary\\b(.+)(?:<NewLine|$)", Qt::CaseInsensitive);
  258         rxH.setMinimal(true);
  259         nH = rxH.indexIn(text);
  260         if (nH > -1)
  261             if (rxH.cap(1).length() > 93)
  262             {
  263                 const QString val(rxH.cap(1).remove(_leading_non_letters));
  264                 _current_reference[_abstract] = _bpP->parse(_abstract, val);
  265                 return;
  266             }
  267         rxH =
  268             QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Summary\\b(.+)(?:<NewLine\\d+>\\s*<NewLine|$)", Qt::CaseInsensitive);
  269         rxH.setMinimal(true);
  270         nH = rxH.indexIn(text);
  271         if (nH > -1)
  272         {
  273             const QString val(rxH.cap(1).remove(_leading_non_letters));
  274             _current_reference[_abstract] = _bpP->parse(_abstract, val);
  275             return;
  276         }
  277     }
  278 }
  279 
  280 /** \page heuristics Field Recognition Rules
  281 
  282 - <b>Author</b>
  283   - Check capitalization patterns. See
  284     \htmlonly
  285     <a href="https://arxiv.org/abs/0902.0755" target="_blank">
  286     A Simple Extraction Procedure for Bibliographical Author Field</a>.
  287     \endhtmlonly
  288 
  289 */
  290 void heuristicBibParser::guessAuthor(const QString& tagged_text)
  291 {
  292     // Prepare input stream
  293     QString simplified_text("| " + tagged_text + " |");
  294     simplified_text.replace('|', ' ');
  295     simplified_text = "| " + simplified_text + " |";
  296 
  297     simplified_text.replace(QRegExp("<NewLine\\d+>"), " | ");
  298     simplified_text.replace(QRegExp("(\\w)<Tab\\d+>(\\w)"), "\\1 , \\2");
  299     simplified_text.replace(QRegExp("<Tab\\d+>"), " ");
  300 
  301     simplified_text.remove(QRegExp("author.{0,3}:{0,1}", Qt::CaseInsensitive));
  302     simplified_text.replace(QRegExp("(\\w{4,10})[a-z]\\)", Qt::CaseSensitive),
  303                             "\\1 "); // Remove superscript, e. g. LASTNAMEa
  304     simplified_text.replace(QRegExp("([A-Z]{2,10})[a-z]\\b", Qt::CaseSensitive),
  305                             "\\1 "); // Remove superscript, e. g. LASTNAMEa)
  306     simplified_text.replace(_hyphens, "-");
  307     simplified_text.replace(QChar(183), ',');               // Fancy author separator
  308     simplified_text.replace(' ' + QChar(198) + ' ', " , "); // Fancy author separator
  309     simplified_text.replace(QChar(8226), ',');              // Fancy author separator
  310     simplified_text.replace(QChar(178), ' ');               // Superscript dagger as sometimes translated by pdftotext
  311     _aencoder.aunifier.simplifyString(simplified_text);
  312 
  313     // Capitalize and encode
  314     for (int i = 0; i < _word_prefix_lexicon.count(); ++i)
  315     {
  316         const QString& wp = _word_prefix_lexicon.at(i);
  317         simplified_text.replace(wp, wp, Qt::CaseInsensitive);
  318     }
  319     simplified_text.replace(" by ", " ");
  320     _aencoder.encode(simplified_text);
  321     const QString sb_author(guessAuthor_single_block());
  322     const int n_sb_author(authorCount(sb_author));
  323     const QString mb_author(guessAuthor_multi_block());
  324     const int n_mb_author(authorCount(mb_author));
  325     if (n_mb_author > n_sb_author)
  326         _current_reference[_author] = mb_author;
  327     else
  328         _current_reference[_author] = sb_author;
  329 }
  330 
  331 QString heuristicBibParser::guessAuthor_single_block()
  332 {
  333     QString author;
  334     if (_author_sb_lc.indexIn(_aencoder.code) > -1)
  335         author = _bpP->parse(_author, _aencoder.decoded(_author_sb_lc.pos(0), _author_sb_lc.matchedLength()));
  336     else if (_author_sb_uc.indexIn(_aencoder.code) > -1)
  337         author = _bpP->parse(_author, _aencoder.decoded(_author_sb_uc.pos(0), _author_sb_uc.matchedLength()));
  338     else if (_author_sb.indexIn(_aencoder.code) > -1)
  339         // Few cases are extracted by _author_sb, keep it for cases 'A. Name,', with ',' usually coming
  340         // from a faulty plain text conversion.
  341         author = _bpP->parse(_author, _aencoder.decoded(_author_sb.pos(0), _author_sb.matchedLength()));
  342     return author;
  343 }
  344 
  345 QString heuristicBibParser::guessAuthor_multi_block()
  346 {
  347     QString author;
  348     if (_author_b5_uc_rx->indexIn(_aencoder.code) > -1)
  349         for (int i = 1; i < _author_b5_uc_rx->capturedTexts().count(); ++i)
  350             author = _bpP->parse(
  351                          _addauthors, _aencoder.decoded(_author_b5_uc_rx->pos(i), _author_b5_uc_rx->cap(i).length()), author);
  352     else if (_author_b5_lc_rx->indexIn(_aencoder.code) > -1)
  353         for (int i = 1; i < _author_b5_lc_rx->capturedTexts().count(); ++i)
  354             author = _bpP->parse(
  355                          _addauthors, _aencoder.decoded(_author_b5_lc_rx->pos(i), _author_b5_lc_rx->cap(i).length()), author);
  356     else if (_author_b4_uc_rx->indexIn(_aencoder.code) > -1)
  357         for (int i = 1; i < _author_b4_uc_rx->capturedTexts().count(); ++i)
  358             author = _bpP->parse(
  359                          _addauthors, _aencoder.decoded(_author_b4_uc_rx->pos(i), _author_b4_uc_rx->cap(i).length()), author);
  360     else if (_author_b4_lc_rx->indexIn(_aencoder.code) > -1)
  361         for (int i = 1; i < _author_b4_lc_rx->capturedTexts().count(); ++i)
  362             author = _bpP->parse(
  363                          _addauthors, _aencoder.decoded(_author_b4_lc_rx->pos(i), _author_b4_lc_rx->cap(i).length()), author);
  364     else if (_author_b3_uc_rx->indexIn(_aencoder.code) > -1)
  365         for (int i = 1; i < _author_b3_uc_rx->capturedTexts().count(); ++i)
  366             author = _bpP->parse(
  367                          _addauthors, _aencoder.decoded(_author_b3_uc_rx->pos(i), _author_b3_uc_rx->cap(i).length()), author);
  368     else if (_author_b3_lc_rx->indexIn(_aencoder.code) > -1)
  369         for (int i = 1; i < _author_b3_lc_rx->capturedTexts().count(); ++i)
  370             author = _bpP->parse(
  371                          _addauthors, _aencoder.decoded(_author_b3_lc_rx->pos(i), _author_b3_lc_rx->cap(i).length()), author);
  372     else if (_author_b2_uc_rx->indexIn(_aencoder.code) > -1)
  373         for (int i = 1; i < _author_b2_uc_rx->capturedTexts().count(); ++i)
  374             author = _bpP->parse(
  375                          _addauthors, _aencoder.decoded(_author_b2_uc_rx->pos(i), _author_b2_uc_rx->cap(i).length()), author);
  376     else if (_author_b2_lc_rx->indexIn(_aencoder.code) > -1)
  377         for (int i = 1; i < _author_b2_lc_rx->capturedTexts().count(); ++i)
  378             author = _bpP->parse(
  379                          _addauthors, _aencoder.decoded(_author_b2_lc_rx->pos(i), _author_b2_lc_rx->cap(i).length()), author);
  380     return author;
  381 }
  382 
  383 int heuristicBibParser::authorCount(const QString& authors)
  384 {
  385     if (authors.isEmpty())
  386         return 0;
  387     return 1 + authors.count(" and ");
  388 }
  389 
  390 /** \page heuristics
  391 
  392 - <b>Keywords</b>
  393   - If <tt>Key\\s{0,1}words\b</tt> is found.
  394 
  395 */
  396 void heuristicBibParser::guessKeywords(const QString& text)
  397 {
  398     QRegExp rxH("<(?:NewLine|Tab)\\d+>\\s*Key\\s{0,1}words and phrases[\\s\\.:]*<NewLine\\d+>(.+)(<NewLine|$)",
  399                 Qt::CaseInsensitive);
  400     rxH.setMinimal(true);
  401     int nH(rxH.indexIn(text));
  402     if (nH > -1)
  403     {
  404         const QString val(rxH.cap(1).remove(_leading_non_letters));
  405         _current_reference[_keywords] = _bpP->parse(_keywords, val);
  406         return;
  407     }
  408     rxH = QRegExp("<(?:NewLine|Tab)\\d+>\\s*Key\\s{0,1}words and phrases\\b(.+)(<NewLine|$)", Qt::CaseInsensitive);
  409     rxH.setMinimal(true);
  410     nH = rxH.indexIn(text);
  411     if (nH > -1)
  412     {
  413         const QString val(rxH.cap(1).remove(_leading_non_letters));
  414         _current_reference[_keywords] = _bpP->parse(_keywords, val);
  415     }
  416 
  417     rxH =
  418         QRegExp("<(?:NewLine|Tab)\\d+>\\s*Key\\s{0,1}words[\\s\\.:]*<NewLine\\d+>(.+)(<NewLine|$)", Qt::CaseInsensitive);
  419     rxH.setMinimal(true);
  420     nH = rxH.indexIn(text);
  421     if (nH > -1)
  422     {
  423         const QString val(rxH.cap(1).remove(_leading_non_letters));
  424         _current_reference[_keywords] = _bpP->parse(_keywords, val);
  425         return;
  426     }
  427     rxH = QRegExp("<(?:NewLine|Tab)\\d+>\\s*Key\\s{0,1}words\\b(.+)(<NewLine|$)", Qt::CaseInsensitive);
  428     rxH.setMinimal(true);
  429     nH = rxH.indexIn(text);
  430     if (nH > -1)
  431     {
  432         const QString val(rxH.cap(1).remove(_leading_non_letters));
  433         _current_reference[_keywords] = _bpP->parse(_keywords, val);
  434     }
  435 }
  436 
  437 /** \page heuristics
  438 
  439 - <b>Volume</b>
  440   - If <tt>Volume:{0,1}</tt> is found.
  441   - If <tt>Vol.{0,1}</tt> is found.
  442   - If <tt>\\b(\\d+)[,:]\\s*\\d+\\W+\\d+</tt> is found.
  443   - If <tt>\\b(\\d+)\\s*\\(\\d+\\)</tt> is found.
  444   - If <tt>\\b(\\d+)[,:]\\s*\\d+\\b</tt> is found.
  445 
  446 */
  447 void heuristicBibParser::guessVolume(const QString& text)
  448 {
  449     _reliable_volume = true;
  450     QRegExp rxH("Volumes{0,1}:{0,1}\\s*(" + _hyphen_nums + ')', Qt::CaseInsensitive);
  451     int nH(rxH.indexIn(text));
  452     if (nH > -1)
  453     {
  454         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  455         return;
  456     }
  457     rxH = QRegExp("Vols{0,1}\\.{0,1}\\s*(" + _hyphen_nums + ')', Qt::CaseInsensitive);
  458     nH = rxH.indexIn(text);
  459     if (nH > -1)
  460     {
  461         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  462         return;
  463     }
  464     rxH = QRegExp("\\b(\\d+)\\s*\\(" + _hyphen_nums + "\\)[,:]\\s*pp\\.{0,1}\\s*\\d+", Qt::CaseInsensitive);
  465     nH = rxH.indexIn(text);
  466     if (nH > -1)
  467     {
  468         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  469         return;
  470     }
  471     rxH = QRegExp("\\b(\\d+)[,:]\\s*pp\\.{0,1}\\s*\\d+", Qt::CaseInsensitive);
  472     nH = rxH.indexIn(text);
  473     if (nH > -1)
  474     {
  475         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  476         return;
  477     }
  478     _reliable_volume = false;
  479     rxH = QRegExp("(\\d+)\\s*\\(" + _hyphen_nums + "\\)[,:]\\s*\\d+", Qt::CaseInsensitive);
  480     nH = rxH.indexIn(text);
  481     if (nH > -1)
  482     {
  483         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  484         return;
  485     }
  486     rxH = QRegExp("\\b(\\d+)\\s*\\(\\d+\\)");
  487     nH = rxH.indexIn(text);
  488     if (nH > -1)
  489     {
  490         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  491         return;
  492     }
  493     rxH = QRegExp("\\b(\\d+)[,:]\\s*" + _hyphen_nums, Qt::CaseInsensitive);
  494     nH = rxH.indexIn(text);
  495     if (nH > -1)
  496     {
  497         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  498         return;
  499     }
  500     rxH = QRegExp("\\b(\\d+)[,:]\\s*\\d+\\b");
  501     nH = rxH.indexIn(text);
  502     if (nH > -1)
  503         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  504 }
  505 
  506 /** \page heuristics
  507 
  508 - <b>Number</b>
  509   - If <tt>Numbers{0,1}:{0,1}\\s*([\\d-]+)</tt> is found.
  510   - If <tt>No\\.{0,1}\\s*(\\d+)</tt> is found.
  511   - If <tt>Issue\\:{0,1}\\s*(\\d+)</tt> is found.
  512   - If <tt>\\d\\s*\\((\\d+)\\)[^\\.]</tt> is found.
  513 
  514 */
  515 void heuristicBibParser::guessNumber(const QString& text)
  516 {
  517     _reliable_number = true;
  518     QRegExp rxH("Numbers{0,1}\\:{0,1}\\s*(" + _hyphen_nums + ')', Qt::CaseInsensitive);
  519     int nH(rxH.indexIn(text));
  520     if (nH > -1)
  521     {
  522         _current_reference[_number] = _bpP->parse(_number, rxH.cap(1));
  523         return;
  524     }
  525     rxH = QRegExp("Nos{0,1}\\.{0,1}\\s*(" + _hyphen_nums + ')', Qt::CaseInsensitive);
  526     nH = rxH.indexIn(text);
  527     if (nH > -1)
  528     {
  529         _current_reference[_number] = _bpP->parse(_number, rxH.cap(1));
  530         return;
  531     }
  532     rxH = QRegExp("Issues{0,1}\\:{0,1}\\s*(" + _hyphen_nums + ')', Qt::CaseInsensitive);
  533     nH = rxH.indexIn(text);
  534     if (nH > -1)
  535     {
  536         _current_reference[_number] = _bpP->parse(_number, rxH.cap(1));
  537         return;
  538     }
  539     _reliable_number = false;
  540     rxH = QRegExp("\\d\\s*\\((" + _hyphen_nums + ")\\)[^\\.]");
  541     nH = rxH.indexIn(text);
  542     if (nH > -1)
  543         if (rxH.cap(1) != _current_reference.value(_year)) // Avoid confusing (number) and (year)
  544             _current_reference[_number] = _bpP->parse(_number, rxH.cap(1));
  545 }
  546 
  547 /** \page heuristics
  548 
  549 - <b>Pages</b>
  550   - If <tt>\\bPages{0,1}[:\\.]{0,1}([\\d\\s-]+)</tt> is found.
  551   - If <tt>\\bp{1,2}\\.{0,1}\\s+(\\d+)</tt> is found.
  552   - If <tt>\\b(\\d+)\\s*-{1,2}\\s*(\\d+pp)\\b</tt> is found.
  553   - If <tt>\\b(\\d+)\\s*-{1,2}\\s*(\\d+)\\b</tt> is found.
  554 
  555 */
  556 void heuristicBibParser::guessPages(const QString& text)
  557 {
  558     _reliable_pages = true;
  559     QRegExp rxH("\\bPages{0,1}[:\\.]{0,1}\\s*((?!\\()" + _hyphen_nums + "(?!\\)))", Qt::CaseInsensitive);
  560     int nH(rxH.indexIn(text));
  561     if (nH > -1)
  562     {
  563         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(1));
  564         return;
  565     }
  566     rxH = QRegExp("(?!\\()(\\d+[\\s-]\\d+pp)(?!\\))", Qt::CaseInsensitive);
  567     nH = rxH.indexIn(text);
  568     if (nH > -1)
  569     {
  570         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(1));
  571         return;
  572     }
  573     rxH = QRegExp("\\bpp\\.{0,1}\\s+(" + _hyphen_nums + ')', Qt::CaseInsensitive);
  574     nH = rxH.indexIn(text);
  575     if (nH > -1)
  576     {
  577         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(1));
  578         return;
  579     }
  580     _reliable_pages = false;
  581     rxH = QRegExp("\\bp\\.{0,1}\\s+(" + _hyphen_nums + ')');
  582     nH = rxH.indexIn(text);
  583     if (nH > -1)
  584     {
  585         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(1));
  586         return;
  587     }
  588     rxH = QRegExp("\\d+\\s*\\(" + _hyphen_nums + "\\),{0,1}\\s*(" + _hyphen_nums + ')');
  589     nH = rxH.indexIn(text); // 120 (1-3), 927
  590     if (nH > -1)
  591     {
  592         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(1));
  593         return;
  594     }
  595     rxH = QRegExp("(?!\\()(\\d+)\\s*-{1,2}\\s*(\\d+)\\b(?!\\))");
  596     nH = 0;
  597     while (nH >= 0)
  598     {
  599         nH = rxH.indexIn(text, nH);
  600         if (nH > -1)
  601         {
  602             if (!rxH.cap(1).startsWith('0'))
  603             {
  604                 const QString pp(_bpP->parse(_pages, QString("%1 %2").arg(rxH.cap(1), rxH.cap(2))));
  605                 const QStringList flpp(pp.split(QRegExp("\\D"), QString::SkipEmptyParts));
  606                 if (flpp.count() != 2)
  607                     continue;
  608                 const int fp(flpp.first().toInt());
  609                 const int lp(flpp.last().toInt());
  610                 if (fp < lp && lp - fp < 250)
  611                 {
  612                     _current_reference[_pages] = pp;
  613                     return;
  614                 }
  615             }
  616             nH += rxH.matchedLength();
  617         }
  618     }
  619 }
  620 
  621 /** \page heuristics
  622 
  623 - <b>Year</b>
  624   - If <tt>\\b(19|20)(\\d\\d)\\b</tt> is found.
  625 
  626 */
  627 void heuristicBibParser::guessYear(const QString& text)
  628 {
  629     QRegExp rxH("\\((19|20)(\\d\\d)\\)");
  630     int nH(rxH.indexIn(text));
  631     if (nH > -1)
  632     {
  633         const QString y(_bpP->parse(_year, rxH.cap(1) + rxH.cap(2)));
  634         if (y.toInt() < _max_year)
  635         {
  636             _current_reference[_year] = y;
  637             return;
  638         }
  639     }
  640     rxH = QRegExp("\\d+:" + _hyphen_nums + "[,\\s]+(19|20)(\\d\\d)\\b");
  641     nH = rxH.indexIn(text); // 44:2077 – 2082, 2004.
  642     if (nH > -1)
  643     {
  644         const QString y(_bpP->parse(_year, rxH.cap(1) + rxH.cap(2)));
  645         if (y.toInt() < _max_year)
  646         {
  647             _current_reference[_year] = y;
  648             return;
  649         }
  650     }
  651     rxH = QRegExp("\\b(19|20)(\\d\\d)\\b");
  652     nH = rxH.indexIn(text);
  653     if (nH > -1)
  654     {
  655         const QString y(_bpP->parse(_year, rxH.cap(1) + rxH.cap(2)));
  656         if (y.toInt() < _max_year)
  657         {
  658             _current_reference[_year] = y;
  659             return;
  660         }
  661     }
  662 }
  663 
  664 /** \page heuristics
  665 
  666 - <b>Title</b>
  667   - If <tt>\\bTitle:{0,1}</tt> is found.
  668 
  669 */
  670 void heuristicBibParser::guessTitle(const QString& text)
  671 {
  672     QRegExp rxH("\\bTitle:{0,1}\\s*<NewLine\\d+>(.+)(<NewLine|$)", Qt::CaseInsensitive);
  673     rxH.setMinimal(true);
  674     int nH(rxH.indexIn(text));
  675     if (nH > -1)
  676     {
  677         QString val(rxH.cap(1).remove(_leading_non_letters));
  678         _current_reference[_title] = _bpP->parse(_title, val);
  679         return;
  680     }
  681     rxH = QRegExp("\\bTitle:{0,1}(.+)(<NewLine|$)", Qt::CaseInsensitive);
  682     rxH.setMinimal(true);
  683     nH = rxH.indexIn(text);
  684     if (nH > -1)
  685     {
  686         QString val(rxH.cap(1).remove(_leading_non_letters));
  687         _current_reference[_title] = _bpP->parse(_title, val);
  688     }
  689 }
  690 
  691 /** \page heuristics
  692 
  693 - <b>ISBN</b>
  694   - If <tt>\\bISBN\\b(?:-\\d+){0,1}:{0,1}(?:-\\d+){0,1}\\s*(\\d+-[\\d-]+-\\d+)</tt> is found.
  695   - If <tt>\\bISBN\\b(?:-\\d+){0,1}:{0,1}(?:-\\d+){0,1}\\s*(\\d+)</tt> is found.
  696 
  697 */
  698 void heuristicBibParser::guessISBN(const QString& text)
  699 {
  700     QRegExp rxH("\\bISBN\\b(?:[ -]\\d+){0,1}:{0,1}(?:-\\d+){0,1}\\s*(\\d+-[\\d-]+-\\d+)", Qt::CaseInsensitive);
  701     int nH(rxH.indexIn(text));
  702     if (nH > -1)
  703     {
  704         _current_reference[_isbn] = _bpP->parse(_isbn, rxH.cap(1));
  705         return;
  706     }
  707     rxH = QRegExp("\\bISBN\\b(?:[ -]\\d+){0,1}:{0,1}(?:-\\d+){0,1}\\s*(\\d+)", Qt::CaseInsensitive);
  708     nH = rxH.indexIn(text);
  709     if (nH > -1)
  710     {
  711         _current_reference[_isbn] = _bpP->parse(_isbn, rxH.cap(1));
  712         return;
  713     }
  714 }
  715 
  716 /** \page heuristics
  717 
  718 - <b>Journal</b>
  719   - Check cb2Bib internal database.
  720 
  721 */
  722 void heuristicBibParser::guessJournal(const QString& text)
  723 {
  724     QString stext(text.toLower());
  725     stext.remove(c2bUtils::nonLetter);
  726     if (stext.length() < 3 || stext.length() > 10000)
  727         return;
  728 
  729     const int pheader(0);
  730     const int pfooter(stext.length());
  731     double oscore(0);
  732     int oindex(-1);
  733 
  734     QStringMatcher blank(" ");
  735     for (int i = 0; i < _journal_db.count(); ++i)
  736         if (blank.indexIn(_journal_db.retrieveFull(i)) >= 0)
  737         {
  738             const int p(stext.indexOf(_journal_db.fullsimplifiedFull(i), 0, Qt::CaseSensitive));
  739             if (p == -1)
  740                 continue;
  741             const int jlength(_journal_db.fullsimplifiedFull(i).length());
  742             const int d(1 + std::min(p - pheader, pfooter - p - jlength));
  743             const double score(pow(double(jlength), 1.75) / double(d));
  744             if (score > oscore)
  745             {
  746                 oscore = score;
  747                 oindex = i;
  748             }
  749         }
  750     for (int i = 0; i < _journal_db.count(); ++i)
  751     {
  752         const int p(stext.indexOf(_journal_db.fullsimplified(i), 0, Qt::CaseSensitive));
  753         if (p == -1)
  754             continue;
  755         const int jlength(_journal_db.fullsimplified(i).length());
  756         const int d(1 + std::min(p - pheader, pfooter - p - jlength));
  757         const double score(pow(double(jlength), 1.75) / double(d));
  758         if (score > oscore)
  759         {
  760             oscore = score;
  761             oindex = i;
  762         }
  763     }
  764     if (oscore > double(1) && oindex > -1)
  765         _current_reference["journal"] = _bpP->parse("journal", _journal_db.retrieve(oindex));
  766 }
  767 
  768 void heuristicBibParser::guessFromMetadata(const QString& text)
  769 {
  770     if (!text.contains("[Bibliographic Metadata"))
  771         return;
  772     QRegExp bf;
  773     bf.setMinimal(true);
  774     bf.setCaseSensitivity(Qt::CaseSensitive);
  775     QString p("<%1>(.+)</%1>");
  776     for (int i = 0; i < _bibliographic_fields.count(); ++i)
  777     {
  778         bf.setPattern(p.arg(_bibliographic_fields.at(i)));
  779         if (bf.indexIn(text) > -1)
  780             _current_reference[_bibliographic_fields.at(i)] = _bpP->parse(_bibliographic_fields.at(i), bf.cap(1));
  781     }
  782 }
  783 
  784 void heuristicBibParser::guessVolumePagesYear(const QString& text)
  785 {
  786     // Does several volume pages year formats
  787     if (_reliable_pages && _reliable_volume && _reliable_number)
  788         return;
  789     // J. Sci., 108 (15), 3206, 2004
  790     // J. Sci., 108 (15), 3206 2004
  791     QRegExp rxH("(\\d+)\\s*\\((" + _hyphen_nums + ")\\)\\s*[,:]\\s*(" + _hyphen_pages + ")[,\\s]+(19|20)(\\d\\d)");
  792     int nH(rxH.indexIn(text));
  793     if (nH > -1)
  794     {
  795         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  796         _current_reference[_number] = _bpP->parse(_number, rxH.cap(2));
  797         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(3));
  798         _current_reference[_year] = _bpP->parse(_year, rxH.cap(4) + rxH.cap(5));
  799         return;
  800     }
  801     // J. Sci., 108 (15), 3206 (2004)
  802     rxH =
  803         QRegExp("(\\d+)\\s*\\((" + _hyphen_nums + ")\\)\\s*[,:]\\s*(" + _hyphen_pages + ")[,\\s]*\\((19|20)(\\d\\d)\\)");
  804     nH = rxH.indexIn(text);
  805     if (nH > -1)
  806     {
  807         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  808         _current_reference[_number] = _bpP->parse(_number, rxH.cap(2));
  809         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(3));
  810         _current_reference[_year] = _bpP->parse(_year, rxH.cap(4) + rxH.cap(5));
  811         return;
  812     }
  813     if (_reliable_pages && _reliable_volume)
  814         return;
  815     // J. Sci. 124, 204109 2006
  816     // J. Sci. 124, 204109, 2006
  817     rxH = QRegExp("(\\d+)[,:]\\s*(" + _hyphen_pages + ")[,\\s]+(19|20)(\\d\\d)");
  818     nH = rxH.indexIn(text);
  819     if (nH > -1)
  820     {
  821         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  822         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(2));
  823         _current_reference[_year] = _bpP->parse(_year, rxH.cap(3) + rxH.cap(4));
  824         return;
  825     }
  826     // 120, 8425 - 8433 (2004)
  827     // J. Sci. 30, 2745 (1984)
  828     rxH = QRegExp("(\\d+)[,:]\\s*(" + _hyphen_pages + ")[,\\s]*\\((19|20)(\\d\\d)\\)");
  829     nH = rxH.indexIn(text);
  830     if (nH > -1)
  831     {
  832         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  833         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(2));
  834         _current_reference[_year] = _bpP->parse(_year, rxH.cap(3) + rxH.cap(4));
  835         return;
  836     }
  837 }
  838 
  839 void heuristicBibParser::guessVolumeYearPages(const QString& text)
  840 {
  841     // Does several volume year pages formats
  842     if (_reliable_number && _reliable_volume)
  843         return;
  844     // J. Sci. 203 (2003) 209.
  845     QRegExp rxH("(\\d+)\\s*\\(" + _current_reference.value(_year) + "\\)\\s*(" + _hyphen_pages + ')');
  846     int nH(rxH.indexIn(text));
  847     if (nH > -1)
  848     {
  849         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  850         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(2));
  851         return;
  852     }
  853 }
  854 
  855 void heuristicBibParser::guessYearVolumePages(const QString& text)
  856 {
  857     // Does several year volume pages formats
  858     if (_reliable_pages && _reliable_volume && _reliable_number)
  859         return;
  860     // J. Sci. 1995 January 25; 247(4):536-40.
  861     // J. Sci. 1995, 247(4):536-40.
  862     QRegExp rxH(_current_reference.value(_year) + "[\\w ]{0,15}[,:; ]\\s*(\\d+)\\s*\\((" + _hyphen_nums +
  863                 ")\\)\\s*[,:;]\\s*(" + _hyphen_pages + ')');
  864     int nH(rxH.indexIn(text));
  865     if (nH > -1)
  866     {
  867         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  868         _current_reference[_number] = _bpP->parse(_number, rxH.cap(2));
  869         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(3));
  870         _reliable_pages = true;
  871         _reliable_volume = true;
  872         return;
  873     }
  874     // J. Sci. (1999), 86, 3, pp. 635-648
  875     rxH = QRegExp("\\(" + _current_reference.value(_year) + "\\)" + "\\s*[,:;]\\s*(\\d+)\\s*[,:;]\\s*(" + _hyphen_nums +
  876                   ")\\s*[,:;]\\s*(?:pp)?\\.?\\s*(" + _hyphen_pages + ')');
  877     nH = rxH.indexIn(text);
  878     if (nH > -1)
  879     {
  880         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  881         _current_reference[_number] = _bpP->parse(_number, rxH.cap(2));
  882         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(3));
  883         _reliable_pages = true;
  884         _reliable_volume = true;
  885         return;
  886     }
  887     if (_reliable_volume)
  888         return;
  889     // J. Sci. 1995 January 25; 247:536-40.
  890     // J. Sci. 2005, 103, 818
  891     // J. Sci. 2002;9:101–106.5.
  892     rxH =
  893         QRegExp(_current_reference.value(_year) + "\\s*[\\w ]{0,15}[,:;]\\s*(\\d+)\\s*[,:;]\\s*(" + _hyphen_pages + ')');
  894     nH = rxH.indexIn(text);
  895     if (nH > -1)
  896     {
  897         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  898         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(2));
  899         _reliable_pages = true;
  900         _reliable_volume = true;
  901         return;
  902     }
  903     // J. Sci. 2005 103:818
  904     rxH = QRegExp(_current_reference.value(_year) + "\\s+(\\d+)\\s*:\\s*(" + _hyphen_pages + ')');
  905     nH = rxH.indexIn(text);
  906     if (nH > -1)
  907     {
  908         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  909         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(2));
  910         return;
  911     }
  912     // (2006) J. Sci. 39:3047
  913     rxH = QRegExp("\\(" + _current_reference.value(_year) + "\\)\\D{5,30}(\\d+)\\s*[,:;]\\s*(" + _hyphen_pages + ')');
  914     nH = rxH.indexIn(text);
  915     if (nH > -1)
  916     {
  917         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  918         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(2));
  919         return;
  920     }
  921     // 2006 J. Sci. 39 3047
  922     rxH = QRegExp(_current_reference.value(_year) + "\\D{5,30}(\\d+)\\s*[,:; ]\\s*(" + _hyphen_pages + ')');
  923     nH = rxH.indexIn(text);
  924     if (nH > -1)
  925     {
  926         _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
  927         _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(2));
  928         return;
  929     }
  930 }