"Fossies" - the Fresh Open Source Software Archive

Member "cb2bib-2.0.1/src/c2b/networkQuery.cpp" (12 Feb 2021, 16840 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "networkQuery.cpp", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.0.0_vs_2.0.1.

    1 /***************************************************************************
    2  *   Copyright (C) 2004-2021 by Pere Constans
    3  *   constans@molspaces.com
    4  *   cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
    5  *   See the LICENSE file that comes with this distribution.
    6  ***************************************************************************/
    7 #include "networkQuery.h"
    8 
    9 #include "bibParser.h"
   10 #include "cb2bib_utilities.h"
   11 #include "network.h"
   12 #include "settings.h"
   13 
   14 #include <QTimer>
   15 
   16 
   17 networkQuery::networkQuery(bibParser* bp, QObject* parento) : QObject(parento), _bpP(bp)
   18 {
   19     Q_ASSERT_X(_bpP, "networkQuery", "bibParser was not instantiated");
   20     _networkP = new network(this);
   21     init();
   22 }
   23 
   24 networkQuery::networkQuery(bibParser* bp, network* net, QObject* parento) : QObject(parento), _bpP(bp), _networkP(net)
   25 {
   26     Q_ASSERT_X(_bpP, "networkQuery", "bibParser was not instantiated");
   27     Q_ASSERT_X(_networkP, "networkQuery", "network was not instantiated");
   28     init();
   29 }
   30 
   31 
   32 void networkQuery::init()
   33 {
   34     _settingsP = settings::instance();
   35     _settingsP->setValue("networkQuery/isSupervised", true);
   36     _networkquery_tmp_fn1 = _settingsP->tempPath() + "/cb2bib_query_tmp_html1_" + _settingsP->applicationPid();
   37     _networkquery_tmp_fn2 = _settingsP->tempPath() + "/cb2bib_query_tmp_html2_" + _settingsP->applicationPid();
   38     _timing = QDateTime::currentDateTime();
   39 }
   40 
   41 void networkQuery::submitQuery(const bibReference& reference, const QString& raw_reference, const bool check_document)
   42 {
   43     // Submission Initialization
   44     _error_string.clear();
   45     _query_parameter_count.clear();
   46     _is_end_of_file = false;
   47     _pdfurl_is_captured = false;
   48     _check_document = check_document && _settingsP->value("cb2Bib/AutomaticPdfDownload").toBool();
   49 
   50     // Query data
   51     _Qtitle = reference.value("title");
   52     _Qjournal = _bpP->fullJournal(reference.value("journal"));
   53     _Qvolume = reference.value("volume");
   54     _Qpage = c2bUtils::firstPage(reference.value("pages"));
   55     _Qdoi = reference.value("doi").trimmed();
   56     if (!_Qdoi.isEmpty() && !_Qdoi.contains(QRegExp("^10\\.[\\d\\.]+/\\S+$")))
   57     {
   58         _Qdoi.clear();
   59         c2bUtils::warn(tr("Warning: DOI skipped: '%1' is not a valid DOI").arg(_Qdoi));
   60     }
   61     _Qauthor = reference.value("author");
   62     _Qauthor.replace(QRegExp("(?:\\b\\w\\b|\\band\\b|\\W)"), " ");
   63     _Qauthor = c2bUtils::simplifyString(_Qauthor);
   64     _Qexcerpt = _bpP->excerpt(raw_reference, QStringList() << _Qdoi << _Qauthor << _Qtitle);
   65     _Qeprint = reference.value("eprint").trimmed();
   66 
   67     _raw_reference = raw_reference;
   68     // Remove raw metadata in case user repeats the network query
   69     _raw_reference.remove(QRegExp("\\n\\[Raw Metadata.+$"));
   70 
   71     if (_timing.msecsTo(QDateTime::currentDateTime()) < 550)
   72         QTimer::singleShot(550, this, SLOT(submitQuery1()));
   73     else
   74         submitQuery1();
   75 }
   76 
   77 void networkQuery::submitQuery1()
   78 {
   79     _raw_metadata.clear();
   80 
   81     // Submission, first step, setting journal codes
   82     if (!setQueryParameters())
   83     {
   84         _error_string = tr("No data for query.");
   85         emit queryEnded(false, _targetQ, _networkquery_tmp_fn1);
   86         return;
   87     }
   88     if (_is_end_of_file)
   89     {
   90         _error_string = tr("Performed %1 queries: No reference found.").arg(_query_parameter_count.count());
   91         emit queryEnded(false, _targetQ, _networkquery_tmp_fn1);
   92         return;
   93     }
   94 
   95     c2bUtils::debug(tr("Query Number = %1").arg(_query_parameter_count.count()));
   96     c2bUtils::debug(tr("targetQ[%1]").arg(_targetQ));
   97     c2bUtils::debug(tr("captionQ[%1]").arg(_captionQ));
   98     c2bUtils::debug(tr("referenceurl_prefix[%1]").arg(_referenceurl_prefix));
   99     c2bUtils::debug(tr("referenceurl_sufix[%1]").arg(_referenceurl_sufix));
  100     c2bUtils::debug(tr("pdfurl_prefix[%1]").arg(_pdfurl_prefix));
  101     c2bUtils::debug(tr("pdfurl_sufix[%1]").arg(_pdfurl_sufix));
  102     c2bUtils::debug(tr("action[%1]").arg(_action));
  103     c2bUtils::debug(tr("POST1[%1]").arg(_targetQ));
  104 
  105     if (_action == "browse_query")
  106     {
  107         if (openFile(encodeUrl(_targetQ)))
  108         {
  109             _error_string = tr("Browsing query.");
  110             emit queryEnded(true, QString(), QString());
  111         }
  112         else
  113         {
  114             _error_string = tr("Could not open URL '%1'.").arg(encodeUrl(_targetQ));
  115             emit queryEnded(false, QString(), QString());
  116         }
  117         return;
  118     }
  119 
  120     _timing = QDateTime::currentDateTime();
  121     if (_action == "htm2txt_query")
  122         emit statusMessage(tr("Importing: %1.").arg(_targetQ));
  123     else
  124         emit statusMessage(tr("Query: %1.").arg(_targetQ));
  125     _networkP->getFile(_targetQ, _networkquery_tmp_fn1, network::Copy, this, SLOT(submitQuery2(bool)),
  126                        !_settingsP->value("cb2Bib/KeepTmpNQFiles").toBool());
  127 }
  128 
  129 void networkQuery::submitQuery2(bool succeeded)
  130 {
  131     // Submission, second part: check query replay and PDF existence
  132     if (!succeeded)
  133     {
  134         _error_string = _networkP->errorString();
  135         emit statusMessage(tr("Query failed with %1.").arg(_error_string));
  136         QTimer::singleShot(10, this, SLOT(submitQuery1()));
  137         return;
  138     }
  139 
  140     QString lines(c2bUtils::fileToString(_networkquery_tmp_fn1, !_settingsP->value("cb2Bib/KeepTmpNQFiles").toBool()));
  141 
  142     // For the single query cases with no htm2txt_query and referenceurl_prefix use
  143     // non empty capture_from_query to check for result availability
  144     QString captured;
  145     if (!_captionQ.isEmpty())
  146     {
  147         QRegExp rx(_captionQ);
  148         rx.setMinimal(true);
  149         if (!rx.isValid())
  150             c2bUtils::warn(tr("Warning: RegExp '%1' is not valid").arg(_captionQ));
  151         const int ncap(rx.indexIn(lines));
  152         if (ncap == -1)
  153         {
  154             QTimer::singleShot(10, this, SLOT(submitQuery1()));
  155             return;
  156         }
  157         captured = fromHtmlString(rx.cap(1));
  158         c2bUtils::debug(tr("CAPTURED[%1]").arg(captured));
  159     }
  160     if (_action == "htm2txt_query")
  161     {
  162         _error_string = tr("Importing query URL.");
  163         emit queryEnded(true, QString(), fromHtmlString(lines, true));
  164         return;
  165     }
  166     if (_action == "merge_all_metadata")
  167         _raw_metadata = _raw_reference + c2bUtils::metadatasection.arg(lines.trimmed());
  168     else if (_action == "merge_referenceurl_metadata")
  169         _raw_metadata = _raw_reference;
  170     else
  171         _raw_metadata = c2bUtils::metadatasection.arg(lines.trimmed());
  172     if (_referenceurl_prefix.isEmpty() && _pdfurl_prefix.isEmpty())
  173     {
  174         _error_string = tr("Importing query URL.");
  175         emit queryEnded(true, QString(), _raw_metadata);
  176         return;
  177     }
  178     if (captured.isEmpty())
  179     {
  180         QTimer::singleShot(10, this, SLOT(submitQuery1()));
  181         return;
  182     }
  183     if (_referenceurl_prefix.isEmpty())
  184         _targetBib.clear();
  185     else
  186         _targetBib = _referenceurl_prefix + captured + _referenceurl_sufix;
  187     if (_check_document && !_pdfurl_prefix.isEmpty())
  188     {
  189         if (_pdfurl_is_captured)
  190             _targetPDF = _pdfurl_prefix + _pdfurl_sufix;
  191         else
  192             _targetPDF = _pdfurl_prefix + captured + _pdfurl_sufix;
  193         emit statusMessage(tr("Checking: %1").arg(_targetPDF));
  194         _networkP->headFile(_targetPDF, this, SLOT(submitQuery3(bool)));
  195     }
  196     else
  197         submitQuery3(false);
  198 }
  199 
  200 void networkQuery::submitQuery3(bool succeeded)
  201 {
  202     // Submission, third part: extracting reference location
  203     if (!succeeded || !_networkP->mimetypeString().contains(QRegExp("\\b(chm|djvu|pdf|ps)\\b")))
  204         _targetPDF.clear();
  205     else
  206         _targetPDF = _networkP->sourceFilename();
  207 
  208     if (_referenceurl_prefix.isEmpty())
  209     {
  210         emit queryEnded(true, _targetPDF, _raw_metadata);
  211         return;
  212     }
  213 
  214     c2bUtils::debug(tr("POST2[%1]").arg(_targetBib));
  215     c2bUtils::debug(tr("POST3[%1]").arg(_targetPDF));
  216 
  217     if (_action == "browse_referenceurl")
  218     {
  219         if (openFile(encodeUrl(_targetBib)))
  220         {
  221             _error_string = tr("Browsing reference.");
  222             emit queryEnded(true, QString(), QString());
  223         }
  224         else
  225         {
  226             _error_string = tr("Could not open URL '%1'.").arg(encodeUrl(_targetBib));
  227             emit queryEnded(false, QString(), QString());
  228         }
  229         return;
  230     }
  231     if (_action == "htm2txt_referenceurl")
  232         emit statusMessage(tr("Importing: %1.").arg(_targetBib));
  233     else
  234         emit statusMessage(tr("Retrieving: %1.").arg(_targetBib));
  235     _networkP->getFile(_targetBib, _networkquery_tmp_fn2, network::Copy, this, SLOT(queryDone(bool)),
  236                        !_settingsP->value("cb2Bib/KeepTmpNQFiles").toBool());
  237 }
  238 
  239 void networkQuery::queryDone(bool succeeded)
  240 {
  241     // Submission Done
  242     if (!succeeded)
  243     {
  244         QTimer::singleShot(10, this, SLOT(submitQuery1()));
  245         return;
  246     }
  247     QString lines(c2bUtils::fileToString(_networkquery_tmp_fn2, !_settingsP->value("cb2Bib/KeepTmpNQFiles").toBool()));
  248     if (_action == "htm2txt_referenceurl")
  249     {
  250         _error_string = tr("Importing reference URL.");
  251         emit queryEnded(true, _targetPDF, fromHtmlString(lines, true));
  252         return;
  253     }
  254     if (_action == "merge_all_metadata" || _action == "merge_referenceurl_metadata")
  255         _raw_metadata += c2bUtils::metadatasection.arg(lines.trimmed());
  256     else
  257         _raw_metadata = c2bUtils::metadatasection.arg(lines.trimmed());
  258     emit queryEnded(true, _targetPDF, _raw_metadata);
  259 }
  260 
  261 bool networkQuery::setQueryParameters()
  262 {
  263     if (!checkQueryFile(_settingsP->fileName("cb2Bib/NetworkFile")))
  264         return false;
  265 
  266     QFile file(_settingsP->fileName("cb2Bib/NetworkFile"));
  267     file.open(QIODevice::ReadOnly | QIODevice::Text);
  268     QTextStream stream(&file);
  269     stream.setCodec("UTF-8");
  270     stream.setAutoDetectUnicode(true);
  271     QRegExp Journal("journal=" + _Qjournal + "\\|");
  272     QRegExp AnyJournal("journal=\\s*$");
  273     uint readQueryParams(0);
  274     _is_end_of_file = false;
  275     QString line;
  276     while (!stream.atEnd())
  277     {
  278         line = stream.readLine();
  279         if (line.startsWith("%c2b_stop_parsing"))
  280             break;
  281         // Skip comments and blanks
  282         if (!(line.isEmpty() || line.contains(QRegExp("^#"))))
  283         {
  284             if (line.contains(Journal))
  285             {
  286                 const QStringList lc(line.split('|'));
  287                 if (lc.count() > 1)
  288                     _QjournalCode = lc.at(1);
  289                 else
  290                     _QjournalCode.clear();
  291             }
  292             else if (line.contains(AnyJournal))
  293             {
  294                 _QjournalCode = _Qjournal;
  295                 _QjournalCode.replace(" & ", " and "); // Avoid sending '&' to confuse URLs
  296                 _QjournalCode.replace(QRegExp("\\W"), " ");
  297                 _QjournalCode = _QjournalCode.simplified();
  298                 _QjournalCode.replace(" ", "+");
  299             }
  300             // Get appropiate parameters for Journal or AnyJournal
  301             if (line.contains(Journal) || line.contains(AnyJournal))
  302             {
  303                 // Skip if already performed
  304                 if (!_query_parameter_count.contains(++readQueryParams))
  305                 {
  306                     while (line.contains(QRegExp("^journal=")))
  307                         line = stream.readLine();
  308                     _targetQ = line.remove(QRegExp("^query="));
  309                     line = stream.readLine();
  310                     _captionQ = line.remove(QRegExp("^capture_from_query="));
  311                     line = stream.readLine();
  312                     _referenceurl_prefix = line.remove(QRegExp("^referenceurl_prefix="));
  313                     line = stream.readLine();
  314                     _referenceurl_sufix = line.remove(QRegExp("^referenceurl_sufix="));
  315                     line = stream.readLine();
  316                     _pdfurl_prefix = line.remove(QRegExp("^pdfurl_prefix="));
  317                     line = stream.readLine();
  318                     _pdfurl_sufix = line.remove(QRegExp("^pdfurl_sufix="));
  319                     line = stream.readLine();
  320                     _action = line.remove(QRegExp("^action="));
  321                     // Setting Query Parameters
  322                     updateQueryPlaceholders();
  323                     // Finally, check for unresolved cb2Bib tags
  324                     if (areQueryParametersValid())
  325                     {
  326                         _query_parameter_count.append(readQueryParams);
  327                         return true;
  328                     }
  329                 }
  330             }
  331         }
  332     }
  333     file.close();
  334     _is_end_of_file = true;
  335     return (!_query_parameter_count.isEmpty());
  336 }
  337 
  338 void networkQuery::updateQueryPlaceholders()
  339 {
  340     const QString pdfurl_prefix(_pdfurl_prefix);
  341     if (!_Qtitle.isEmpty())
  342     {
  343         _targetQ.replace("<<title>>", _Qtitle);
  344         _captionQ.replace("<<title>>", _Qtitle);
  345         _referenceurl_prefix.replace("<<title>>", _Qtitle);
  346         _referenceurl_sufix.replace("<<title>>", _Qtitle);
  347         _pdfurl_prefix.replace("<<title>>", _Qtitle);
  348         _pdfurl_sufix.replace("<<title>>", _Qtitle);
  349     }
  350     if (!_QjournalCode.isEmpty())
  351     {
  352         _targetQ.replace("<<journal>>", _QjournalCode);
  353         _captionQ.replace("<<journal>>", _QjournalCode);
  354         _referenceurl_prefix.replace("<<journal>>", _QjournalCode);
  355         _referenceurl_sufix.replace("<<journal>>", _QjournalCode);
  356         _pdfurl_prefix.replace("<<journal>>", _QjournalCode);
  357         _pdfurl_sufix.replace("<<journal>>", _QjournalCode);
  358     }
  359     if (!_Qpage.isEmpty())
  360     {
  361         _targetQ.replace("<<pages>>", _Qpage);
  362         _captionQ.replace("<<pages>>", _Qpage);
  363         _referenceurl_prefix.replace("<<pages>>", _Qpage);
  364         _referenceurl_sufix.replace("<<pages>>", _Qpage);
  365         _pdfurl_prefix.replace("<<pages>>", _Qpage);
  366         _pdfurl_sufix.replace("<<pages>>", _Qpage);
  367     }
  368     if (!_Qvolume.isEmpty())
  369     {
  370         _targetQ.replace("<<volume>>", _Qvolume);
  371         _captionQ.replace("<<volume>>", _Qvolume);
  372         _referenceurl_prefix.replace("<<volume>>", _Qvolume);
  373         _referenceurl_sufix.replace("<<volume>>", _Qvolume);
  374         _pdfurl_prefix.replace("<<volume>>", _Qvolume);
  375         _pdfurl_sufix.replace("<<volume>>", _Qvolume);
  376     }
  377     if (!_Qdoi.isEmpty())
  378     {
  379         _targetQ.replace("<<doi>>", _Qdoi);
  380         _captionQ.replace("<<doi>>", _Qdoi);
  381         _referenceurl_prefix.replace("<<doi>>", _Qdoi);
  382         _referenceurl_sufix.replace("<<doi>>", _Qdoi);
  383         _pdfurl_prefix.replace("<<doi>>", _Qdoi);
  384         _pdfurl_sufix.replace("<<doi>>", _Qdoi);
  385     }
  386     if (!_Qexcerpt.isEmpty())
  387     {
  388         _targetQ.replace("<<excerpt>>", _Qexcerpt);
  389         _captionQ.replace("<<excerpt>>", _Qexcerpt);
  390         _referenceurl_prefix.replace("<<excerpt>>", _Qexcerpt);
  391         _referenceurl_sufix.replace("<<excerpt>>", _Qexcerpt);
  392         _pdfurl_prefix.replace("<<excerpt>>", _Qexcerpt);
  393         _pdfurl_sufix.replace("<<excerpt>>", _Qexcerpt);
  394     }
  395     if (!_Qeprint.isEmpty())
  396     {
  397         _targetQ.replace("<<eprint>>", _Qeprint);
  398         _captionQ.replace("<<eprint>>", QRegExp::escape(_Qeprint));
  399         _referenceurl_prefix.replace("<<eprint>>", _Qeprint);
  400         _referenceurl_sufix.replace("<<eprint>>", _Qeprint);
  401         _pdfurl_prefix.replace("<<eprint>>", _Qeprint);
  402         _pdfurl_sufix.replace("<<eprint>>", _Qeprint);
  403     }
  404     _pdfurl_is_captured = (pdfurl_prefix != _pdfurl_prefix);
  405 }
  406 
  407 bool networkQuery::areQueryParametersValid()
  408 {
  409     if (!_action.isEmpty())
  410     {
  411         if (_action == "browse_query" || _action == "browse_referenceurl")
  412         {
  413             if (!_settingsP->value("networkQuery/isSupervised").toBool())
  414                 return false;
  415         }
  416         else if (!(_action == "htm2txt_query" || _action == "htm2txt_referenceurl" || _action == "merge_all_metadata" ||
  417                    _action == "merge_referenceurl_metadata"))
  418             return false;
  419     }
  420     const QString allParams(_targetQ + _captionQ + _referenceurl_prefix + _referenceurl_sufix + _pdfurl_prefix +
  421                             _pdfurl_sufix);
  422     return !(
  423                allParams.contains(QRegExp("(?:<<title>>|<<journal>>|<<pages>>|<<volume>>|<<doi>>|<<excerpt>>|<<eprint>>)")));
  424 }
  425 
  426 const QString networkQuery::encodeUrl(const QString& url) const
  427 {
  428     // Removes <<post>> tag if present and encodes URL to percent encoding
  429     QString encoded_url(url);
  430     encoded_url.remove(QRegExp("^<<post>>"));
  431     encoded_url = QUrl::toPercentEncoding(encoded_url, "+:/?=&\\");
  432     return encoded_url;
  433 }
  434 
  435 bool networkQuery::checkQueryFile(const QString& fn) const
  436 {
  437     if (fn.isEmpty())
  438     {
  439         c2bUtils::warn(tr("No network query file especified"));
  440         return false;
  441     }
  442     QFileInfo fi(fn);
  443     if (!fi.exists() || !fi.isReadable())
  444     {
  445         c2bUtils::warn(tr("Could not open network query file %1 for reading").arg(fn));
  446         return false;
  447     }
  448     return true;
  449 }