"Fossies" - the Fresh Open Source Software Archive

Member "cb2bib-2.0.1/src/c2b/bibParser.cpp" (12 Feb 2021, 25908 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "bibParser.cpp", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.0.0_vs_2.0.1.

    1 /***************************************************************************
    2  *   Copyright (C) 2004-2021 by Pere Constans
    3  *   constans@molspaces.com
    4  *   cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
    5  *   See the LICENSE file that comes with this distribution.
    6  ***************************************************************************/
    7 #include "bibParser.h"
    8 
    9 #include "arxivXml.h"
   10 #include "bibPreparser.h"
   11 #include "crJson.h"
   12 #include "document.h"
   13 #include "heuristicBibParser.h"
   14 #include "idMaker.h"
   15 #include "preprocess.h"
   16 #include "pubmedXml.h"
   17 #include "settings.h"
   18 
   19 #include <QCoreApplication>
   20 #include <QUrl>
   21 
   22 
   23 bibParser::bibParser(QObject* parento) : coreBibParser(parento)
   24 {
   25     // Creating journal name database
   26     _journal_dbP = new journalDB(_settingsP->fileName("cb2Bib/JournalFile"));
   27 
   28     // Creating month list
   29     _month_dbP = new monthDB();
   30 
   31     // Creating cite and document ID makers
   32     _cite_idmP = new idMaker("cb2Bib/CiteIdPattern", this);
   33     _file_idmP = new idMaker("cb2Bib/DocumentIdPattern", this);
   34 
   35     // Creating (external) reference preparser
   36     _preparserP = new bibPreparser(this);
   37     connect(_preparserP, SIGNAL(statusMessage(QString)), this, SIGNAL(statusMessage(QString)));
   38 
   39     // Creating stream preprocess object
   40     _preprocessP = new preprocess(this);
   41 
   42     // Creating heuristic bibliographic parser
   43     _heuristic_parserP = new heuristicBibParser(this);
   44 }
   45 
   46 bibParser::~bibParser()
   47 {
   48     delete _journal_dbP;
   49     delete _month_dbP;
   50     delete _heuristic_parserP;
   51 }
   52 
   53 /**
   54     Process each field and set its final format
   55 */
   56 QString bibParser::parse(const QString& field, const QString& value, const QString& init_value)
   57 {
   58     QString v(value);
   59     if (field == QLatin1String("file"))
   60         return v.trimmed();
   61     if (field == "abstract")
   62         document::normalize(v, document::Complete);
   63     v = removeTags(v);
   64     if (v.isEmpty())
   65         return v;
   66     c2bUtils::fullBibToC2b(v);
   67     if (field == QLatin1String("author"))
   68     {
   69         c2bUtils::debug(v);
   70         v = _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
   71     }
   72     else if (field == QLatin1String("addauthors"))
   73     {
   74         c2bUtils::debug(v);
   75         if (init_value.isEmpty())
   76             v = _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
   77         else
   78             v = init_value + " and " + _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
   79     }
   80     else if (field == QLatin1String("editor"))
   81     {
   82         c2bUtils::debug(v);
   83         v.remove(QRegExp("\\((Editor|Editors|Ed|Eds)\\.{0,1}\\)", Qt::CaseInsensitive));
   84         v = _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
   85     }
   86     else if (field == QLatin1String("addeditors"))
   87     {
   88         c2bUtils::debug(v);
   89         v.remove(QRegExp("\\((Editor|Editors|Ed|Eds)\\.{0,1}\\)", Qt::CaseInsensitive));
   90         if (init_value.isEmpty())
   91             v = _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
   92         else
   93             v = init_value + " and " + _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
   94     }
   95     else if (field == QLatin1String("doi"))
   96     {
   97         v.remove(QRegExp("^.+(?=10\\.[\\d\\.]+/\\S+)"));
   98         if (v.endsWith(QLatin1Char('.')) || v.endsWith(QLatin1Char(',')) || v.endsWith(QLatin1Char(';')))
   99             v.chop(1);
  100     }
  101     else if (field == QLatin1String("url"))
  102     {
  103         // Remove redundant DOI URLs
  104         if (v.contains("doi.org/10."))
  105             v.clear();
  106     }
  107     else if (field == QLatin1String("isbn"))
  108         v.remove(' ');
  109     else if (field == QLatin1String("journal"))
  110     {
  111         if (_settingsP->value("cb2Bib/SetJournalsToFullname").toBool())
  112             v = fullJournal(v);
  113         else
  114             v = abbreviatedJournal(v);
  115     }
  116     else if (field == QLatin1String("keywords"))
  117     {
  118         v.replace(" - ", ",");
  119         QStringList kl(v.split(QRegExp("[^\\w\\s-']"), QString::SkipEmptyParts));
  120         kl.removeAll(" ");
  121         kl.removeDuplicates();
  122         v = kl.join(", ");
  123         v.replace(QRegExp("\\s+,"), ",");
  124         v = v.toLower();
  125     }
  126     else if (field == QLatin1String("month"))
  127         v = _month_dbP->retrieve(v);
  128     // Fields edition and note require first letter capitalization
  129     else if (field == QLatin1String("edition"))
  130         v = c2bUtils::setCapitalization(v);
  131     else if (field == QLatin1String("note"))
  132         v = c2bUtils::setCapitalization(v);
  133     // Process pages, volume, number, and year to set hyphenation
  134     else if (field == QLatin1String("pages"))
  135         v = adjacentNumbers(v);
  136     else if (field == QLatin1String("volume"))
  137         v = adjacentNumbers(v);
  138     else if (field == QLatin1String("number"))
  139         v = adjacentNumbers(v);
  140     else if (field == QLatin1String("year"))
  141         v = adjacentNumbers(v);
  142     else if (field == QLatin1String("title") || field == QLatin1String("booktitle"))
  143         v = c2bUtils::setCapitalization(v);
  144     else if (field == QLatin1String("addtitle"))
  145     {
  146         if (!init_value.isEmpty())
  147             v = init_value + QLatin1String(": ") + v;
  148         v = c2bUtils::setCapitalization(v);
  149     }
  150     return c2bUtils::simplifyString(v);
  151 }
  152 
  153 /**
  154     Process each field and set its final format for a complete reference
  155 */
  156 bibReference& bibParser::parse(bibReference& reference)
  157 {
  158     QMutableHashIterator<QString, QString> i(reference);
  159     while (i.hasNext())
  160     {
  161         i.next();
  162         const QString v(parse(i.key(), i.value()));
  163         i.setValue(v);
  164     }
  165     return reference;
  166 }
  167 
  168 QString bibParser::setJournalsToFull(const QString& text, const bool alternate)
  169 {
  170     const bool ConvertReferenceToLaTeX(_settingsP->value("cb2Bib/ConvertReferenceToLaTeX").toBool());
  171     QString substituted_text(text);
  172     QRegExp jnre("\\bjournal\\s*=\\s*[\\{\"]", Qt::CaseInsensitive);
  173     QString jn;
  174     int pos(0);
  175     uint nj(0);
  176     while (pos >= 0)
  177     {
  178         pos = jnre.indexIn(substituted_text, pos);
  179         if (pos > 0 && c2bUtils::inBraces(pos + jnre.matchedLength(), substituted_text, &jn))
  180         {
  181             const int jnlength(jn.length());
  182             c2bUtils::bibToC2b(jn);
  183             jn = alternate ? alternateFullJournal(jn) : fullJournal(jn);
  184             if (ConvertReferenceToLaTeX)
  185                 c2bUtils::c2bToBib(jn);
  186             pos += jnre.matchedLength();
  187             substituted_text.replace(pos, jnlength, jn);
  188             nj++;
  189         }
  190         else if (pos >= 0)
  191             ++pos;
  192     }
  193     emit statusMessage(tr("Processed %1 journal names.").arg(nj));
  194     return substituted_text;
  195 }
  196 
  197 QString bibParser::setJournalsToAbbreviated(const QString& text, const bool alternate)
  198 {
  199     const bool ConvertReferenceToLaTeX(_settingsP->value("cb2Bib/ConvertReferenceToLaTeX").toBool());
  200     QString substituted_text(text);
  201     QRegExp jnre("\\bjournal\\s*=\\s*[\\{\"]", Qt::CaseInsensitive);
  202     QString jn;
  203     int pos(0);
  204     uint nj(0);
  205     while (pos >= 0)
  206     {
  207         pos = jnre.indexIn(substituted_text, pos);
  208         if (pos > 0 && c2bUtils::inBraces(pos + jnre.matchedLength(), substituted_text, &jn))
  209         {
  210             const int jnlength(jn.length());
  211             c2bUtils::bibToC2b(jn);
  212             jn = alternate ? alternateAbbreviatedJournal(jn) : abbreviatedJournal(jn);
  213             if (ConvertReferenceToLaTeX)
  214                 c2bUtils::c2bToBib(jn);
  215             pos += jnre.matchedLength();
  216             substituted_text.replace(pos, jnlength, jn);
  217             nj++;
  218         }
  219         else if (pos >= 0)
  220             ++pos;
  221     }
  222     emit statusMessage(tr("Processed %1 journal names.").arg(nj));
  223     return substituted_text;
  224 }
  225 
  226 QString bibParser::excerpt(const QString& text, const QStringList& hints) const
  227 {
  228     QString txt(removeTags(text));
  229     txt.replace(QRegExp("\\[Bibliographic Metadata.+/Bibliographic Metadata\\]"), " ");
  230     txt.replace(QRegExp("(http://|https://|ftp://|www\\.|ftp\\.)(www\\.|ftp\\.){0,1}\\S+"), " ");
  231     txt.replace(QRegExp("\\b[A-Z]+\\b"), " ");
  232     txt.replace(QRegExp("\\d"), " ");
  233     txt.replace(c2bUtils::nonLetter, " ");
  234     txt.replace(QRegExp("\\b\\w{1,2}\\b"), " ");
  235     txt.replace(
  236         QRegExp("\\b(about|and|are|com|for|from|how|into|that|the|their|this|where|with|www)\\b", Qt::CaseInsensitive),
  237         " ");
  238     txt.replace(QRegExp("\\b(january|february|march|april|may|june|july|august|september|october|november|december)\\b",
  239                         Qt::CaseInsensitive),
  240                 " ");
  241     txt = hints.join(" ") + ' ' + txt;
  242     txt = c2bUtils::simplifyString(txt);
  243     const QStringList txtlist(txt.split(' ', QString::SkipEmptyParts));
  244     QStringList txtlistSimp;
  245     for (int i = 0; i < std::min(15, txtlist.count()); ++i)
  246         txtlistSimp.append(txtlist.at(i));
  247     return txtlistSimp.join(" ");
  248 }
  249 
  250 void bibParser::setField(const QString& name, const QString& value)
  251 {
  252     _current_reference[name] = parse(name, value);
  253 }
  254 
  255 void bibParser::setReferenceType(const QString& type)
  256 {
  257     _current_reference.typeName = type;
  258 }
  259 
  260 void bibParser::setCiteID()
  261 {
  262     _current_reference.citeidName = _cite_idmP->makeID(_current_reference);
  263 }
  264 
  265 QString bibParser::documentFilename(const QString& base_fn, const QString& bibtex_fn)
  266 {
  267     if (base_fn.isEmpty())
  268         return base_fn;
  269     if (!_settingsP->value("cb2Bib/MovePdf").toBool())
  270     {
  271         const QUrl u(base_fn);
  272         const QString scheme(u.scheme());
  273         if (scheme == "file")
  274             return parse("file", u.toLocalFile());
  275         else
  276             return parse("file", base_fn);
  277     }
  278     QString filename(_file_idmP->makeID(_current_reference));
  279     if (filename.isEmpty())
  280         filename = "no_cite_id";
  281     QFileInfo fi(base_fn);
  282     QString docExtension('.' + fi.suffix().toLower());
  283     // Possible document extensions
  284     if (!docExtension.contains(QRegExp("^\\.\\w{2,4}$")))
  285         docExtension = ".pdf"; // Default
  286     if (docExtension == ".gz")
  287         if (fi.completeSuffix().toLower() == ".ps.gz")
  288             docExtension = ".ps.gz"; // Composite extension
  289     filename = c2bUtils::documentFilename(_settingsP->value("cb2Bib/RelativePdfDirectory").toBool(), bibtex_fn,
  290                                           _settingsP->fileName("cb2Bib/PdfDirectory"), filename + docExtension);
  291     return parse("file", filename);
  292 }
  293 
  294 
  295 /****************************************************************************
  296 
  297 AUTOMATIC BIB CAPTION
  298 
  299 *****************************************************************************/
  300 
  301 void bibParser::preparse(const QString& text, QString* out_text)
  302 {
  303     _preparserP->preparse(text, out_text);
  304 }
  305 
  306 /** \page clipboard Extracting Data from the Clipboard
  307 
  308     Clipboard contents is processed according to the following rules:
  309 
  310     - Perform external, user-defined preparsing on input stream. See \ref c2bconf_clipboard.
  311 
  312     - Perform user-defined substitutions on input stream. See \ref c2bconf_clipboard.
  313 
  314     - Check if input stream is already a BibTeX entry. If so, process entry.
  315 
  316     - Check if input stream is, in this order of preference, a PubMed XML,
  317     arXiv XML, CR JSON, or Medline entry. If so, process entry.
  318 
  319     - Preprocess author names: PI JOAN III -> Pi III, J.
  320     (care of name prefixes, suffixes, and removal of ambiguities).
  321 
  322 
  323     If otherwise,
  324 
  325     - Extract DOI \n (DOI, URL and FILE/PDF are preprocessed, performed before
  326     the automatic recognition takes place.)
  327 
  328     - Extract URL
  329 
  330     - Remove leading and trailing white spaces, TABs and CRs.
  331 
  332     - "\r\n", "\n" and/or "\r" replaced by the line indicator tag <code><NewLineN></code>.
  333 
  334     - Replace "\t" and ten or more consecutive "\s" by the tabular tag <code><TabN></code>.
  335 
  336     - Simplify white spaces
  337 
  338     - Start the automatic recognition engine.
  339 
  340 
  341     If the automatic recognition engine fails, optionally, a heuristic guessing
  342     will be performed.
  343 
  344 
  345     See also \ref heuristics and \ref metadata.
  346 
  347 */
  348 void bibParser::parse(const QString& text, QString* out_text, QString* out_tagtext)
  349 {
  350     QString& otext = *out_text;
  351     otext = text;
  352     QString& ottext = *out_tagtext;
  353     ottext.clear();
  354     _preprocessP->preprocessText(out_text);
  355 
  356     if (hasBibTeX(otext))
  357     {
  358         _current_reference = wholeReference(otext);
  359         parse(_current_reference);
  360         currentReferenceUpdated();
  361         _auto_recognized_string = tr("Processed as 'BibTeX'.");
  362         _auto_recognized = true;
  363         emit statusMessage(_auto_recognized_string);
  364         return;
  365     }
  366 
  367     const QString pubmed_identifier("<PubmedArticle>");
  368     if (otext.contains(pubmed_identifier))
  369     {
  370         pubmedXml pxml(metadataSection(otext, pubmed_identifier));
  371         if (pxml.hasError())
  372         {
  373             _auto_recognized_string = tr("Error parsing 'PubMed XML': %1.").arg(pxml.errorString());
  374             _auto_recognized = false;
  375         }
  376         else
  377         {
  378             _current_reference = pxml.reference();
  379             parse(_current_reference);
  380             currentReferenceUpdated();
  381             _auto_recognized_string = tr("Processed as 'PubMed XML'.");
  382             _auto_recognized = true;
  383         }
  384         emit statusMessage(_auto_recognized_string);
  385         return;
  386     }
  387 
  388     const QString arxiv_identifier("arxiv.org/api/");
  389     const QString cr_identifier("{\"status\":\"ok\",\"message-type\":\"work\"");
  390     if (otext.contains(arxiv_identifier) || otext.contains(cr_identifier))
  391     {
  392         if (otext.contains(cr_identifier))
  393         {
  394             crJson crjson(metadataSection(otext, cr_identifier));
  395             if (crjson.hasError())
  396             {
  397                 _auto_recognized_string = tr("Error parsing 'CR JSON': %1.").arg(crjson.errorString());
  398                 _auto_recognized = false;
  399                 emit statusMessage(_auto_recognized_string);
  400                 return;
  401             }
  402             else
  403             {
  404                 _current_reference = crjson.reference();
  405                 _auto_recognized_string = tr("Processed as 'CR JSON'.");
  406             }
  407         }
  408         if (otext.contains(arxiv_identifier))
  409         {
  410             arxivXml axml(metadataSection(otext, arxiv_identifier));
  411             if (axml.hasError())
  412             {
  413                 _auto_recognized_string = tr("Error parsing 'arXiv XML': %1.").arg(axml.errorString());
  414                 _auto_recognized = false;
  415                 if (_current_reference.size() > 0)
  416                     _current_reference.clearReference();
  417                 emit statusMessage(_auto_recognized_string);
  418                 return;
  419             }
  420             else
  421             {
  422                 if (_current_reference.size() == 0)
  423                 {
  424                     _current_reference = axml.reference();
  425                     _auto_recognized_string = tr("Processed as 'arXiv XML'.");
  426                 }
  427                 else
  428                 {
  429                     // Merging arXiv and CR metadata
  430                     const bibReference& areference(axml.reference());
  431                     if (!areference.value("abstract").isEmpty())
  432                         _current_reference["abstract"] = areference.value("abstract");
  433                     if (!areference.value("title").isEmpty())
  434                         _current_reference["title"] = areference.value("title");
  435                     if (!areference.value("eprint").isEmpty())
  436                         _current_reference["eprint"] = areference.value("eprint");
  437                     if (!areference.value("url").isEmpty())
  438                         _current_reference["url"] = areference.value("url");
  439                     _auto_recognized_string = tr("Processed as merged 'arXiv XML' and 'CR JSON'.");
  440                 }
  441             }
  442         }
  443         parse(_current_reference);
  444         currentReferenceUpdated();
  445         _auto_recognized = true;
  446         emit statusMessage(_auto_recognized_string);
  447         return;
  448     }
  449 
  450     if (otext.contains(QRegExp("^\\s*PMID\\s*-")))
  451     {
  452         _current_reference.typeName = "article";
  453         otext = ' ' + otext;
  454         ottext = otext;
  455         // http://www.nlm.nih.gov/bsd/mms/medlineelements.html
  456         ottext.replace(QRegExp("[\\n\\r]\\s*([A-Z]{2,4}\\s*-)"), "][\\1"); // Two to four capital letter in field tags
  457         ottext = c2bUtils::simplifyString(ottext);
  458         if (!ottext.contains(QRegExp("\\[FAU\\s+-")))
  459             ottext.replace(QRegExp("\\[(AU\\s*-\\s*[-'\\w]+)"), "[F\\1 ");
  460         QStringList fList(ottext.split("]["));
  461         QString kw;
  462         QRegExp fld("^([A-Z]{2,4})\\s{0,1}-\\s*(.+)$");
  463         fld.setPatternSyntax(QRegExp::RegExp2);
  464         for (QStringList::Iterator it = fList.begin(); it != fList.end(); ++it)
  465         {
  466             if (fld.indexIn(*it) == -1)
  467                 continue;
  468             const QString tag(fld.cap(1));
  469             QString value(fld.cap(2));
  470             if (tag == "AB")
  471                 _current_reference["abstract"] = parse("abstract", value);
  472             else if (tag == "FAU")
  473                 _current_reference["author"] =
  474                     parse("addauthors", authorFromMedline(value), _current_reference.value("author"));
  475             else if (tag == "TA")
  476                 _current_reference["journal"] = parse("journal", value);
  477             else if (tag == "IP")
  478                 _current_reference["number"] = parse("number", value);
  479             else if (tag == "PG")
  480                 _current_reference["pages"] = parse("pages", value);
  481             else if (tag == "TI")
  482                 _current_reference["title"] = parse("title", value);
  483             else if (tag == "PMID")
  484                 _current_reference["url"] = parse("url", c2bUtils::pubmedUrl.arg(value));
  485             else if (tag == "VI")
  486                 _current_reference["volume"] = parse("volume", value);
  487             else if (tag == "AID")
  488             {
  489                 if (value.contains("[doi]"))
  490                     _current_reference["doi"] = parse("doi", value.remove("[doi]"));
  491             }
  492             else if (tag == "DP")
  493                 _current_reference["year"] = parse("year", value.replace(QRegExp("^([\\d\\s]+).*$"), "\\1"));
  494             else if (tag == "MH")
  495                 kw += "; " + value.trimmed();
  496         }
  497         if (!kw.isEmpty())
  498             _current_reference["keywords"] = parse("keywords", kw.remove(0, 2));
  499         currentReferenceUpdated();
  500         _auto_recognized_string = tr("Processed as 'PubMed - Medline Journals'.");
  501         _auto_recognized = true;
  502         emit statusMessage(_auto_recognized_string);
  503         return;
  504     }
  505 
  506     _heuristic_parserP->heuristicFields(otext);
  507 
  508     // Set tags and start regular expression extraction
  509     ottext = setTags(otext);
  510     QString regular_expression_f(_settingsP->fileName("cb2Bib/RegularExpressionFile"));
  511     checkRegExpFile(regular_expression_f);
  512     QFile file(regular_expression_f);
  513     file.open(QIODevice::ReadOnly | QIODevice::Text);
  514     QString ItemX;
  515     QString line;
  516     QString reftype;
  517     QString fieldset;
  518     QTextStream stream(&file);
  519     stream.setCodec("UTF-8");
  520     stream.setAutoDetectUnicode(true);
  521     int nfilters(0);
  522 
  523     while (!stream.atEnd())
  524     {
  525         line = stream.readLine();
  526         if (!(line.isEmpty() || line.contains(QRegExp("^#"))))
  527         {
  528             reftype = stream.readLine();
  529             fieldset = stream.readLine();
  530             ItemX = stream.readLine();
  531 
  532             c2bUtils::debug(tr("The RegExp file contains1: |%1|").arg(line));
  533             c2bUtils::debug(tr("The RegExp file contains2: |%1|").arg(reftype));
  534             c2bUtils::debug(tr("The RegExp file contains3: |%1|").arg(fieldset));
  535             c2bUtils::debug(tr("The RegExp file contains4: |%1|").arg(ItemX));
  536 
  537             QRegExp rx(ItemX);
  538             rx.setMinimal(true);
  539             if (!rx.isValid())
  540                 c2bUtils::warn(tr("RegExp |%1| is not valid").arg(ItemX));
  541 
  542             const QStringList list(fieldset.split(' ', QString::SkipEmptyParts));
  543             const int efields(list.count());
  544             const int cfields(rx.captureCount());
  545             int npos(rx.indexIn(ottext));
  546             c2bUtils::debug(tr("Expected Fields: |%1|").arg(efields));
  547             c2bUtils::debug(tr("Captured Fields: |%1|").arg(cfields));
  548             c2bUtils::debug(tr("Position: |%1|").arg(npos));
  549             if (efields != cfields)
  550             {
  551                 c2bUtils::warn(
  552                     tr("RegExp |%1| is not valid. Mismatch between expected and actual captures").arg(ItemX));
  553                 npos = -1;
  554             }
  555             nfilters++;
  556 
  557             if (npos > -1)
  558             {
  559                 for (int i = 0; i < cfields; i++)
  560                 {
  561                     const QString& listi = list.at(i);
  562                     int ii(i + 1);
  563                     c2bUtils::debug(QString("Fields in Template %1: |%2|").arg(i).arg(rx.cap(ii)));
  564                     if (_field_re.indexIn(listi) > -1)
  565                     {
  566                         if (listi == "author")
  567                             // Reminder: "addauthors" requires to init _current_reference["author"]
  568                             _current_reference[listi] =
  569                                 parse("addauthors", rx.cap(ii), _current_reference.value(listi));
  570                         else if (listi == "editor")
  571                             // Reminder: "addeditors" requires to init _current_reference["editor"]
  572                             _current_reference[listi] =
  573                                 parse("addeditors", rx.cap(ii), _current_reference.value(listi));
  574                         else if (listi == "title")
  575                             // Reminder: "addtitle" requires to init _current_reference["title"]
  576                             _current_reference[listi] = parse("addtitle", rx.cap(ii), _current_reference.value(listi));
  577                         else
  578                             _current_reference[listi] = parse(listi, rx.cap(ii));
  579                     }
  580                 }
  581                 _current_reference.typeName = reftype;
  582                 currentReferenceUpdated();
  583                 _auto_recognized_string = tr("Processed as '%1'.").arg(line);
  584                 _auto_recognized = true;
  585                 emit statusMessage(_auto_recognized_string);
  586                 file.close();
  587                 return;
  588             }
  589         }
  590     }
  591     file.close();
  592 
  593     // Heuristic Bib Parsing
  594     if (_settingsP->value("cb2Bib/DoHeuristicGuess").toBool())
  595     {
  596         // Sometimes (if user is on tag mode) tag could be on otext. Revert tags here, just in case.
  597         const QString clean_text(removeTags(ottext));
  598         _heuristic_parserP->guessFields(clean_text, ottext);
  599         currentReferenceUpdated();
  600         _auto_recognized_string =
  601             tr("Applied %1 filters: No automatic format detection. %2 fields guessed.").arg(nfilters).arg(fieldCount());
  602     }
  603     else
  604         _auto_recognized_string = tr("Applied %1 filters: No automatic format detection.").arg(nfilters);
  605     emit statusMessage(_auto_recognized_string);
  606 }
  607 
  608 void bibParser::checkRegExpFile(const QString& fn)
  609 {
  610     if (fn.isEmpty())
  611     {
  612         c2bUtils::warn(tr("No regular expression file especified"));
  613         return;
  614     }
  615     QFileInfo fi(fn);
  616     if (!fi.exists() || !fi.isReadable())
  617     {
  618         c2bUtils::warn(tr("Could not open regular expression file %1 for reading").arg(fn));
  619         return;
  620     }
  621 }
  622 
  623 void bibParser::guessFields(const QString& text)
  624 {
  625     const QString clean_text(text.simplified());
  626     const QString tagged_text(setTags(text));
  627     _heuristic_parserP->heuristicFields(clean_text);
  628     _heuristic_parserP->guessFields(clean_text, tagged_text);
  629     currentReferenceUpdated();
  630     _auto_recognized_string = tr("%1 fields guessed.").arg(fieldCount());
  631     emit statusMessage(_auto_recognized_string);
  632 }
  633 
  634 QString bibParser::setTags(const QString& text) const
  635 {
  636     QString tagged_text(text.trimmed());
  637     tagged_text.replace(QRegExp("\\r\\n"), "<found_new_line>"); // Windows new line
  638     tagged_text.replace(QRegExp("\\n"), "<found_new_line>");    // Linux new line, LF
  639     tagged_text.replace(QRegExp("\\r"), "<found_new_line>");    // OSX new line, CR
  640     QStringList spText(tagged_text.split("<found_new_line>"));
  641     int n(spText.count());
  642     tagged_text.clear();
  643     for (int i = 0; i < n - 1; i++)
  644         tagged_text += spText.at(i) + QString("<NewLine%1>").arg(i + 1);
  645     tagged_text += spText[n - 1];
  646     spText = tagged_text.split(QRegExp("(\\s{10,}|\\t)"));
  647     n = spText.count();
  648     tagged_text.clear();
  649     for (int i = 0; i < n - 1; i++)
  650         tagged_text += spText.at(i) + QString("<Tab%1>").arg(i + 1);
  651     tagged_text += spText[n - 1];
  652     tagged_text = c2bUtils::simplifyString(tagged_text);
  653     return tagged_text;
  654 }
  655 
  656 QString bibParser::removeTags(const QString& text) const
  657 {
  658     QString clean(text);
  659     clean.remove("[[");
  660     clean.remove("]]");
  661     clean.replace(QRegExp("<NewLine\\d+>"), " ");
  662     clean.replace(QRegExp("<Tab\\d+>"), " ");
  663     clean = c2bUtils::simplifyString(clean);
  664     return clean;
  665 }
  666 
  667 QString bibParser::metadataSection(const QString& text, const QString& identifier) const
  668 {
  669     const QStringList lines(text.split(QRegExp("[\\r\\n]"), QString::KeepEmptyParts));
  670     const int nl(lines.count());
  671     int il(-1);
  672     int sl(0);
  673     int el(0);
  674     for (int l = 0; l < nl; ++l)
  675         if (lines.at(l).contains(identifier))
  676         {
  677             il = l;
  678             break;
  679         }
  680     if (il == -1)
  681         return QString();
  682     for (int l = il + 1; l < nl; ++l)
  683         if (lines.at(l) == "/Raw Metadata]")
  684         {
  685             el = l;
  686             break;
  687         }
  688     for (int l = il; l >= 0; --l)
  689         if (lines.at(l) == "[Raw Metadata")
  690         {
  691             sl = l;
  692             break;
  693         }
  694     if (sl == 0 && el == 0)
  695         return text.trimmed();
  696     if (sl == 0 || el == 0)
  697         return QString();
  698     QString section;
  699     for (int l = sl + 1; l < el; ++l)
  700         section += '\n' + lines.at(l);
  701     return section.trimmed();
  702 }