"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "src/c2b/bibParser.cpp" between
cb2bib-1.9.9.tar.gz and cb2bib-2.0.0.tar.gz

About: cb2Bib is a multiplatform application for rapidly extracting unformatted or unstandardized bibliographic references from email alerts, journal Web pages, and PDF files.

bibParser.cpp  (cb2bib-1.9.9):bibParser.cpp  (cb2bib-2.0.0)
/*************************************************************************** /***************************************************************************
* Copyright (C) 2004-2018 by Pere Constans * Copyright (C) 2004-2019 by Pere Constans
* constans@molspaces.com * constans@molspaces.com
* cb2Bib version 1.9.9. Licensed under the GNU GPL version 3. * cb2Bib version 2.0.0. Licensed under the GNU GPL version 3.
* See the LICENSE file that comes with this distribution. * See the LICENSE file that comes with this distribution.
***************************************************************************/ ***************************************************************************/
#include "bibParser.h" #include "bibParser.h"
#include "arxivXml.h" #include "arxivXml.h"
#include "authorString.h" #include "authorString.h"
#include "bibPreparser.h" #include "bibPreparser.h"
#include "crJson.h" #include "crJson.h"
#include "document.h" #include "document.h"
#include "heuristicBibParser.h" #include "heuristicBibParser.h"
skipping to change at line 37 skipping to change at line 37
// Creating month list // Creating month list
_month_dbP = new monthDB(); _month_dbP = new monthDB();
// Creating cite and document ID makers // Creating cite and document ID makers
_cite_idmP = new idMaker("cb2Bib/CiteIdPattern", this); _cite_idmP = new idMaker("cb2Bib/CiteIdPattern", this);
_file_idmP = new idMaker("cb2Bib/DocumentIdPattern", this); _file_idmP = new idMaker("cb2Bib/DocumentIdPattern", this);
// Creating (external) reference preparser // Creating (external) reference preparser
_preparserP = new bibPreparser(this); _preparserP = new bibPreparser(this);
connect(_preparserP, SIGNAL(statusMessage(const QString&)), this, SIGNAL(sta tusMessage(const QString&))); connect(_preparserP, SIGNAL(statusMessage(QString)), this, SIGNAL(statusMess age(QString)));
// Creating stream preprocess object // Creating stream preprocess object
_preprocessP = new preprocess(this); _preprocessP = new preprocess(this);
// Creating heuristic bibliographic parser // Creating heuristic bibliographic parser
_heuristic_parserP = new heuristicBibParser(this); _heuristic_parserP = new heuristicBibParser(this);
} }
bibParser::~bibParser() bibParser::~bibParser()
{ {
skipping to change at line 65 skipping to change at line 65
*/ */
QString bibParser::parse(const QString& field, const QString& value, const QStri ng& init_value) QString bibParser::parse(const QString& field, const QString& value, const QStri ng& init_value)
{ {
QString v(value); QString v(value);
if (field == "file") if (field == "file")
return v.trimmed(); return v.trimmed();
if (field == "abstract") if (field == "abstract")
document::normalize(v, document::Complete); document::normalize(v, document::Complete);
v = removeTags(v); v = removeTags(v);
if (v.isEmpty()) if (v.isEmpty())
return (v); return v;
c2bUtils::fullBibToC2b(v); c2bUtils::fullBibToC2b(v);
if (field == "author") if (field == "author")
{ {
c2bUtils::debug(v); c2bUtils::debug(v);
v = _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").t oBool()); v = _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").t oBool());
} }
else if (field == "addauthors") else if (field == "addauthors")
{ {
c2bUtils::debug(v); c2bUtils::debug(v);
if (init_value.isEmpty()) if (init_value.isEmpty())
skipping to change at line 166 skipping to change at line 166
{ {
i.next(); i.next();
const QString v(parse(i.key(), i.value())); const QString v(parse(i.key(), i.value()));
i.setValue(v); i.setValue(v);
} }
return reference; return reference;
} }
QString bibParser::setJournalsToFull(const QString& text, const bool alternate) QString bibParser::setJournalsToFull(const QString& text, const bool alternate)
{ {
const bool ConvertReferenceToLaTeX(_settingsP->value("cb2Bib/ConvertReferenc eToLaTeX").toBool());
QString substituted_text(text); QString substituted_text(text);
QRegExp jnre("\\bjournal\\s*=\\s*[\\{\"](.*)[\\}\"]", Qt::CaseInsensitive); QRegExp jnre("\\bjournal\\s*=\\s*[\\{\"]", Qt::CaseInsensitive);
jnre.setMinimal(true); QString jn;
int pos(0); int pos(0);
uint nj(0); uint nj(0);
while (pos >= 0) while (pos >= 0)
{ {
pos = jnre.indexIn(substituted_text, pos); pos = jnre.indexIn(substituted_text, pos);
if (pos > -1) if (pos > 0 && c2bUtils::inBraces(pos + jnre.matchedLength(), substitute d_text, &jn))
{ {
QString line(jnre.cap(0)); const int jnlength(jn.length());
const QString jn(jnre.cap(1)); c2bUtils::bibToC2b(jn);
line.replace(jn, alternate ? alternateFullJournal(jn) : fullJournal( jn = alternate ? alternateFullJournal(jn) : fullJournal(jn);
jn)); if (ConvertReferenceToLaTeX)
substituted_text.replace(pos, jnre.matchedLength(), line); c2bUtils::c2bToBib(jn);
pos += line.length(); pos += jnre.matchedLength();
substituted_text.replace(pos, jnlength, jn);
nj++; nj++;
} }
emit statusMessage(tr("Processed %1 journal names...").arg(nj)); else if (pos >= 0)
QCoreApplication::processEvents(); ++pos;
} }
emit statusMessage(tr("Processed %1 journal names.").arg(nj)); emit statusMessage(tr("Processed %1 journal names.").arg(nj));
return (substituted_text); return substituted_text;
} }
QString bibParser::setJournalsToAbbreviated(const QString& text, const bool alte rnate) QString bibParser::setJournalsToAbbreviated(const QString& text, const bool alte rnate)
{ {
const bool ConvertReferenceToLaTeX(_settingsP->value("cb2Bib/ConvertReferenc eToLaTeX").toBool());
QString substituted_text(text); QString substituted_text(text);
QRegExp jnre("\\bjournal\\s*=\\s*[\\{\"](.*)[\\}\"]", Qt::CaseInsensitive); QRegExp jnre("\\bjournal\\s*=\\s*[\\{\"]", Qt::CaseInsensitive);
jnre.setMinimal(true); QString jn;
int pos(0); int pos(0);
uint nj(0); uint nj(0);
while (pos >= 0) while (pos >= 0)
{ {
pos = jnre.indexIn(substituted_text, pos); pos = jnre.indexIn(substituted_text, pos);
if (pos > -1) if (pos > 0 && c2bUtils::inBraces(pos + jnre.matchedLength(), substitute d_text, &jn))
{ {
QString line(jnre.cap(0)); const int jnlength(jn.length());
const QString jn(jnre.cap(1)); c2bUtils::bibToC2b(jn);
line.replace(jn, alternate ? alternateAbbreviatedJournal(jn) : abbre jn = alternate ? alternateAbbreviatedJournal(jn) : abbreviatedJourna
viatedJournal(jn)); l(jn);
substituted_text.replace(pos, jnre.matchedLength(), line); if (ConvertReferenceToLaTeX)
pos += line.length(); c2bUtils::c2bToBib(jn);
pos += jnre.matchedLength();
substituted_text.replace(pos, jnlength, jn);
nj++; nj++;
} }
emit statusMessage(tr("Processed %1 journal names...").arg(nj)); else if (pos >= 0)
QCoreApplication::processEvents(); ++pos;
} }
emit statusMessage(tr("Processed %1 journal names.").arg(nj)); emit statusMessage(tr("Processed %1 journal names.").arg(nj));
return (substituted_text); return substituted_text;
} }
QString bibParser::excerpt(const QString& text, const QStringList& hints) const QString bibParser::excerpt(const QString& text, const QStringList& hints) const
{ {
QString txt(removeTags(text)); QString txt(removeTags(text));
txt.replace(QRegExp("\\[Bibliographic Metadata.+/Bibliographic Metadata\\]") , " "); txt.replace(QRegExp("\\[Bibliographic Metadata.+/Bibliographic Metadata\\]") , " ");
txt.replace(QRegExp("(http://|https://|ftp://|www\\.|ftp\\.)(www\\.|ftp\\.){ 0,1}\\S+"), " "); txt.replace(QRegExp("(http://|https://|ftp://|www\\.|ftp\\.)(www\\.|ftp\\.){ 0,1}\\S+"), " ");
txt.replace(QRegExp("\\b[A-Z]+\\b"), " "); txt.replace(QRegExp("\\b[A-Z]+\\b"), " ");
txt.replace(QRegExp("\\d"), " "); txt.replace(QRegExp("\\d"), " ");
txt.replace(c2bUtils::nonLetter, " "); txt.replace(c2bUtils::nonLetter, " ");
txt.replace(QRegExp("\\b\\w{1,2}\\b"), " "); txt.replace(QRegExp("\\b\\w{1,2}\\b"), " ");
txt.replace(QRegExp("\\b(about|and|are|com|for|from|how|into|that|the|their| txt.replace(
this|where|with|www)\\b", Qt::CaseInsensitive), " "); QRegExp("\\b(about|and|are|com|for|from|how|into|that|the|their|this|whe
re|with|www)\\b", Qt::CaseInsensitive),
" ");
txt.replace(QRegExp("\\b(january|february|march|april|may|june|july|august|s eptember|october|november|december)\\b", txt.replace(QRegExp("\\b(january|february|march|april|may|june|july|august|s eptember|october|november|december)\\b",
Qt::CaseInsensitive), " "); Qt::CaseInsensitive),
" ");
txt = hints.join(" ") + ' ' + txt; txt = hints.join(" ") + ' ' + txt;
txt = c2bUtils::simplifyString(txt); txt = c2bUtils::simplifyString(txt);
const QStringList txtlist(txt.split(' ', QString::SkipEmptyParts)); const QStringList txtlist(txt.split(' ', QString::SkipEmptyParts));
QStringList txtlistSimp; QStringList txtlistSimp;
for (int i = 0; i < qMin(15, txtlist.count()); ++i) for (int i = 0; i < qMin(15, txtlist.count()); ++i)
txtlistSimp.append(txtlist.at(i)); txtlistSimp.append(txtlist.at(i));
return txtlistSimp.join(" "); return txtlistSimp.join(" ");
} }
void bibParser::setField(const QString& name, const QString& value) void bibParser::setField(const QString& name, const QString& value)
skipping to change at line 456 skipping to change at line 465
fld.setPatternSyntax(QRegExp::RegExp2); fld.setPatternSyntax(QRegExp::RegExp2);
for (QStringList::Iterator it = fList.begin(); it != fList.end(); ++it) for (QStringList::Iterator it = fList.begin(); it != fList.end(); ++it)
{ {
if (fld.indexIn(*it) == -1) if (fld.indexIn(*it) == -1)
continue; continue;
const QString tag(fld.cap(1)); const QString tag(fld.cap(1));
QString value(fld.cap(2)); QString value(fld.cap(2));
if (tag == "AB") if (tag == "AB")
_current_reference["abstract"] = parse("abstract", value); _current_reference["abstract"] = parse("abstract", value);
else if (tag == "FAU") else if (tag == "FAU")
_current_reference["author"] = parse("addauthors", author::fromM _current_reference["author"] =
edline(value), parse("addauthors", author::fromMedline(value), _current_ref
_current_reference.value("a erence.value("author"));
uthor"));
else if (tag == "TA") else if (tag == "TA")
_current_reference["journal"] = parse("journal", value); _current_reference["journal"] = parse("journal", value);
else if (tag == "IP") else if (tag == "IP")
_current_reference["number"] = parse("number", value); _current_reference["number"] = parse("number", value);
else if (tag == "PG") else if (tag == "PG")
_current_reference["pages"] = parse("pages", value); _current_reference["pages"] = parse("pages", value);
else if (tag == "TI") else if (tag == "TI")
_current_reference["title"] = parse("title", value); _current_reference["title"] = parse("title", value);
else if (tag == "PMID") else if (tag == "PMID")
_current_reference["url"] = parse("url", c2bUtils::pubmedUrl.arg (value)); _current_reference["url"] = parse("url", c2bUtils::pubmedUrl.arg (value));
skipping to change at line 534 skipping to change at line 543
const QStringList list(fieldset.split(' ', QString::SkipEmptyParts)) ; const QStringList list(fieldset.split(' ', QString::SkipEmptyParts)) ;
const int efields(list.count()); const int efields(list.count());
const int cfields(rx.captureCount()); const int cfields(rx.captureCount());
int npos(rx.indexIn(ottext)); int npos(rx.indexIn(ottext));
c2bUtils::debug(tr("Expected Fields: |%1|").arg(efields)); c2bUtils::debug(tr("Expected Fields: |%1|").arg(efields));
c2bUtils::debug(tr("Captured Fields: |%1|").arg(cfields)); c2bUtils::debug(tr("Captured Fields: |%1|").arg(cfields));
c2bUtils::debug(tr("Position: |%1|").arg(npos)); c2bUtils::debug(tr("Position: |%1|").arg(npos));
if (efields != cfields) if (efields != cfields)
{ {
c2bUtils::warn(tr("RegExp |%1| is not valid. Mismatch between ex c2bUtils::warn(
pected and actual captures").arg(ItemX)); tr("RegExp |%1| is not valid. Mismatch between expected and
actual captures").arg(ItemX));
npos = -1; npos = -1;
} }
nfilters++; nfilters++;
if (npos > -1) if (npos > -1)
{ {
for (int i = 0; i < cfields; i++) for (int i = 0; i < cfields; i++)
{ {
const QString& listi = list.at(i); const QString& listi = list.at(i);
int ii(i + 1); int ii(i + 1);
c2bUtils::debug(QString("Fields in Template %1: |%2|").arg(i ).arg(rx.cap(ii))); c2bUtils::debug(QString("Fields in Template %1: |%2|").arg(i ).arg(rx.cap(ii)));
if (_field_re.indexIn(listi) > -1) if (_field_re.indexIn(listi) > -1)
{ {
if (listi == "author") if (listi == "author")
// Reminder: "addauthors" requires to init _current_ reference["author"] // Reminder: "addauthors" requires to init _current_ reference["author"]
_current_reference[listi] = parse("addauthors", rx.c _current_reference[listi] =
ap(ii), _current_reference.value(listi)); parse("addauthors", rx.cap(ii), _current_referen
ce.value(listi));
else if (listi == "editor") else if (listi == "editor")
// Reminder: "addeditors" requires to init _current_ reference["editor"] // Reminder: "addeditors" requires to init _current_ reference["editor"]
_current_reference[listi] = parse("addeditors", rx.c _current_reference[listi] =
ap(ii), _current_reference.value(listi)); parse("addeditors", rx.cap(ii), _current_referen
ce.value(listi));
else if (listi == "title") else if (listi == "title")
// Reminder: "addtitle" requires to init _current_re ference["title"] // Reminder: "addtitle" requires to init _current_re ference["title"]
_current_reference[listi] = parse("addtitle", rx.cap (ii), _current_reference.value(listi)); _current_reference[listi] = parse("addtitle", rx.cap (ii), _current_reference.value(listi));
else else
_current_reference[listi] = parse(listi, rx.cap(ii)) ; _current_reference[listi] = parse(listi, rx.cap(ii)) ;
} }
} }
_current_reference.typeName = reftype; _current_reference.typeName = reftype;
currentReferenceUpdated(); currentReferenceUpdated();
_auto_recognized_string = tr("Processed as '%1'.").arg(line); _auto_recognized_string = tr("Processed as '%1'.").arg(line);
skipping to change at line 580 skipping to change at line 592
} }
file.close(); file.close();
// Heuristic Bib Parsing // Heuristic Bib Parsing
if (_settingsP->value("cb2Bib/DoHeuristicGuess").toBool()) if (_settingsP->value("cb2Bib/DoHeuristicGuess").toBool())
{ {
// Sometimes (if user is on tag mode) tag could be on otext. Revert tags here, just in case. // Sometimes (if user is on tag mode) tag could be on otext. Revert tags here, just in case.
const QString clean_text(removeTags(ottext)); const QString clean_text(removeTags(ottext));
_heuristic_parserP->guessFields(clean_text, ottext); _heuristic_parserP->guessFields(clean_text, ottext);
currentReferenceUpdated(); currentReferenceUpdated();
_auto_recognized_string = tr("Applied %1 filters: No automatic format de _auto_recognized_string =
tection. %2 fields guessed.") tr("Applied %1 filters: No automatic format detection. %2 fields gue
.arg(nfilters).arg(fieldCount()); ssed.").arg(nfilters).arg(fieldCount());
} }
else else
_auto_recognized_string = tr("Applied %1 filters: No automatic format de tection.").arg(nfilters); _auto_recognized_string = tr("Applied %1 filters: No automatic format de tection.").arg(nfilters);
emit statusMessage(_auto_recognized_string); emit statusMessage(_auto_recognized_string);
} }
void bibParser::checkRegExpFile(const QString& fn) void bibParser::checkRegExpFile(const QString& fn)
{ {
if (fn.isEmpty()) if (fn.isEmpty())
{ {
skipping to change at line 617 skipping to change at line 629
_heuristic_parserP->heuristicFields(clean_text); _heuristic_parserP->heuristicFields(clean_text);
_heuristic_parserP->guessFields(clean_text, tagged_text); _heuristic_parserP->guessFields(clean_text, tagged_text);
currentReferenceUpdated(); currentReferenceUpdated();
_auto_recognized_string = tr("%1 fields guessed.").arg(fieldCount()); _auto_recognized_string = tr("%1 fields guessed.").arg(fieldCount());
emit statusMessage(_auto_recognized_string); emit statusMessage(_auto_recognized_string);
} }
QString bibParser::setTags(const QString& text) const QString bibParser::setTags(const QString& text) const
{ {
QString tagged_text(text.trimmed()); QString tagged_text(text.trimmed());
tagged_text.replace(QRegExp("\\r\\n"), "<found_new_line>"); // Windows tagged_text.replace(QRegExp("\\r\\n"), "<found_new_line>"); // Windows new l
new line ine
tagged_text.replace(QRegExp("\\n"), "<found_new_line>"); // Linux n tagged_text.replace(QRegExp("\\n"), "<found_new_line>"); // Linux new lin
ew line, LF e, LF
tagged_text.replace(QRegExp("\\r"), "<found_new_line>"); // OSX new tagged_text.replace(QRegExp("\\r"), "<found_new_line>"); // OSX new line,
line, CR CR
QStringList spText(tagged_text.split("<found_new_line>")); QStringList spText(tagged_text.split("<found_new_line>"));
int n(spText.count()); int n(spText.count());
tagged_text.clear(); tagged_text.clear();
for (int i = 0; i < n - 1; i++) for (int i = 0; i < n - 1; i++)
tagged_text += spText.at(i) + QString("<NewLine%1>").arg(i + 1); tagged_text += spText.at(i) + QString("<NewLine%1>").arg(i + 1);
tagged_text += spText[n-1]; tagged_text += spText[n - 1];
spText = tagged_text.split(QRegExp("(\\s{10,}|\\t)")); spText = tagged_text.split(QRegExp("(\\s{10,}|\\t)"));
n = spText.count(); n = spText.count();
tagged_text.clear(); tagged_text.clear();
for (int i = 0; i < n - 1; i++) for (int i = 0; i < n - 1; i++)
tagged_text += spText.at(i) + QString("<Tab%1>").arg(i + 1); tagged_text += spText.at(i) + QString("<Tab%1>").arg(i + 1);
tagged_text += spText[n-1]; tagged_text += spText[n - 1];
tagged_text = c2bUtils::simplifyString(tagged_text); tagged_text = c2bUtils::simplifyString(tagged_text);
return tagged_text; return tagged_text;
} }
QString bibParser::removeTags(const QString& text) const QString bibParser::removeTags(const QString& text) const
{ {
QString clean(text); QString clean(text);
clean.remove("[["); clean.remove("[[");
clean.remove("]]"); clean.remove("]]");
clean.replace(QRegExp("<NewLine\\d+>"), " "); clean.replace(QRegExp("<NewLine\\d+>"), " ");
 End of changes. 26 change blocks. 
52 lines changed or deleted 62 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)