"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "src/c2b/authorString.cpp" between
cb2bib-1.9.9.tar.gz and cb2bib-2.0.0.tar.gz

About: cb2Bib is a multiplatform application for rapidly extracting unformatted or unstandardized bibliographic references from email alerts, journal Web pages, and PDF files.

authorString.cpp  (cb2bib-1.9.9):authorString.cpp  (cb2bib-2.0.0)
/*************************************************************************** /***************************************************************************
* Copyright (C) 2004-2018 by Pere Constans * Copyright (C) 2004-2019 by Pere Constans
* constans@molspaces.com * constans@molspaces.com
* cb2Bib version 1.9.9. Licensed under the GNU GPL version 3. * cb2Bib version 2.0.0. Licensed under the GNU GPL version 3.
* See the LICENSE file that comes with this distribution. * See the LICENSE file that comes with this distribution.
***************************************************************************/ ***************************************************************************/
#include "authorString.h" #include "authorString.h"
/** \page authorproc Processing of Author Names /** \page authorproc Processing of Author Names
cb2Bib automatically processes the author names string. It uses a set of cb2Bib automatically processes the author names string. It uses a set of
heuristic rules. First, the authors separator is identified. And second, it heuristic rules. First, the authors separator is identified. And second, it
is decided whether or not author names are in natural or reverse order, or is decided whether or not author names are in natural or reverse order, or
in the 'Abcd, E., F. Ghij, ...' mixed order. in the 'Abcd, E., F. Ghij, ...' mixed order.
*/ */
authorString::authorString() : _full_form(false) authorString::authorString() : _full_form(false) {}
{}
/** \page authorproc /** \page authorproc
Cleanup author string: Cleanup author string:
- Escape BibTeX to Unicode - Escape BibTeX to Unicode
- Remove digits from authors string - Remove digits from authors string
- Remove any character except <tt>-',;&\\.\\s\\w</tt> - Remove any character except <tt>-',;&\\.\\s\\w</tt>
skipping to change at line 80 skipping to change at line 79
QString separator; QString separator;
if (is_special_case) if (is_special_case)
separator = " and "; separator = " and ";
else if (has_comma && has_semicolon) else if (has_comma && has_semicolon)
separator = ';'; // Multiple Authors, separated by semicolon, reversed naming separator = ';'; // Multiple Authors, separated by semicolon, reversed naming
else if (has_comma) else if (has_comma)
{ {
if (is_first_reversed) if (is_first_reversed)
{ {
if (_author_string.contains(QRegExp('^' + author::name + ",(?:\\s*-{ if (_author_string.contains(
0,1}\\b\\w\\b\\.){1,3},\\s*" + author::name))) QRegExp('^' + author::name + ",(?:\\s*-{0,1}\\b\\w\\b\\.){1,
3},\\s*" + author::name)))
{ {
_author_string.replace(QRegExp("\\bJr.", Qt::CaseSensitive), "Jr "); _author_string.replace(QRegExp("\\bJr.", Qt::CaseSensitive), "Jr ");
_author_string.replace(".,", ".;"); _author_string.replace(".,", ".;");
separator = ';'; // Reversed, comma separated 'Abrahamsson, A.-L separator =
., Springett, J., Karlsson, L., Ottosson, T.' ';'; // Reversed, comma separated 'Abrahamsson, A.-L., Sprin
gett, J., Karlsson, L., Ottosson, T.'
is_string_reversed = true; is_string_reversed = true;
} }
else if (_author_string.contains(QRegExp('^' + author::name + ',' + author::initials + ','))) else if (_author_string.contains(QRegExp('^' + author::name + ',' + author::initials + ',')))
{ {
_author_string.replace(QRegExp("^([-'\\w]+),"), "\\1 "); _author_string.replace(QRegExp("^([-'\\w]+),"), "\\1 ");
separator = ','; // Mixed naming 'Smith, J.-L., R. Jones, and K. Gibbons' separator = ','; // Mixed naming 'Smith, J.-L., R. Jones, and K. Gibbons'
} }
else else
separator = " and "; // Reversed naming separator = " and "; // Reversed naming
} }
else if (has_ands) else if (has_ands)
separator = " and "; separator = " and ";
else // Natural naming else // Natural naming
separator = ','; separator = ',';
} }
else if (has_semicolon) else if (has_semicolon)
separator = ';'; // Multiple Authors, separated by semicolon separator = ';'; // Multiple Authors, separated by semicolon
else else
separator = " and "; separator = " and ";
c2bUtils::debug(QObject::tr("Separator: |%1|").arg(separator)); c2bUtils::debug(QObject::tr("Separator: |%1|").arg(separator));
c2bUtils::debug("1--|" + _author_string + '|'); c2bUtils::debug("1--|" + _author_string + '|');
_author_string.replace(QRegExp("\\band\\b", Qt::CaseInsensitive), separator) ; _author_string.replace(QRegExp("\\band\\b", Qt::CaseInsensitive), separator) ;
_author_string.replace(QRegExp("\\s&\\s", Qt::CaseInsensitive), separator); _author_string.replace(QRegExp("\\s&\\s", Qt::CaseInsensitive), separator);
c2bUtils::debug("2--|" + _author_string + '|'); c2bUtils::debug("2--|" + _author_string + '|');
_author_string.remove(QRegExp("[^\\w\\.]+$")); // Removing of duplicate commas and semicolons _author_string.remove(QRegExp("[^\\w\\.]+$")); // Removing of duplicate commas and semicolons
_author_string.replace(QRegExp(",\\s*"), ","); _author_string.replace(QRegExp(",\\s*"), ",");
c2bUtils::debug("3--|" + _author_string + '|'); c2bUtils::debug("3--|" + _author_string + '|');
_author_string.replace(QRegExp(",+"), ","); _author_string.replace(QRegExp(",+"), ",");
_author_string.replace(QRegExp(";\\s*"), ";"); _author_string.replace(QRegExp(";\\s*"), ";");
_author_string.replace(QRegExp(";+"), ";"); _author_string.replace(QRegExp(";+"), ";");
c2bUtils::debug("4--|" + _author_string + '|'); c2bUtils::debug("4--|" + _author_string + '|');
const bool are_authors_in_uppercase(containUpperCaseLetter(_author_string) & const bool are_authors_in_uppercase(containUpperCaseLetter(_author_string) &
& !containLowerCaseLetter(_author_string)); &
!containLowerCaseLetter(_author_string))
;
if (are_authors_in_uppercase) if (are_authors_in_uppercase)
c2bUtils::debug("Input Authors in Uppercase"); c2bUtils::debug("Input Authors in Uppercase");
QStringList authors; QStringList authors;
if (separator == " and ") if (separator == " and ")
authors = _author_string.split(QRegExp("\\band\\b")); authors = _author_string.split(QRegExp("\\band\\b"));
else else
authors = _author_string.split(separator); authors = _author_string.split(separator);
// Setting author ordering // Setting author ordering
const QString first_author(authors.first().trimmed()); const QString first_author(authors.first().trimmed());
skipping to change at line 141 skipping to change at line 143
const bool is_string_mixed(is_current_reversed && !is_last_reversed); const bool is_string_mixed(is_current_reversed && !is_last_reversed);
if (is_string_mixed) // Mixed naming 'Smith, J., R. Jones' if (is_string_mixed) // Mixed naming 'Smith, J., R. Jones'
c2bUtils::debug("Mixed order"); c2bUtils::debug("Mixed order");
// Process each author name // Process each author name
for (int ai = 0; ai < authors.count(); ++ai) for (int ai = 0; ai < authors.count(); ++ai)
{ {
QString author_i(authors.at(ai)); QString author_i(authors.at(ai));
c2bUtils::debug(author_i); c2bUtils::debug(author_i);
author_i.replace(QRegExp("\\.{0,1}\\s{0,1}-"), "-"); // Abbreviated cases, eg M.-H. Something author_i.replace(QRegExp("\\.{0,1}\\s{0,1}-"), "-"); // Abbreviated cases, eg M.-H. Something
author_i.replace(QRegExp("[^-'\\w,]"), " "); // Only these characters compose a name; keep commas author_i.replace(QRegExp("[^-'\\w,]"), " "); // Only these characters compose a name; keep commas
author_i = c2bUtils::simplifyString(author_i); author_i = c2bUtils::simplifyString(author_i);
// Split author name // Split author name
QStringList fore_name_parts; QStringList fore_name_parts;
QString last_name; QString last_name;
if (is_current_reversed) if (is_current_reversed)
{ {
const QStringList parts(author_i.split(',', QString::SkipEmptyParts) ); const QStringList parts(author_i.split(',', QString::SkipEmptyParts) );
const int nparts(parts.count()); const int nparts(parts.count());
if (nparts == 2) if (nparts == 2)
skipping to change at line 217 skipping to change at line 219
for (int l = 0; l < fore_length; ++l) for (int l = 0; l < fore_length; ++l)
author_name += fore_name[l] + ". "; author_name += fore_name[l] + ". ";
} }
else if (fore_name_parts.count() == 2 && fore_length > 1 && fore _length < 3 && is_current_reversed && else if (fore_name_parts.count() == 2 && fore_length > 1 && fore _length < 3 && is_current_reversed &&
!are_authors_in_uppercase && is_uppercase) !are_authors_in_uppercase && is_uppercase)
{ {
// Cases 'Last1 Last2, FST': Always abbreviated, no call to processFirstMiddle // Cases 'Last1 Last2, FST': Always abbreviated, no call to processFirstMiddle
for (int l = 0; l < fore_length; ++l) for (int l = 0; l < fore_length; ++l)
author_name += fore_name[l] + ". "; author_name += fore_name[l] + ". ";
} }
else if (i == 1 && fore_name_parts.count() == 2 && fore_length > else if (i == 1 && fore_name_parts.count() == 2 && fore_length >
1 && fore_length < 3 && !is_current_reversed && 1 && fore_length < 3 &&
!are_authors_in_uppercase && is_uppercase) !is_current_reversed && !are_authors_in_uppercase && is
_uppercase)
{ {
// Cases 'Fore IJ Last': Process initials // Cases 'Fore IJ Last': Process initials
for (int l = 0; l < fore_length; ++l) for (int l = 0; l < fore_length; ++l)
author_name += fore_name[l] + ". "; author_name += fore_name[l] + ". ";
} }
else else
author_name += processFirstMiddle(fore_name) + ' '; author_name += processFirstMiddle(fore_name) + ' ';
} }
} }
// Add last name // Add last name
skipping to change at line 272 skipping to change at line 274
proc_fm = first_middle + '.'; proc_fm = first_middle + '.';
} }
else else
{ {
if (first_middle.contains('_')) // Composite names should not be abbreviated if (first_middle.contains('_')) // Composite names should not be abbreviated
{ {
proc_fm = capitalize(first_middle); proc_fm = capitalize(first_middle);
if (first_middle.length() - first_middle.indexOf('_') == 2) if (first_middle.length() - first_middle.indexOf('_') == 2)
proc_fm += '.'; proc_fm += '.';
} }
else else if (first_middle.length() > 0)
proc_fm = first_middle.left(1) + '.'; proc_fm = first_middle.at(0) + '.';
} }
return proc_fm; return proc_fm;
} }
QString authorString::capitalize(const QString& name) QString authorString::capitalize(const QString& name)
{ {
// Capitalizes author's name // Capitalizes author's name
if (name.isEmpty()) if (name.isEmpty())
return QString(); return QString();
QString proc_name(name); QString proc_name(name);
skipping to change at line 351 skipping to change at line 353
const QString Last(rRevNISI.cap(3)); const QString Last(rRevNISI.cap(3));
if (Last != "and") if (Last != "and")
return false; return false;
} }
rRevNISI = QRegExp("^([-'\\w]+) ([-\\w]{1,3})$"); // Consider only 1 to 3 initials rRevNISI = QRegExp("^([-'\\w]+) ([-\\w]{1,3})$"); // Consider only 1 to 3 initials
rRevNISI.setMinimal(false); rRevNISI.setMinimal(false);
if (rRevNISI.indexIn(author_line) > -1) if (rRevNISI.indexIn(author_line) > -1)
{ {
const QString Last(rRevNISI.cap(1)); const QString Last(rRevNISI.cap(1));
const QString First(rRevNISI.cap(2)); const QString First(rRevNISI.cap(2));
c2bUtils::debug(QObject::tr("ISI: |%1| |%2|").arg(Last).arg(First)); c2bUtils::debug(QObject::tr("ISI: |%1| |%2|").arg(Last, First));
if (containLowerCaseLetter(First)) if (containLowerCaseLetter(First))
return false; return false;
if (!containLowerCaseLetter(Last)) if (!containLowerCaseLetter(Last))
return false; return false;
return true; return true;
} }
return false; return false;
} }
bool authorString::containLowerCaseLetter(const QString& author) bool authorString::containLowerCaseLetter(const QString& author)
{ {
QString author_line(author); QString author_line(author);
author_line.remove(QRegExp("\\band\\b")); // Remove possible 'and' separator author_line.remove(QRegExp("\\band\\b")); // Remove possible 'and' separator
author_line.remove(QRegExp(author::prefixes + '_', Qt::CaseInsensitive)); // Remove possible prefixes author_line.remove(QRegExp(author::prefixes + '_', Qt::CaseInsensitive)); // Remove possible prefixes
author_line.remove(QRegExp(author::double_initials + '_', Qt::CaseSensitive) author_line.remove(
); // Remove possible two-letter initials QRegExp(author::double_initials + '_', Qt::CaseSensitive)); // Remove po
ssible two-letter initials
for (int i = 0; i < author_line.length(); i++) for (int i = 0; i < author_line.length(); i++)
{ {
if (author_line.at(i).isLetter()) if (author_line.at(i).isLetter())
if (author_line.at(i).category() == QChar::Letter_Lowercase) if (author_line.at(i).category() == QChar::Letter_Lowercase)
return true; return true;
} }
return false; return false;
} }
bool authorString::containUpperCaseLetter(const QString& author) bool authorString::containUpperCaseLetter(const QString& author)
skipping to change at line 546 skipping to change at line 549
else if (w.at(0) == ':') else if (w.at(0) == ':')
code += ':'; code += ':';
else if (w.at(0) == '|') else if (w.at(0) == '|')
code += 'L'; code += 'L';
else else
code += 'o'; code += 'o';
} }
scapePattern("aL+[nN]{1,2}"); scapePattern("aL+[nN]{1,2}");
scapePattern("a[nNw]&L+[nN]{1,2}"); // in Linear and / Sublinear Time scapePattern("a[nNw]&L+[nN]{1,2}"); // in Linear and / Sublinear Time
scapePattern(":L+[InN]{1,2}"); // ... Structure Classification: / A Survey scapePattern(":L+[InN]{1,2}"); // ... Structure Classification: / A Sur
scapePattern("[nN]*&L[nN]L"); // Not an & for author vey
scapePattern("[nN]*&L[nN]L"); // Not an & for author
} }
QString encoder::decoded(const int position, const int length) const QString encoder::decoded(const int position, const int length) const
{ {
if (position < 0) if (position < 0)
return QString(); return QString();
if (length < 1 || position + length > fragments.count()) if (length < 1 || position + length > fragments.count())
return QString(); return QString();
QString d(fragments.at(position)); QString d(fragments.at(position));
for (int i = 1; i < length; ++i) for (int i = 1; i < length; ++i)
skipping to change at line 613 skipping to change at line 616
const int ws(w.size()); const int ws(w.size());
if (ws < 2) if (ws < 2)
return false; return false;
if (ws > 6) if (ws > 6)
return false; return false;
const QByteArray ba(w.toLatin1()); const QByteArray ba(w.toLatin1());
const char* s = ba.data(); const char* s = ba.data();
const int ss(ws * int(sizeof(char))); const int ss(ws * int(sizeof(char)));
if (ws == 2) if (ws == 2)
{ {
if (memcmp("of", s, ss) == 0) return true; if (memcmp("of", s, ss) == 0)
if (memcmp("on", s, ss) == 0) return true; return true;
if (memcmp("to", s, ss) == 0) return true; if (memcmp("on", s, ss) == 0)
if (memcmp("in", s, ss) == 0) return true; return true;
if (memcmp("as", s, ss) == 0) return true; if (memcmp("to", s, ss) == 0)
if (memcmp("vs", s, ss) == 0) return true; return true;
if (memcmp("at", s, ss) == 0) return true; if (memcmp("in", s, ss) == 0)
if (memcmp("is", s, ss) == 0) return true; return true;
if (memcmp("an", s, ss) == 0) return true; if (memcmp("as", s, ss) == 0)
return true;
if (memcmp("vs", s, ss) == 0)
return true;
if (memcmp("at", s, ss) == 0)
return true;
if (memcmp("is", s, ss) == 0)
return true;
if (memcmp("an", s, ss) == 0)
return true;
} }
if (ws == 3) if (ws == 3)
{ {
if (memcmp("for", s, ss) == 0) return true; if (memcmp("for", s, ss) == 0)
if (memcmp("but", s, ss) == 0) return true; return true;
if (memcmp("are", s, ss) == 0) return true; if (memcmp("but", s, ss) == 0)
if (memcmp("its", s, ss) == 0) return true; return true;
if (memcmp("the", s, ss) == 0) return true; if (memcmp("are", s, ss) == 0)
return true;
if (memcmp("its", s, ss) == 0)
return true;
if (memcmp("the", s, ss) == 0)
return true;
} }
if (ws == 4) if (ws == 4)
{ {
if (memcmp("from", s, ss) == 0) return true; if (memcmp("from", s, ss) == 0)
if (memcmp("with", s, ss) == 0) return true; return true;
if (memcmp("into", s, ss) == 0) return true; if (memcmp("with", s, ss) == 0)
return true;
if (memcmp("into", s, ss) == 0)
return true;
} }
if (ws == 6) if (ws == 6)
{ {
if (memcmp("within", s, ss) == 0) return true; if (memcmp("within", s, ss) == 0)
return true;
} }
return false; return false;
} }
} // namespace author } // namespace author
 End of changes. 18 change blocks. 
41 lines changed or deleted 65 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)