"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "src/c2b/heuristicBibParser.cpp" between
cb2bib-1.9.9.tar.gz and cb2bib-2.0.0.tar.gz

About: cb2Bib is a multiplatform application for rapidly extracting unformatted or unstandardized bibliographic references from email alerts, journal Web pages, and PDF files.

heuristicBibParser.cpp  (cb2bib-1.9.9):heuristicBibParser.cpp  (cb2bib-2.0.0)
/*************************************************************************** /***************************************************************************
* Copyright (C) 2004-2018 by Pere Constans * Copyright (C) 2004-2019 by Pere Constans
* constans@molspaces.com * constans@molspaces.com
* cb2Bib version 1.9.9. Licensed under the GNU GPL version 3. * cb2Bib version 2.0.0. Licensed under the GNU GPL version 3.
* See the LICENSE file that comes with this distribution. * See the LICENSE file that comes with this distribution.
***************************************************************************/ ***************************************************************************/
#include "heuristicBibParser.h" #include "heuristicBibParser.h"
#include "bibParser.h" #include "bibParser.h"
#include "journalDB.h" #include "journalDB.h"
#include <QDate> #include <QDate>
#include <cmath> #include <cmath>
heuristicBibParser::heuristicBibParser(bibParser* bp) : heuristicBibParser::heuristicBibParser(bibParser* bp)
_bpP(bp), : _bpP(bp),
_current_reference(bp->_current_reference), _current_reference(bp->_current_reference),
_reliable_number(false), _reliable_number(false),
_reliable_pages(false), _reliable_pages(false),
_reliable_volume(false), _reliable_volume(false),
_abstract("abstract"), _abstract("abstract"),
_addauthors("addauthors"), _addauthors("addauthors"),
_author("author"), _author("author"),
_isbn("isbn"), _isbn("isbn"),
_keywords("keywords"), _keywords("keywords"),
_number("number"), _number("number"),
_pages("pages"), _pages("pages"),
_title("title"), _title("title"),
_volume("volume"), _volume("volume"),
_year("year"), _year("year"),
_bibliographic_fields(bp->_bibliographic_fields), _bibliographic_fields(bp->_bibliographic_fields),
_journal_db(*bp->_journal_dbP) _journal_db(*bp->_journal_dbP)
{ {
_leading_non_letters = QRegExp("^[^<\\w]+"); _leading_non_letters = QRegExp("^[^<\\w]+");
// Char 65533 appears many times in pdftotext outputs on windows (seems an encoding error, though) // Char 65533 appears many times in pdftotext outputs on windows (seems an encoding error, though)
_hyphens = QRegExp("\\s*[-" + QString(QChar(8211)) + QString(QChar(8722)) + QString(QChar(65533)) + "]+\\s*"); _hyphens = QRegExp("\\s*[-" + QString(QChar(8211)) + QString(QChar(8722)) + QString(QChar(65533)) + "]+\\s*");
_hyphen_nums = "(?:\\d+|\\d+-\\d+)"; _hyphen_nums = "(?:\\d+|\\d+-\\d+)";
_hyphen_pages = "(?:\\d+|\\d+-\\d+p{0,2}|\\d+ \\d+pp)"; _hyphen_pages = "(?:\\d+|\\d+-\\d+p{0,2}|\\d+ \\d+pp)";
_max_year = 3 + QDate::currentDate().year(); _max_year = 3 + QDate::currentDate().year();
// Setup author recognition // Setup author recognition
// Implementation of author field extraction // Implementation of author field extraction
// P. Constans. A Simple Extraction Procedure for Bibliographical Author Field. // P. Constans. A Simple Extraction Procedure for Bibliographical Author Field.
// arXiv:0902.0755, 2009. // arXiv:0902.0755, 2009.
_word_prefix_lexicon = c2bUtils::fileToString(":/txt/txt/word_prefix_lexicon _word_prefix_lexicon =
.txt").split(c2bUtils::nonLetter, c2bUtils::fileToString(":/txt/txt/word_prefix_lexicon.txt").split(c2bUti
QString::SkipEmptyParts); ls::nonLetter, QString::SkipEmptyParts);
for (int i = 0; i < _word_prefix_lexicon.count(); ++i) for (int i = 0; i < _word_prefix_lexicon.count(); ++i)
{ {
_word_prefix_lexicon[i].replace('_', ' '); _word_prefix_lexicon[i].replace('_', ' ');
_word_prefix_lexicon[i].squeeze(); _word_prefix_lexicon[i].squeeze();
} }
const QString author_lc_t("(?:n%1n|n%1%1n|%1n|%1nn|%1%1n|%1%1%1n|n%1nn|nn%1n |%1%1nn|%1n%1n|nn|nnn)"); const QString author_lc_t("(?:n%1n|n%1%1n|%1n|%1nn|%1%1n|%1%1%1n|n%1nn|nn%1n |%1%1nn|%1n%1n|nn|nnn)");
const QString author_uc_t("(?:[nN]%1N|[nN]%1%1N|%1N|%1[nN]N|%1%1N|%1%1%1N|[n const QString author_uc_t(
N]%1[nN]N|[nN]N%1N|%1%1[nN]N|%1[nN]%1N|[nN]N|[nN][nN]N)"); "(?:[nN]%1N|[nN]%1%1N|%1N|%1[nN]N|%1%1N|%1%1%1N|[nN]%1[nN]N|[nN]N%1N|%1%
1[nN]N|%1[nN]%1N|[nN]N|[nN][nN]N)");
const QString author_initial("Ip{0,1}"); const QString author_initial("Ip{0,1}");
const QString author_line("L%1(?:[,;&L]+%1)*(?=L)"); const QString author_line("L%1(?:[,;&L]+%1)*(?=L)");
const QString author_lc(author_line.arg(author_lc_t.arg(author_initial))); const QString author_lc(author_line.arg(author_lc_t.arg(author_initial)));
const QString author_uc(author_line.arg(author_uc_t.arg(author_initial))); const QString author_uc(author_line.arg(author_uc_t.arg(author_initial)));
const QString author_address("(?:L[^L]*)"); const QString author_address("(?:L[^L]*)");
_author_sb = QRegExp(QString("L%1(?:[,;&]+L{0,2}%1)*[,;&L]*L").arg(author_lc _t.arg(author_initial)), _author_sb = QRegExp(QString("L%1(?:[,;&]+L{0,2}%1)*[,;&L]*L").arg(author_lc _t.arg(author_initial)),
Qt::CaseInsensitive, QRegExp::RegExp); Qt::CaseInsensitive, QRegExp::RegExp);
_author_sb_lc = QRegExp(QString("L%1(?:[,;&L]+%1)*(?=L)").arg(author_lc_t.ar _author_sb_lc = QRegExp(QString("L%1(?:[,;&L]+%1)*(?=L)").arg(author_lc_t.ar
g(author_initial)), g(author_initial)), Qt::CaseSensitive,
Qt::CaseSensitive, QRegExp::RegExp); QRegExp::RegExp);
_author_sb_uc = QRegExp(QString("L%1(?:[,;&L]+%1)*(?=L)").arg(author_uc_t.ar _author_sb_uc = QRegExp(QString("L%1(?:[,;&L]+%1)*(?=L)").arg(author_uc_t.ar
g(author_initial)), g(author_initial)), Qt::CaseSensitive,
Qt::CaseSensitive, QRegExp::RegExp); QRegExp::RegExp);
_author_sb.setMinimal(false); _author_sb.setMinimal(false);
_author_sb_lc.setMinimal(false); _author_sb_lc.setMinimal(false);
_author_sb_uc.setMinimal(false); _author_sb_uc.setMinimal(false);
// Note: Syntax must be RegExp and not RegExp2. // Note: Syntax must be RegExp and not RegExp2.
// If not, in cases as 'LnnL ... Lnn,nn,Lnn,&nnL' it will overextend addresses to include Lnn,nn,L. // If not, in cases as 'LnnL ... Lnn,nn,Lnn,&nnL' it will overextend addresses to include Lnn,nn,L.
// This is not related to minimal/greedy. // This is not related to minimal/greedy.
// Note also that Lnn,nn,L will not be taken, unless it is followed by another author line. This is a feature. // Note also that Lnn,nn,L will not be taken, unless it is followed by another author line. This is a feature.
_author_b2_lc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)").arg(author_lc).arg _author_b2_lc_rx =
(author_address), new QRegExp(QString("(%1)%2{0,7}(%1)").arg(author_lc, author_address), Q
t::CaseSensitive, QRegExp::RegExp);
_author_b2_uc_rx =
new QRegExp(QString("(%1)%2{0,7}(%1)").arg(author_uc, author_address), Q
t::CaseSensitive, QRegExp::RegExp);
_author_b3_lc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)").arg(aut
hor_lc, author_address),
Qt::CaseSensitive, QRegExp::RegExp); Qt::CaseSensitive, QRegExp::RegExp);
_author_b2_uc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)").arg(author_uc).arg (author_address), _author_b3_uc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)").arg(aut hor_uc, author_address),
Qt::CaseSensitive, QRegExp::RegExp); Qt::CaseSensitive, QRegExp::RegExp);
_author_b3_lc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)").arg(aut hor_lc).arg(author_address), _author_b4_lc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1 )").arg(author_lc, author_address),
Qt::CaseSensitive, QRegExp::RegExp); Qt::CaseSensitive, QRegExp::RegExp);
_author_b3_uc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)").arg(aut _author_b4_uc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1
hor_uc).arg(author_address), )").arg(author_uc, author_address),
Qt::CaseSensitive, QRegExp::RegExp);
_author_b4_lc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1
)").arg(author_lc).arg(author_address),
Qt::CaseSensitive, QRegExp::RegExp);
_author_b4_uc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1
)").arg(author_uc).arg(author_address),
Qt::CaseSensitive, QRegExp::RegExp);
_author_b5_lc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1
)%2{0,7}(%1)").arg(author_lc).arg(author_address),
Qt::CaseSensitive, QRegExp::RegExp);
_author_b5_uc_rx = new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1
)%2{0,7}(%1)").arg(author_uc).arg(author_address),
Qt::CaseSensitive, QRegExp::RegExp); Qt::CaseSensitive, QRegExp::RegExp);
_author_b5_lc_rx =
new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1)").
arg(author_lc, author_address),
Qt::CaseSensitive, QRegExp::RegExp);
_author_b5_uc_rx =
new QRegExp(QString("(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1)%2{0,7}(%1)").
arg(author_uc, author_address),
Qt::CaseSensitive, QRegExp::RegExp);
_author_b2_lc_rx->setMinimal(true); _author_b2_lc_rx->setMinimal(true);
_author_b2_uc_rx->setMinimal(true); _author_b2_uc_rx->setMinimal(true);
_author_b3_lc_rx->setMinimal(true); _author_b3_lc_rx->setMinimal(true);
_author_b3_uc_rx->setMinimal(true); _author_b3_uc_rx->setMinimal(true);
_author_b4_lc_rx->setMinimal(true); _author_b4_lc_rx->setMinimal(true);
_author_b4_uc_rx->setMinimal(true); _author_b4_uc_rx->setMinimal(true);
_author_b5_lc_rx->setMinimal(true); _author_b5_lc_rx->setMinimal(true);
_author_b5_uc_rx->setMinimal(true); _author_b5_uc_rx->setMinimal(true);
} }
skipping to change at line 185 skipping to change at line 188
*/ */
void heuristicBibParser::guessAbstract(const QString& text) void heuristicBibParser::guessAbstract(const QString& text)
{ {
// Check whether text might come from a web page or from a PDF with new lines // Check whether text might come from a web page or from a PDF with new lines
// Set line length to 93 // Set line length to 93
// Check first for abstract, check later for summary // Check first for abstract, check later for summary
if (text.contains("Abstract", Qt::CaseInsensitive)) if (text.contains("Abstract", Qt::CaseInsensitive))
{ {
const QString crl(QChar(169)); // Avoid abstract overextending to copyri ght line. const QString crl(QChar(169)); // Avoid abstract overextending to copyri ght line.
QRegExp rxH("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Abstract[\\s\\.:]*<NewLine\\ QRegExp rxH("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Abstract[\\s\\.:]*<NewLine\\
d+>(.+)(?:" + crl + "|<NewLine|$)", Qt::CaseInsensitive); d+>(.+)(?:" + crl + "|<NewLine|$)",
Qt::CaseInsensitive);
rxH.setMinimal(true); rxH.setMinimal(true);
int nH(rxH.indexIn(text)); int nH(rxH.indexIn(text));
if (nH > -1) if (nH > -1)
if (rxH.cap(1).length() > 93) if (rxH.cap(1).length() > 93)
{ {
const QString val(rxH.cap(1).remove(_leading_non_letters)); const QString val(rxH.cap(1).remove(_leading_non_letters));
_current_reference[_abstract] = _bpP->parse(_abstract, val); _current_reference[_abstract] = _bpP->parse(_abstract, val);
return; return;
} }
rxH = QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Abstract[\\s\\.:]*<NewLine rxH = QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Abstract[\\s\\.:]*<NewLine
\\d+>(.+)(?:" + crl + "|<NewLine\\d+>\\s*<NewLine|$)", \\d+>(.+)(?:" + crl +
"|<NewLine\\d+>\\s*<NewLine|$)",
Qt::CaseInsensitive); Qt::CaseInsensitive);
rxH.setMinimal(true); rxH.setMinimal(true);
nH = rxH.indexIn(text); nH = rxH.indexIn(text);
if (nH > -1) if (nH > -1)
{ {
const QString val(rxH.cap(1).remove(_leading_non_letters)); const QString val(rxH.cap(1).remove(_leading_non_letters));
_current_reference[_abstract] = _bpP->parse(_abstract, val); _current_reference[_abstract] = _bpP->parse(_abstract, val);
return; return;
} }
rxH = QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Abstract\\b(.+)(?:" + crl + "|<NewLine|$)", Qt::CaseInsensitive); rxH = QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Abstract\\b(.+)(?:" + crl + "|<NewLine|$)", Qt::CaseInsensitive);
rxH.setMinimal(true); rxH.setMinimal(true);
nH = rxH.indexIn(text); nH = rxH.indexIn(text);
if (nH > -1) if (nH > -1)
if (rxH.cap(1).length() > 93) if (rxH.cap(1).length() > 93)
{ {
const QString val(rxH.cap(1).remove(_leading_non_letters)); const QString val(rxH.cap(1).remove(_leading_non_letters));
_current_reference[_abstract] = _bpP->parse(_abstract, val); _current_reference[_abstract] = _bpP->parse(_abstract, val);
return; return;
} }
rxH = QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Abstract\\b(.+)(?:" + crl rxH = QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Abstract\\b(.+)(?:" + crl
+ "|<NewLine\\d+>\\s*<NewLine|$)", Qt::CaseInsensitive); + "|<NewLine\\d+>\\s*<NewLine|$)",
Qt::CaseInsensitive);
rxH.setMinimal(true); rxH.setMinimal(true);
nH = rxH.indexIn(text); nH = rxH.indexIn(text);
if (nH > -1) if (nH > -1)
{ {
const QString val(rxH.cap(1).remove(_leading_non_letters)); const QString val(rxH.cap(1).remove(_leading_non_letters));
_current_reference[_abstract] = _bpP->parse(_abstract, val); _current_reference[_abstract] = _bpP->parse(_abstract, val);
return; return;
} }
} }
if (text.contains("Summary", Qt::CaseInsensitive)) if (text.contains("Summary", Qt::CaseInsensitive))
{ {
QRegExp rxH("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Summary[\\s\\.:]*<NewLine\\d QRegExp rxH("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Summary[\\s\\.:]*<NewLine\\d
+>(.+)(?:<NewLine|$)", Qt::CaseInsensitive); +>(.+)(?:<NewLine|$)",
Qt::CaseInsensitive);
rxH.setMinimal(true); rxH.setMinimal(true);
int nH(rxH.indexIn(text)); int nH(rxH.indexIn(text));
if (nH > -1) if (nH > -1)
if (rxH.cap(1).length() > 93) if (rxH.cap(1).length() > 93)
{ {
const QString val(rxH.cap(1).remove(_leading_non_letters)); const QString val(rxH.cap(1).remove(_leading_non_letters));
_current_reference[_abstract] = _bpP->parse(_abstract, val); _current_reference[_abstract] = _bpP->parse(_abstract, val);
return; return;
} }
rxH = QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Summary[\\s\\.:]*<NewLine\ rxH =
\d+>(.+)(?:<NewLine\\d+>\\s*<NewLine|$)", QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Summary[\\s\\.:]*<NewLine\\d
Qt::CaseInsensitive); +>(.+)(?:<NewLine\\d+>\\s*<NewLine|$)",
Qt::CaseInsensitive);
rxH.setMinimal(true); rxH.setMinimal(true);
nH = rxH.indexIn(text); nH = rxH.indexIn(text);
if (nH > -1) if (nH > -1)
{ {
const QString val(rxH.cap(1).remove(_leading_non_letters)); const QString val(rxH.cap(1).remove(_leading_non_letters));
_current_reference[_abstract] = _bpP->parse(_abstract, val); _current_reference[_abstract] = _bpP->parse(_abstract, val);
return; return;
} }
rxH = QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Summary\\b(.+)(?:<NewLine| $)", Qt::CaseInsensitive); rxH = QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Summary\\b(.+)(?:<NewLine| $)", Qt::CaseInsensitive);
rxH.setMinimal(true); rxH.setMinimal(true);
nH = rxH.indexIn(text); nH = rxH.indexIn(text);
if (nH > -1) if (nH > -1)
if (rxH.cap(1).length() > 93) if (rxH.cap(1).length() > 93)
{ {
const QString val(rxH.cap(1).remove(_leading_non_letters)); const QString val(rxH.cap(1).remove(_leading_non_letters));
_current_reference[_abstract] = _bpP->parse(_abstract, val); _current_reference[_abstract] = _bpP->parse(_abstract, val);
return; return;
} }
rxH = QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Summary\\b(.+)(?:<NewLine\ rxH =
\d+>\\s*<NewLine|$)", Qt::CaseInsensitive); QRegExp("(?:<NewLine\\d+>|<Tab\\d+>)\\s*Summary\\b(.+)(?:<NewLine\\d
+>\\s*<NewLine|$)", Qt::CaseInsensitive);
rxH.setMinimal(true); rxH.setMinimal(true);
nH = rxH.indexIn(text); nH = rxH.indexIn(text);
if (nH > -1) if (nH > -1)
{ {
const QString val(rxH.cap(1).remove(_leading_non_letters)); const QString val(rxH.cap(1).remove(_leading_non_letters));
_current_reference[_abstract] = _bpP->parse(_abstract, val); _current_reference[_abstract] = _bpP->parse(_abstract, val);
return; return;
} }
} }
} }
skipping to change at line 291 skipping to change at line 300
// Prepare input stream // Prepare input stream
QString simplified_text("| " + tagged_text + " |"); QString simplified_text("| " + tagged_text + " |");
simplified_text.replace('|', ' '); simplified_text.replace('|', ' ');
simplified_text = "| " + simplified_text + " |"; simplified_text = "| " + simplified_text + " |";
simplified_text.replace(QRegExp("<NewLine\\d+>"), " | "); simplified_text.replace(QRegExp("<NewLine\\d+>"), " | ");
simplified_text.replace(QRegExp("(\\w)<Tab\\d+>(\\w)"), "\\1 , \\2"); simplified_text.replace(QRegExp("(\\w)<Tab\\d+>(\\w)"), "\\1 , \\2");
simplified_text.replace(QRegExp("<Tab\\d+>"), " "); simplified_text.replace(QRegExp("<Tab\\d+>"), " ");
simplified_text.remove(QRegExp("author.{0,3}:{0,1}", Qt::CaseInsensitive)); simplified_text.remove(QRegExp("author.{0,3}:{0,1}", Qt::CaseInsensitive));
simplified_text.replace(QRegExp("(\\w{4,10})[a-z]\\)", Qt::CaseSensitive), " simplified_text.replace(QRegExp("(\\w{4,10})[a-z]\\)", Qt::CaseSensitive),
\\1 "); // Remove superscript, e. g. LASTNAMEa "\\1 "); // Remove superscript, e. g. LASTNAMEa
simplified_text.replace(QRegExp("([A-Z]{2,10})[a-z]\\b", Qt::CaseSensitive), simplified_text.replace(QRegExp("([A-Z]{2,10})[a-z]\\b", Qt::CaseSensitive),
"\\1 "); // Remove superscript, e. g. LASTNAMEa) "\\1 "); // Remove superscript, e. g. LASTNAMEa)
simplified_text.replace(_hyphens, "-"); simplified_text.replace(_hyphens, "-");
simplified_text.replace(QChar(183), ','); // Fancy author separator simplified_text.replace(QChar(183), ','); // Fancy author sepa rator
simplified_text.replace(' ' + QChar(198) + ' ', " , "); // Fancy author sepa rator simplified_text.replace(' ' + QChar(198) + ' ', " , "); // Fancy author sepa rator
simplified_text.replace(QChar(8226), ','); // Fancy author separator simplified_text.replace(QChar(8226), ','); // Fancy author sepa
simplified_text.replace(QChar(178), ' '); // Superscript dagger as sometimes rator
translated by pdftotext simplified_text.replace(QChar(178), ' '); // Superscript dagge
r as sometimes translated by pdftotext
author::simplifyString(simplified_text); author::simplifyString(simplified_text);
// Capitalize and encode // Capitalize and encode
for (int i = 0; i < _word_prefix_lexicon.count(); ++i) for (int i = 0; i < _word_prefix_lexicon.count(); ++i)
{ {
const QString& wp = _word_prefix_lexicon.at(i); const QString& wp = _word_prefix_lexicon.at(i);
simplified_text.replace(wp, wp, Qt::CaseInsensitive); simplified_text.replace(wp, wp, Qt::CaseInsensitive);
} }
simplified_text.replace(" by ", " "); simplified_text.replace(" by ", " ");
_aencoder.encode(simplified_text); _aencoder.encode(simplified_text);
skipping to change at line 337 skipping to change at line 348
// from a faulty plain text conversion. // from a faulty plain text conversion.
author = _bpP->parse(_author, _aencoder.decoded(_author_sb.pos(0), _auth or_sb.matchedLength())); author = _bpP->parse(_author, _aencoder.decoded(_author_sb.pos(0), _auth or_sb.matchedLength()));
return author; return author;
} }
QString heuristicBibParser::guessAuthor_multi_block() QString heuristicBibParser::guessAuthor_multi_block()
{ {
QString author; QString author;
if (_author_b5_uc_rx->indexIn(_aencoder.code) > -1) if (_author_b5_uc_rx->indexIn(_aencoder.code) > -1)
for (int i = 1; i < _author_b5_uc_rx->capturedTexts().count(); ++i) for (int i = 1; i < _author_b5_uc_rx->capturedTexts().count(); ++i)
author = _bpP->parse(_addauthors, _aencoder.decoded(_author_b5_uc_rx author = _bpP->parse(
->pos(i), _author_b5_uc_rx->cap(i).length()), _addauthors, _aencoder.decoded(_author_b5_uc_rx->pos(i)
author); , _author_b5_uc_rx->cap(i).length()), author);
else if (_author_b5_lc_rx->indexIn(_aencoder.code) > -1) else if (_author_b5_lc_rx->indexIn(_aencoder.code) > -1)
for (int i = 1; i < _author_b5_lc_rx->capturedTexts().count(); ++i) for (int i = 1; i < _author_b5_lc_rx->capturedTexts().count(); ++i)
author = _bpP->parse(_addauthors, _aencoder.decoded(_author_b5_lc_rx author = _bpP->parse(
->pos(i), _author_b5_lc_rx->cap(i).length()), _addauthors, _aencoder.decoded(_author_b5_lc_rx->pos(i)
author); , _author_b5_lc_rx->cap(i).length()), author);
else if (_author_b4_uc_rx->indexIn(_aencoder.code) > -1) else if (_author_b4_uc_rx->indexIn(_aencoder.code) > -1)
for (int i = 1; i < _author_b4_uc_rx->capturedTexts().count(); ++i) for (int i = 1; i < _author_b4_uc_rx->capturedTexts().count(); ++i)
author = _bpP->parse(_addauthors, _aencoder.decoded(_author_b4_uc_rx author = _bpP->parse(
->pos(i), _author_b4_uc_rx->cap(i).length()), _addauthors, _aencoder.decoded(_author_b4_uc_rx->pos(i)
author); , _author_b4_uc_rx->cap(i).length()), author);
else if (_author_b4_lc_rx->indexIn(_aencoder.code) > -1) else if (_author_b4_lc_rx->indexIn(_aencoder.code) > -1)
for (int i = 1; i < _author_b4_lc_rx->capturedTexts().count(); ++i) for (int i = 1; i < _author_b4_lc_rx->capturedTexts().count(); ++i)
author = _bpP->parse(_addauthors, _aencoder.decoded(_author_b4_lc_rx author = _bpP->parse(
->pos(i), _author_b4_lc_rx->cap(i).length()), _addauthors, _aencoder.decoded(_author_b4_lc_rx->pos(i)
author); , _author_b4_lc_rx->cap(i).length()), author);
else if (_author_b3_uc_rx->indexIn(_aencoder.code) > -1) else if (_author_b3_uc_rx->indexIn(_aencoder.code) > -1)
for (int i = 1; i < _author_b3_uc_rx->capturedTexts().count(); ++i) for (int i = 1; i < _author_b3_uc_rx->capturedTexts().count(); ++i)
author = _bpP->parse(_addauthors, _aencoder.decoded(_author_b3_uc_rx author = _bpP->parse(
->pos(i), _author_b3_uc_rx->cap(i).length()), _addauthors, _aencoder.decoded(_author_b3_uc_rx->pos(i)
author); , _author_b3_uc_rx->cap(i).length()), author);
else if (_author_b3_lc_rx->indexIn(_aencoder.code) > -1) else if (_author_b3_lc_rx->indexIn(_aencoder.code) > -1)
for (int i = 1; i < _author_b3_lc_rx->capturedTexts().count(); ++i) for (int i = 1; i < _author_b3_lc_rx->capturedTexts().count(); ++i)
author = _bpP->parse(_addauthors, _aencoder.decoded(_author_b3_lc_rx author = _bpP->parse(
->pos(i), _author_b3_lc_rx->cap(i).length()), _addauthors, _aencoder.decoded(_author_b3_lc_rx->pos(i)
author); , _author_b3_lc_rx->cap(i).length()), author);
else if (_author_b2_uc_rx->indexIn(_aencoder.code) > -1) else if (_author_b2_uc_rx->indexIn(_aencoder.code) > -1)
for (int i = 1; i < _author_b2_uc_rx->capturedTexts().count(); ++i) for (int i = 1; i < _author_b2_uc_rx->capturedTexts().count(); ++i)
author = _bpP->parse(_addauthors, _aencoder.decoded(_author_b2_uc_rx author = _bpP->parse(
->pos(i), _author_b2_uc_rx->cap(i).length()), _addauthors, _aencoder.decoded(_author_b2_uc_rx->pos(i)
author); , _author_b2_uc_rx->cap(i).length()), author);
else if (_author_b2_lc_rx->indexIn(_aencoder.code) > -1) else if (_author_b2_lc_rx->indexIn(_aencoder.code) > -1)
for (int i = 1; i < _author_b2_lc_rx->capturedTexts().count(); ++i) for (int i = 1; i < _author_b2_lc_rx->capturedTexts().count(); ++i)
author = _bpP->parse(_addauthors, _aencoder.decoded(_author_b2_lc_rx author = _bpP->parse(
->pos(i), _author_b2_lc_rx->cap(i).length()), _addauthors, _aencoder.decoded(_author_b2_lc_rx->pos(i)
author); , _author_b2_lc_rx->cap(i).length()), author);
return author; return author;
} }
int heuristicBibParser::authorCount(const QString& authors) int heuristicBibParser::authorCount(const QString& authors)
{ {
if (authors.isEmpty()) if (authors.isEmpty())
return 0; return 0;
return 1 + authors.count(" and "); return 1 + authors.count(" and ");
} }
/** \page heuristics /** \page heuristics
- <b>Keywords</b> - <b>Keywords</b>
- If <tt>Key\\s{0,1}words\b</tt> is found. - If <tt>Key\\s{0,1}words\b</tt> is found.
*/ */
void heuristicBibParser::guessKeywords(const QString& text) void heuristicBibParser::guessKeywords(const QString& text)
{ {
QRegExp rxH("<(?:NewLine|Tab)\\d+>\\s*Key\\s{0,1}words and phrases[\\s\\.:]* QRegExp rxH("<(?:NewLine|Tab)\\d+>\\s*Key\\s{0,1}words and phrases[\\s\\.:]*
<NewLine\\d+>(.+)(<NewLine|$)", Qt::CaseInsensitive); <NewLine\\d+>(.+)(<NewLine|$)",
Qt::CaseInsensitive);
rxH.setMinimal(true); rxH.setMinimal(true);
int nH(rxH.indexIn(text)); int nH(rxH.indexIn(text));
if (nH > -1) if (nH > -1)
{ {
const QString val(rxH.cap(1).remove(_leading_non_letters)); const QString val(rxH.cap(1).remove(_leading_non_letters));
_current_reference[_keywords] = _bpP->parse(_keywords, val); _current_reference[_keywords] = _bpP->parse(_keywords, val);
return; return;
} }
rxH = QRegExp("<(?:NewLine|Tab)\\d+>\\s*Key\\s{0,1}words and phrases\\b(.+)( <NewLine|$)", Qt::CaseInsensitive); rxH = QRegExp("<(?:NewLine|Tab)\\d+>\\s*Key\\s{0,1}words and phrases\\b(.+)( <NewLine|$)", Qt::CaseInsensitive);
rxH.setMinimal(true); rxH.setMinimal(true);
nH = rxH.indexIn(text); nH = rxH.indexIn(text);
if (nH > -1) if (nH > -1)
{ {
const QString val(rxH.cap(1).remove(_leading_non_letters)); const QString val(rxH.cap(1).remove(_leading_non_letters));
_current_reference[_keywords] = _bpP->parse(_keywords, val); _current_reference[_keywords] = _bpP->parse(_keywords, val);
} }
rxH = QRegExp("<(?:NewLine|Tab)\\d+>\\s*Key\\s{0,1}words[\\s\\.:]*<NewLine\\ rxH =
d+>(.+)(<NewLine|$)", Qt::CaseInsensitive); QRegExp("<(?:NewLine|Tab)\\d+>\\s*Key\\s{0,1}words[\\s\\.:]*<NewLine\\d+
>(.+)(<NewLine|$)", Qt::CaseInsensitive);
rxH.setMinimal(true); rxH.setMinimal(true);
nH = rxH.indexIn(text); nH = rxH.indexIn(text);
if (nH > -1) if (nH > -1)
{ {
const QString val(rxH.cap(1).remove(_leading_non_letters)); const QString val(rxH.cap(1).remove(_leading_non_letters));
_current_reference[_keywords] = _bpP->parse(_keywords, val); _current_reference[_keywords] = _bpP->parse(_keywords, val);
return; return;
} }
rxH = QRegExp("<(?:NewLine|Tab)\\d+>\\s*Key\\s{0,1}words\\b(.+)(<NewLine|$)" , Qt::CaseInsensitive); rxH = QRegExp("<(?:NewLine|Tab)\\d+>\\s*Key\\s{0,1}words\\b(.+)(<NewLine|$)" , Qt::CaseInsensitive);
rxH.setMinimal(true); rxH.setMinimal(true);
skipping to change at line 589 skipping to change at line 602
} }
rxH = QRegExp("(?!\\()(\\d+)\\s*-{1,2}\\s*(\\d+)\\b(?!\\))"); rxH = QRegExp("(?!\\()(\\d+)\\s*-{1,2}\\s*(\\d+)\\b(?!\\))");
nH = 0; nH = 0;
while (nH >= 0) while (nH >= 0)
{ {
nH = rxH.indexIn(text, nH); nH = rxH.indexIn(text, nH);
if (nH > -1) if (nH > -1)
{ {
if (!rxH.cap(1).startsWith('0')) if (!rxH.cap(1).startsWith('0'))
{ {
const QString pp(_bpP->parse(_pages, QString("%1 %2").arg(rxH.ca p(1)).arg(rxH.cap(2)))); const QString pp(_bpP->parse(_pages, QString("%1 %2").arg(rxH.ca p(1), rxH.cap(2))));
const QStringList flpp(pp.split(QRegExp("\\D"), QString::SkipEmp tyParts)); const QStringList flpp(pp.split(QRegExp("\\D"), QString::SkipEmp tyParts));
if (flpp.count() != 2) if (flpp.count() != 2)
continue; continue;
const int fp(flpp.first().toInt()); const int fp(flpp.first().toInt());
const int lp(flpp.last().toInt()); const int lp(flpp.last().toInt());
if (fp < lp && lp - fp < 250) if (fp < lp && lp - fp < 250)
{ {
_current_reference[_pages] = pp; _current_reference[_pages] = pp;
return; return;
} }
} }
nH += rxH.matchedLength(); nH += rxH.matchedLength();
} }
} }
} }
/** \page heuristics /** \page heuristics
- <b>Year</b> - <b>Year</b>
- If <tt>\\b(19|20)(\\d\\d)\\b</tt> is found. - If <tt>\\b(19|20)(\\d\\d)\\b</tt> is found.
*/ */
skipping to change at line 787 skipping to change at line 800
int nH(rxH.indexIn(text)); int nH(rxH.indexIn(text));
if (nH > -1) if (nH > -1)
{ {
_current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1)); _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
_current_reference[_number] = _bpP->parse(_number, rxH.cap(2)); _current_reference[_number] = _bpP->parse(_number, rxH.cap(2));
_current_reference[_pages] = _bpP->parse(_pages, rxH.cap(3)); _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(3));
_current_reference[_year] = _bpP->parse(_year, rxH.cap(4) + rxH.cap(5)); _current_reference[_year] = _bpP->parse(_year, rxH.cap(4) + rxH.cap(5));
return; return;
} }
// J. Sci., 108 (15), 3206 (2004) // J. Sci., 108 (15), 3206 (2004)
rxH = QRegExp("(\\d+)\\s*\\((" + _hyphen_nums + ")\\)\\s*[,:]\\s*(" + _hyphe rxH =
n_pages + ")[,\\s]*\\((19|20)(\\d\\d)\\)"); QRegExp("(\\d+)\\s*\\((" + _hyphen_nums + ")\\)\\s*[,:]\\s*(" + _hyphen_
pages + ")[,\\s]*\\((19|20)(\\d\\d)\\)");
nH = rxH.indexIn(text); nH = rxH.indexIn(text);
if (nH > -1) if (nH > -1)
{ {
_current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1)); _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
_current_reference[_number] = _bpP->parse(_number, rxH.cap(2)); _current_reference[_number] = _bpP->parse(_number, rxH.cap(2));
_current_reference[_pages] = _bpP->parse(_pages, rxH.cap(3)); _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(3));
_current_reference[_year] = _bpP->parse(_year, rxH.cap(4) + rxH.cap(5)); _current_reference[_year] = _bpP->parse(_year, rxH.cap(4) + rxH.cap(5));
return; return;
} }
if (_reliable_pages && _reliable_volume) if (_reliable_pages && _reliable_volume)
skipping to change at line 846 skipping to change at line 860
} }
} }
void heuristicBibParser::guessYearVolumePages(const QString& text) void heuristicBibParser::guessYearVolumePages(const QString& text)
{ {
// Does several year volume pages formats // Does several year volume pages formats
if (_reliable_pages && _reliable_volume && _reliable_number) if (_reliable_pages && _reliable_volume && _reliable_number)
return; return;
// J. Sci. 1995 January 25; 247(4):536-40. // J. Sci. 1995 January 25; 247(4):536-40.
// J. Sci. 1995, 247(4):536-40. // J. Sci. 1995, 247(4):536-40.
QRegExp rxH(_current_reference.value(_year) + "[\\w ]{0,15}[,:; ]\\s*(\\d+)\ QRegExp rxH(_current_reference.value(_year) + "[\\w ]{0,15}[,:; ]\\s*(\\d+)\
\s*\\((" + _hyphen_nums + ")\\)\\s*[,:;]\\s*(" + _hyphen_pages + ')'); \s*\\((" + _hyphen_nums +
")\\)\\s*[,:;]\\s*(" + _hyphen_pages + ')');
int nH(rxH.indexIn(text)); int nH(rxH.indexIn(text));
if (nH > -1) if (nH > -1)
{ {
_current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1)); _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
_current_reference[_number] = _bpP->parse(_number, rxH.cap(2)); _current_reference[_number] = _bpP->parse(_number, rxH.cap(2));
_current_reference[_pages] = _bpP->parse(_pages, rxH.cap(3)); _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(3));
_reliable_pages = true; _reliable_pages = true;
_reliable_volume = true; _reliable_volume = true;
return; return;
} }
// J. Sci. (1999), 86, 3, pp. 635-648 // J. Sci. (1999), 86, 3, pp. 635-648
rxH = QRegExp("\\(" + _current_reference.value(_year) + "\\)" + "\\s*[,:;]\\ rxH = QRegExp("\\(" + _current_reference.value(_year) + "\\)" + "\\s*[,:;]\\
s*(\\d+)\\s*[,:;]\\s*(" + s*(\\d+)\\s*[,:;]\\s*(" + _hyphen_nums +
_hyphen_nums + ")\\s*[,:;]\\s*(?:pp)?\\.?\\s*(" + _hyphen_page ")\\s*[,:;]\\s*(?:pp)?\\.?\\s*(" + _hyphen_pages + ')');
s + ')');
nH = rxH.indexIn(text); nH = rxH.indexIn(text);
if (nH > -1) if (nH > -1)
{ {
_current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1)); _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
_current_reference[_number] = _bpP->parse(_number, rxH.cap(2)); _current_reference[_number] = _bpP->parse(_number, rxH.cap(2));
_current_reference[_pages] = _bpP->parse(_pages, rxH.cap(3)); _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(3));
_reliable_pages = true; _reliable_pages = true;
_reliable_volume = true; _reliable_volume = true;
return; return;
} }
if (_reliable_volume) if (_reliable_volume)
return; return;
// J. Sci. 1995 January 25; 247:536-40. // J. Sci. 1995 January 25; 247:536-40.
// J. Sci. 2005, 103, 818 // J. Sci. 2005, 103, 818
// J. Sci. 2002;9:101–106.5. // J. Sci. 2002;9:101–106.5.
rxH = QRegExp(_current_reference.value(_year) + "\\s*[\\w ]{0,15}[,:;]\\s*(\ rxH =
\d+)\\s*[,:;]\\s*(" + _hyphen_pages + ')'); QRegExp(_current_reference.value(_year) + "\\s*[\\w ]{0,15}[,:;]\\s*(\\d
+)\\s*[,:;]\\s*(" + _hyphen_pages + ')');
nH = rxH.indexIn(text); nH = rxH.indexIn(text);
if (nH > -1) if (nH > -1)
{ {
_current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1)); _current_reference[_volume] = _bpP->parse(_volume, rxH.cap(1));
_current_reference[_pages] = _bpP->parse(_pages, rxH.cap(2)); _current_reference[_pages] = _bpP->parse(_pages, rxH.cap(2));
_reliable_pages = true; _reliable_pages = true;
_reliable_volume = true; _reliable_volume = true;
return; return;
} }
// J. Sci. 2005 103:818 // J. Sci. 2005 103:818
 End of changes. 36 change blocks. 
110 lines changed or deleted 124 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)