"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "src/c2b/cb2bib_utilities.cpp" between
cb2bib-1.9.9.tar.gz and cb2bib-2.0.0.tar.gz

About: cb2Bib is a multiplatform application for rapidly extracting unformatted or unstandardized bibliographic references from email alerts, journal Web pages, and PDF files.

cb2bib_utilities.cpp  (cb2bib-1.9.9):cb2bib_utilities.cpp  (cb2bib-2.0.0)
/*************************************************************************** /***************************************************************************
* Copyright (C) 2004-2018 by Pere Constans * Copyright (C) 2004-2019 by Pere Constans
* constans@molspaces.com * constans@molspaces.com
* cb2Bib version 1.9.9. Licensed under the GNU GPL version 3. * cb2Bib version 2.0.0. Licensed under the GNU GPL version 3.
* See the LICENSE file that comes with this distribution. * See the LICENSE file that comes with this distribution.
***************************************************************************/ ***************************************************************************/
#include "cb2bib_utilities.h" #include "cb2bib_utilities.h"
#include <QRegularExpressionMatchIterator> #include <QRegularExpressionMatchIterator>
#ifdef C2B_USE_LZO #ifdef C2B_USE_LZO
#include <lzo/lzoconf.h>
#include <lzo/lzo1x.h> #include <lzo/lzo1x.h>
#include <lzo/lzoconf.h>
#endif #endif
namespace c2bUtils namespace c2bUtils
{ {
QString setCapitalization(const QString& str) QString setCapitalization(const QString& str)
{ {
QString cap_string(str); QString cap_string(str);
if (isUpperCaseString(str)) if (isUpperCaseString(str))
cap_string = cap_string.toLower(); cap_string = cap_string.toLower();
bool do_upper(true); bool do_upper(true);
for (int i = 0; i < cap_string.length(); i++) for (int i = 0; i < cap_string.length(); i++)
if (cap_string.at(i).isLetter()) if (cap_string.at(i).isLetter())
{ {
if (do_upper) if (do_upper)
cap_string[i] = cap_string.at(i).toUpper(); cap_string[i] = cap_string.at(i).toUpper();
do_upper = false; do_upper = false;
} }
else if (cap_string.at(i) == '.' || cap_string.at(i) == ':') else if (cap_string.at(i) == '.' || cap_string.at(i) == ':')
do_upper = true; do_upper = true;
return (cap_string); return cap_string;
} }
QString& simplifyString(QString& str) QString& simplifyString(QString& str)
{ {
if (str.length() == 0) if (str.length() == 0)
return str; return str;
const ushort space(32); const ushort space(32);
ushort* const c0 = (ushort*)str.data(); ushort* const c0 = (ushort*)str.data();
ushort* const cn = c0 + str.length(); ushort* const cn = c0 + str.length();
ushort* c = c0; ushort* c = c0;
skipping to change at line 134 skipping to change at line 134
while (c < cpl) while (c < cpl)
*c++ = uch; *c++ = uch;
} }
return str; return str;
} }
static const unsigned short _cyrillic_to_ascii[] = static const unsigned short _cyrillic_to_ascii[] =
{ {
// Code points 1024 to 1309 // Code points 1024 to 1309
// See http://en.wikipedia.org/wiki/ISO_9 // See http://en.wikipedia.org/wiki/ISO_9
69, 69, 68, 71, 69, 90, 73, 73, 74, 76, 78, 67, 75, 73, 85, 68, 65, 69, 69, 68, 71, 69, 90, 73, 73, 74, 76, 78, 67, 75,
66, 86, 71, 68, 69, 90, 90, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 73, 85, 68, 65, 66,
84, 85, 70, 72, 67, 67, 83, 83, 698, 89, 697, 69, 85, 65, 97, 98, 118, 86, 71, 68, 69, 90, 90, 73, 74, 75, 76, 77, 78, 79,
103, 100, 101, 122, 122, 105, 106, 107, 108, 109, 110, 111, 112, 114, 115, 1 80, 82, 83, 84, 85,
16, 117, 70, 72, 67, 67, 83, 83, 698, 89, 697, 69, 85, 65, 97,
102, 104, 99, 99, 115, 115, 698, 121, 697, 101, 117, 97, 101, 101, 100, 103, 98, 118, 103, 100, 101,
101, 122, 122, 105, 106, 107, 108, 109, 110, 111, 112, 114, 115, 116,
122, 105, 105, 106, 108, 110, 99, 107, 105, 117, 100, 1120, 1121, 69, 101, 1 117, 102, 104, 99, 99,
124, 1125, 115, 115, 698, 121, 697, 101, 117, 97, 101, 101, 100, 103, 101,
1126, 1127, 1128, 1129, 65, 97, 1132, 1133, 1134, 1135, 1136, 1137, 70, 102, 122, 105, 105, 106, 108,
89, 121, 89, 110, 99, 107, 105, 117, 100, 1120, 1121, 69, 101, 1124, 1125, 1126
121, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, , 1127, 1128, 1129, 65, 97,
1156, 1157, 1158, 1159, 1132, 1133, 1134, 1135, 1136, 1137, 70, 102, 89, 121, 89, 121, 1144
1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 71, 103, 71, 103, 71, 103, 9 , 1145, 1146, 1147, 1148, 1149,
0, 122, 1176, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162
1177, 75, 107, 75, 107, 75, 107, 75, 107, 78, 110, 78, 110, 80, 112, 79, 111 , 1163, 1164, 1165, 1166, 1167,
, 71, 103, 71, 103, 71, 103, 90, 122, 1176, 1177, 75, 107, 75,
83, 115, 84, 116, 85, 117, 85, 117, 72, 104, 67, 99, 67, 99, 67, 99, 72, 107, 75, 107, 75, 107,
104, 67, 99, 67, 99, 1216, 90, 122, 75, 107, 76, 108, 78, 110, 78, 110, 67, 78, 110, 78, 110, 80, 112, 79, 111, 83, 115, 84, 116, 85,
99, 1229, 1230, 1231, 65, 97, 65, 97, 1236, 1237, 69, 101, 65, 97, 65, 97, 9 117, 85, 117, 72, 104,
0, 67, 99, 67, 99, 67, 99, 72, 104, 67, 99, 67, 99, 1216
122, 90, 122, 90, 122, 73, 105, 73, 105, 79, 111, 79, 111, 79, 111, 69, 101, , 90, 122, 75, 107, 76,
85, 117, 85, 117, 85, 117, 67, 99, 1270, 1271, 89, 121, 1274, 1275, 1276, 12 108, 78, 110, 78, 110, 67, 99, 1229, 1230, 1231, 65, 97, 65,
77, 1278, 97, 1236, 1237, 69, 101,
1279, 1280, 1281, 1282, 1283, 1284, 1285, 1286, 1287, 1288, 1289, 78, 110, 1 65, 97, 65, 97, 90, 122, 90, 122, 90, 122, 73, 105, 73,
292, 1293, 84, 116, 105, 79, 111, 79, 111,
1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303, 1304, 1305, 81, 113, 87, 119 79, 111, 69, 101, 85, 117, 85, 117, 85, 117, 67, 99, 1270
, 1271, 89, 121, 1274, 1275,
1276, 1277, 1278, 1279, 1280, 1281, 1282, 1283, 1284, 1285, 1286, 1287, 1288
, 1289, 78, 110, 1292, 1293,
84, 116, 1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303, 1304, 1305, 81,
113, 87, 119
}; };
static inline QString& _to_ascii_transliterate(QString& str) static inline QString& _to_ascii_transliterate(QString& str)
{ {
// Strip diacritics, undo ligatures, transliterate // Strip diacritics, undo ligatures, transliterate
if (str.length() == 0) if (str.length() == 0)
return str; return str;
ushort* const c0 = (ushort*)str.data(); ushort* const c0 = (ushort*)str.data();
ushort* const cn = c0 + str.length(); ushort* const cn = c0 + str.length();
ushort* c = c0 - 1; ushort* c = c0 - 1;
skipping to change at line 173 skipping to change at line 172
bool do_ligatures_230(false); bool do_ligatures_230(false);
bool do_ligatures_338(false); bool do_ligatures_338(false);
bool do_ligatures_339(false); bool do_ligatures_339(false);
while (c < cn) while (c < cn)
{ {
++c; ++c;
if (*c < 128) if (*c < 128)
continue; continue;
if (*c > 1023 && *c < 1310) if (*c > 1023 && *c < 1310)
{ {
*c = _cyrillic_to_ascii[*c-1024]; *c = _cyrillic_to_ascii[*c - 1024];
if (*c == 1236) if (*c == 1236)
{ {
*c = 198; *c = 198;
do_ligatures_198 = true; do_ligatures_198 = true;
} }
if (*c == 1237) if (*c == 1237)
{ {
*c = 230; *c = 230;
do_ligatures_230 = true; do_ligatures_230 = true;
} }
skipping to change at line 468 skipping to change at line 467
str.replace(QRegExp("\\{\\\\(.{1,2})\\{(\\w{0,2})\\}\\}"), "{\\\\1\\2}"); str.replace(QRegExp("\\{\\\\(.{1,2})\\{(\\w{0,2})\\}\\}"), "{\\\\1\\2}");
if (hasLatexDiacritic.indexIn(str) >= 0) if (hasLatexDiacritic.indexIn(str) >= 0)
{ {
str.replace("{\\`A}", QChar(192)); str.replace("{\\`A}", QChar(192));
str.replace("{\\'A}", QChar(193)); str.replace("{\\'A}", QChar(193));
str.replace("{\\^A}", QChar(194)); str.replace("{\\^A}", QChar(194));
str.replace("{\\~A}", QChar(195)); str.replace("{\\~A}", QChar(195));
str.replace("{\\\"A}", QChar(196)); str.replace("{\\\"A}", QChar(196));
str.replace("{\\AA}", QChar(197)); str.replace("{\\AA}", QChar(197));
str.replace("{\\AE}", QChar(198)); str.replace("{\\AE}", QChar(198));
str.replace("{\\cC}", QChar(199)); // {\\c{C}} str.replace("{\\cC}", QChar(199)); // {\\c{C}}
str.replace("{\\c C}", QChar(199)); str.replace("{\\c C}", QChar(199));
str.replace("{\\`E}", QChar(200)); str.replace("{\\`E}", QChar(200));
str.replace("{\\'E}", QChar(201)); str.replace("{\\'E}", QChar(201));
str.replace("{\\^E}", QChar(202)); str.replace("{\\^E}", QChar(202));
str.replace("{\\\"E}", QChar(203)); str.replace("{\\\"E}", QChar(203));
str.replace("{\\`I}", QChar(204)); str.replace("{\\`I}", QChar(204));
str.replace("{\\'I}", QChar(205)); str.replace("{\\'I}", QChar(205));
str.replace("{\\^I}", QChar(206)); str.replace("{\\^I}", QChar(206));
str.replace("{\\\"I}", QChar(207)); str.replace("{\\\"I}", QChar(207));
str.replace("{\\~N}", QChar(209)); str.replace("{\\~N}", QChar(209));
skipping to change at line 498 skipping to change at line 497
str.replace("{\\\"U}", QChar(220)); str.replace("{\\\"U}", QChar(220));
str.replace("{\\'Y}", QChar(221)); str.replace("{\\'Y}", QChar(221));
str.replace("{\\ss}", QChar(223)); str.replace("{\\ss}", QChar(223));
str.replace("{\\`a}", QChar(224)); str.replace("{\\`a}", QChar(224));
str.replace("{\\'a}", QChar(225)); str.replace("{\\'a}", QChar(225));
str.replace("{\\^a}", QChar(226)); str.replace("{\\^a}", QChar(226));
str.replace("{\\~a}", QChar(227)); str.replace("{\\~a}", QChar(227));
str.replace("{\\\"a}", QChar(228)); str.replace("{\\\"a}", QChar(228));
str.replace("{\\aa}", QChar(229)); str.replace("{\\aa}", QChar(229));
str.replace("{\\ae}", QChar(230)); str.replace("{\\ae}", QChar(230));
str.replace("{\\cc}", QChar(231)); // {\\c{c}} str.replace("{\\cc}", QChar(231)); // {\\c{c}}
str.replace("{\\c c}", QChar(231)); str.replace("{\\c c}", QChar(231));
str.replace("{\\`e}", QChar(232)); str.replace("{\\`e}", QChar(232));
str.replace("{\\'e}", QChar(233)); str.replace("{\\'e}", QChar(233));
str.replace("{\\^e}", QChar(234)); str.replace("{\\^e}", QChar(234));
str.replace("{\\\"e}", QChar(235)); str.replace("{\\\"e}", QChar(235));
str.replace("{\\`i}", QChar(236)); str.replace("{\\`i}", QChar(236));
str.replace("{\\'i}", QChar(237)); str.replace("{\\'i}", QChar(237));
str.replace("{\\^i}", QChar(238)); str.replace("{\\^i}", QChar(238));
str.replace("{\\\"i}", QChar(239)); str.replace("{\\\"i}", QChar(239));
str.replace("{\\`\\i}", QChar(236)); str.replace("{\\`\\i}", QChar(236));
skipping to change at line 526 skipping to change at line 525
str.replace("{\\~o}", QChar(245)); str.replace("{\\~o}", QChar(245));
str.replace("{\\\"o}", QChar(246)); str.replace("{\\\"o}", QChar(246));
str.replace("{\\o}", QChar(248)); str.replace("{\\o}", QChar(248));
str.replace("{\\`u}", QChar(249)); str.replace("{\\`u}", QChar(249));
str.replace("{\\'u}", QChar(250)); str.replace("{\\'u}", QChar(250));
str.replace("{\\^u}", QChar(251)); str.replace("{\\^u}", QChar(251));
str.replace("{\\\"u}", QChar(252)); str.replace("{\\\"u}", QChar(252));
str.replace("{\\'y}", QChar(253)); str.replace("{\\'y}", QChar(253));
str.replace("{\\\"y}", QChar(255)); str.replace("{\\\"y}", QChar(255));
str.replace("{\\'c}", QChar(263)); str.replace("{\\'c}", QChar(263));
str.replace("{\\vC}", QChar(268)); // {\\v{C}} str.replace("{\\vC}", QChar(268)); // {\\v{C}}
str.replace("{\\v C}", QChar(268)); str.replace("{\\v C}", QChar(268));
str.replace("{\\vc}", QChar(269)); // {\\v{c} str.replace("{\\vc}", QChar(269)); // {\\v{c}
str.replace("{\\v c}", QChar(269)); str.replace("{\\v c}", QChar(269));
str.replace("{\\DJ}", QChar(272)); str.replace("{\\DJ}", QChar(272));
str.replace("{\\dj}", QChar(273)); str.replace("{\\dj}", QChar(273));
str.replace("{\\L}", QChar(321)); str.replace("{\\L}", QChar(321));
str.replace("{\\l}", QChar(322)); str.replace("{\\l}", QChar(322));
str.replace("{\\'N}", QChar(323)); str.replace("{\\'N}", QChar(323));
str.replace("{\\'n}", QChar(324)); str.replace("{\\'n}", QChar(324));
str.replace("{\\OE}", QChar(338)); str.replace("{\\OE}", QChar(338));
str.replace("{\\oe}", QChar(339)); str.replace("{\\oe}", QChar(339));
str.replace("{\\vS}", QChar(352)); // {\\v{S}} str.replace("{\\vS}", QChar(352)); // {\\v{S}}
str.replace("{\\v S}", QChar(352)); str.replace("{\\v S}", QChar(352));
str.replace("{\\vs}", QChar(353)); // {\\v{s}} str.replace("{\\vs}", QChar(353)); // {\\v{s}}
str.replace("{\\v s}", QChar(353)); str.replace("{\\v s}", QChar(353));
str.replace("{\\\"Y}", QChar(376)); str.replace("{\\\"Y}", QChar(376));
str.replace("{\\vZ}", QChar(381)); // {\\v{Z}} str.replace("{\\vZ}", QChar(381)); // {\\v{Z}}
str.replace("{\\v Z}", QChar(381)); str.replace("{\\v Z}", QChar(381));
str.replace("{\\vz}", QChar(382)); // {\\v{Z}} str.replace("{\\vz}", QChar(382)); // {\\v{Z}}
str.replace("{\\v z}", QChar(382)); str.replace("{\\v z}", QChar(382));
} }
if (hasLatexSymbol.indexIn(str) >= 0) if (hasLatexSymbol.indexIn(str) >= 0)
{ {
str.replace("$\\cdot$", QChar(183)); str.replace("$\\cdot$", QChar(183));
str.replace("$\\Alpha$", QChar(913)); str.replace("$\\Alpha$", QChar(913));
str.replace("$\\Beta$", QChar(914)); str.replace("$\\Beta$", QChar(914));
str.replace("$\\Gamma$", QChar(915)); str.replace("$\\Gamma$", QChar(915));
str.replace("$\\Delta$", QChar(916)); str.replace("$\\Delta$", QChar(916));
str.replace("$\\Epsilon$", QChar(917)); str.replace("$\\Epsilon$", QChar(917));
skipping to change at line 593 skipping to change at line 592
str.replace("$\\iota$", QChar(953)); str.replace("$\\iota$", QChar(953));
str.replace("$\\kappa$", QChar(954)); str.replace("$\\kappa$", QChar(954));
str.replace("$\\lambda$", QChar(955)); str.replace("$\\lambda$", QChar(955));
str.replace("$\\mu$", QChar(956)); str.replace("$\\mu$", QChar(956));
str.replace("$\\nu$", QChar(957)); str.replace("$\\nu$", QChar(957));
str.replace("$\\xi$", QChar(958)); str.replace("$\\xi$", QChar(958));
str.replace("$\\omicron$", QChar(959)); str.replace("$\\omicron$", QChar(959));
str.replace("$\\pi$", QChar(960)); str.replace("$\\pi$", QChar(960));
str.replace("$\\rho$", QChar(961)); str.replace("$\\rho$", QChar(961));
str.replace("$\\sigmaf$", QChar(962)); str.replace("$\\sigmaf$", QChar(962));
str.replace("$\\varsigma$", QChar(962)); // Equal to \sigmaf str.replace("$\\varsigma$", QChar(962)); // Equal to \sigmaf
str.replace("$\\sigma$", QChar(963)); str.replace("$\\sigma$", QChar(963));
str.replace("$\\tau$", QChar(964)); str.replace("$\\tau$", QChar(964));
str.replace("$\\upsilon$", QChar(965)); str.replace("$\\upsilon$", QChar(965));
str.replace("$\\phi$", QChar(966)); str.replace("$\\phi$", QChar(966));
str.replace("$\\chi$", QChar(967)); str.replace("$\\chi$", QChar(967));
str.replace("$\\psi$", QChar(968)); str.replace("$\\psi$", QChar(968));
str.replace("$\\omega$", QChar(969)); str.replace("$\\omega$", QChar(969));
str.replace("$\\vartheta$", QChar(977)); str.replace("$\\vartheta$", QChar(977));
str.replace("$\\varphi$", QChar(981)); str.replace("$\\varphi$", QChar(981));
str.replace("$\\varpi$", QChar(982)); str.replace("$\\varpi$", QChar(982));
skipping to change at line 616 skipping to change at line 615
str.replace("$\\varrho$", QChar(1009)); str.replace("$\\varrho$", QChar(1009));
str.replace("$\\epsilon$", QChar(1013)); str.replace("$\\epsilon$", QChar(1013));
str.replace("$\\partial$", QChar(8706)); str.replace("$\\partial$", QChar(8706));
str.replace("$\\infty$", QChar(8734)); str.replace("$\\infty$", QChar(8734));
} }
return str; return str;
} }
const QString fromUtf8(const QByteArray& ba) const QString fromUtf8(const QByteArray& ba)
{ {
// Based on Qt's QString::fromUtf8 function. Input ba must be an UTF-8 // Based on Qt's QString::fromUtf8 function. Input ba must be an UTF-8
// encoded array produced by QString::toUtf8. Encoding correctness is // encoded array produced by QString::toUtf8. Encoding correctness is
// assumed and checking omitted. It performs a 20% faster compared to // assumed and checking omitted. It performs a 20% faster compared to
// Qt 4.5, and expected more compared to Qt 4.6. // Qt 4.5, and expected more compared to Qt 4.6.
const char* b = ba.constData(); const char* b = ba.constData();
const char* const bn = b + ba.length(); const char* const bn = b + ba.length();
QString output; QString output;
output.resize(ba.length()); output.resize(ba.length());
ushort* o = (ushort*)output.unicode(); ushort* o = (ushort*)output.unicode();
uint c; uint c;
while (b < bn) while (b < bn)
{ {
c = uint(*b); c = uint(*b);
skipping to change at line 661 skipping to change at line 660
c = QChar::lowSurrogate(c); c = QChar::lowSurrogate(c);
b += 3; b += 3;
} }
else else
c = QChar::ReplacementCharacter; c = QChar::ReplacementCharacter;
} }
*o++ = ushort(c); *o++ = ushort(c);
++b; ++b;
} }
output.truncate(int(o - (ushort*)output.unicode())); output.truncate(int(o - (ushort*)output.unicode()));
// QString check = QString::fromUtf8(ba); // QString check = QString::fromUtf8(ba);
// qDebug() << (check == output); // qDebug() << (check == output);
return output; return output;
} }
#ifdef C2B_USE_LZO #ifdef C2B_USE_LZO
const QByteArray lzo::compress(const QByteArray& data) const QByteArray lzo::compress(const QByteArray& data)
{ {
const lzo_uint src_s(data.size()); const lzo_uint src_s(data.size());
if (src_s == 0) if (src_s == 0)
return QByteArray(); return QByteArray();
if (lzo_init() != LZO_E_OK) if (lzo_init() != LZO_E_OK)
 End of changes. 18 change blocks. 
47 lines changed or deleted 52 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)