"Fossies" - the Fresh Open Source Software Archive

Member "cb2bib-2.0.1/src/c2b/cb2bib_utilities.cpp" (12 Feb 2021, 51022 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "cb2bib_utilities.cpp", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.0.0_vs_2.0.1.

    1 /***************************************************************************
    2  *   Copyright (C) 2004-2021 by Pere Constans
    3  *   constans@molspaces.com
    4  *   cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
    5  *   See the LICENSE file that comes with this distribution.
    6  ***************************************************************************/
    7 #include "cb2bib_utilities.h"
    8 
    9 #include "txtmatcher.h"
   10 
   11 #include <QRegularExpressionMatchIterator>
   12 
   13 
   14 #if defined(C2B_USE_LZSSE)
   15 #include "./lzsse4/lzsse4.h"
   16 #elif defined(C2B_USE_LZ4)
   17 #include <lz4.h>
   18 #include <lz4hc.h>
   19 #elif defined(C2B_USE_LZO)
   20 #include <lzo/lzo1x.h>
   21 #include <lzo/lzoconf.h>
   22 #endif
   23 
   24 
   25 namespace c2bUtils
   26 {
   27 
// Matches any single character outside A-Z and a-z; toAscii() removes such
// characters when called with the Cleanup conversion type.
const QRegExp nonAsciiLetter("[^A-Za-z]");
// Matches any single non-word character (regular expression class \W).
const QRegExp nonLetter("\\W");

// URL template for arXiv abstracts; %1 is a placeholder for the identifier
// (presumably filled in through QString::arg by callers -- confirm).
const QString arxivUrl("https://arxiv.org/abs/%1");
// Link set according to: "Creating a Web Link to the Entrez Databases",
// http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helplinks.chapter.linkshelp#linkshelp.Retrieve_PubMed_Cita
// URL template for PubMed records; %1 is a placeholder for the identifier.
const QString pubmedUrl("https://pubmed.ncbi.nlm.nih.gov/%1");
// Template that wraps raw metadata text (%1) between the "[Raw Metadata" and
// "/Raw Metadata]" marker lines.
const QString metadatasection("\n[Raw Metadata\n%1\n/Raw Metadata]\n");
   36 
   37 QString setCapitalization(const QString& str)
   38 {
   39     QString cap_string(str);
   40     if (isUpperCaseString(str))
   41         cap_string = cap_string.toLower();
   42     bool do_upper(true);
   43     for (int i = 0; i < cap_string.length(); ++i)
   44         if (cap_string.at(i).isLetter())
   45         {
   46             if (do_upper)
   47             {
   48                 // Check for special cases: pH, mRNA, l-Alanine, Ea.hy926, p53, α/β-barrel
   49                 bool can_do_upper(true);
   50                 for (int j = i + 1; j < cap_string.length(); ++j)
   51                     if (cap_string.at(j).isDigit() || (cap_string.at(j).isLetter() && cap_string.at(j).isUpper()) ||
   52                         (j - i <= 3 && cap_string.at(j) == QLatin1Char('-') && cap_string.at(i) >= QChar(945)))
   53                     {
   54                         can_do_upper = false;
   55                         break;
   56                     }
   57                     else if (cap_string.at(j) == QLatin1Char(' '))
   58                         break;
   59                 if (can_do_upper)
   60                     cap_string[i] = cap_string.at(i).toUpper();
   61             }
   62             do_upper = false;
   63         }
   64         else if (cap_string.at(i) == QLatin1Char('.') || cap_string.at(i) == QLatin1Char(':') ||
   65                  cap_string.at(i) == QLatin1Char('?'))
   66             do_upper = true;
   67         else if (cap_string.at(i) != QLatin1Char(' '))
   68             do_upper = false;
   69     return cap_string;
   70 }
   71 
   72 QString& simplifyString(QString& str)
   73 {
   74     if (str.length() == 0)
   75         return str;
   76     const ushort space(32);
   77     ushort* const c0 = (ushort*)str.data();
   78     ushort* const cn = c0 + str.length();
   79     ushort* c = c0;
   80     ushort* o = c0;
   81     while (c < cn)
   82     {
   83         const ushort ch = *c;
   84         if ((ch > 32 && ch < 160) || !(ch == space || QChar(ch).isSpace()))
   85             *o++ = ch;
   86         else if (o > c0 && *(o - 1) != space)
   87             *o++ = space;
   88         ++c;
   89     }
   90     if (o > c0 && *(o - 1) == space)
   91         --o;
   92     str.truncate(int(o - c0));
   93     return str;
   94 }
   95 
   96 QString& fillString(QString& str, const QStringMatcher& pattern, const QChar& ch)
   97 {
   98     if (str.length() == 0)
   99         return str;
  100     const int pl(pattern.pattern().length());
  101     const ushort uch(ch.unicode());
  102     ushort* const c0((ushort*)str.data());
  103     int p(0);
  104     while (p >= 0)
  105     {
  106         p = pattern.indexIn(str, p);
  107         if (p > -1)
  108         {
  109             ushort* c(c0 + p);
  110             const ushort* const cpl(c + pl);
  111             while (c < cpl)
  112                 *c++ = uch;
  113             p += pl;
  114         }
  115     }
  116     return str;
  117 }
  118 
  119 QString& fillString(QString& str, const QString& pattern, const QChar& ch)
  120 {
  121     return fillString(str, QStringMatcher(pattern, Qt::CaseSensitive), ch);
  122 }
  123 
  124 QString& fillString(QString& str, const txtmatcher& pattern, const QChar& ch)
  125 {
  126     if (str.length() == 0)
  127         return str;
  128     const int pl(pattern.pattern().length());
  129     const ushort uch(ch.unicode());
  130     ushort* const c0((ushort*)str.data());
  131     int p(0);
  132     while (p >= 0)
  133     {
  134         p = pattern.indexIn(str, p);
  135         if (p > -1)
  136         {
  137             ushort* c(c0 + p);
  138             const ushort* const cpl(c + pl);
  139             while (c < cpl)
  140                 *c++ = uch;
  141             p += pl;
  142         }
  143     }
  144     return str;
  145 }
  146 
  147 QString& fillString(QString& str, const QRegExp& pattern, const QChar& ch)
  148 {
  149     if (str.length() == 0)
  150         return str;
  151     const ushort uch(ch.unicode());
  152     ushort* const c0((ushort*)str.data());
  153     int p(0);
  154     while (p >= 0)
  155     {
  156         p = pattern.indexIn(str, p);
  157         if (p > -1)
  158         {
  159             const int pl(pattern.matchedLength());
  160             ushort* c(c0 + p);
  161             const ushort* const cpl(c + pl);
  162             while (c < cpl)
  163                 *c++ = uch;
  164             p += pl;
  165         }
  166     }
  167     return str;
  168 }
  169 
  170 QString& fillString(QString& str, const QRegularExpression& pattern, const QChar& ch)
  171 {
  172     if (str.length() == 0)
  173         return str;
  174     const ushort uch(ch.unicode());
  175     ushort* const c0((ushort*)str.data());
  176     QRegularExpressionMatchIterator it(pattern.globalMatch(str));
  177     while (it.hasNext())
  178     {
  179         const QRegularExpressionMatch match(it.next());
  180         ushort* c(c0 + match.capturedStart());
  181         const ushort* const cpl(c + match.capturedLength());
  182         while (c < cpl)
  183             *c++ = uch;
  184     }
  185     return str;
  186 }
  187 
// Transliteration table for the Cyrillic block, indexed by (code point - 1024).
// Each entry is the replacement code point: mostly an ASCII letter, with 697
// and 698 used for a few letters. An entry equal to its own code point means
// the character is left unchanged; 1236 and 1237 are kept here and turned
// into the AE/ae ligatures by _to_ascii_transliterate().
static const unsigned short _cyrillic_to_ascii[] =
{
    // Code points 1024 to 1309
    // See http://en.wikipedia.org/wiki/ISO_9
    69,   69,   68,   71,   69,   90,   73,   73,   74,   76,   78,   67,   75,   73,   85,   68,   65,   66,
    86,   71,   68,   69,   90,   90,   73,   74,   75,   76,   77,   78,   79,   80,   82,   83,   84,   85,
    70,   72,   67,   67,   83,   83,   698,  89,   697,  69,   85,   65,   97,   98,   118,  103,  100,  101,
    122,  122,  105,  106,  107,  108,  109,  110,  111,  112,  114,  115,  116,  117,  102,  104,  99,   99,
    115,  115,  698,  121,  697,  101,  117,  97,   101,  101,  100,  103,  101,  122,  105,  105,  106,  108,
    110,  99,   107,  105,  117,  100,  1120, 1121, 69,   101,  1124, 1125, 1126, 1127, 1128, 1129, 65,   97,
    1132, 1133, 1134, 1135, 1136, 1137, 70,   102,  89,   121,  89,   121,  1144, 1145, 1146, 1147, 1148, 1149,
    1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167,
    71,   103,  71,   103,  71,   103,  90,   122,  1176, 1177, 75,   107,  75,   107,  75,   107,  75,   107,
    78,   110,  78,   110,  80,   112,  79,   111,  83,   115,  84,   116,  85,   117,  85,   117,  72,   104,
    67,   99,   67,   99,   67,   99,   72,   104,  67,   99,   67,   99,   1216, 90,   122,  75,   107,  76,
    108,  78,   110,  78,   110,  67,   99,   1229, 1230, 1231, 65,   97,   65,   97,   1236, 1237, 69,   101,
    65,   97,   65,   97,   90,   122,  90,   122,  90,   122,  73,   105,  73,   105,  79,   111,  79,   111,
    79,   111,  69,   101,  85,   117,  85,   117,  85,   117,  67,   99,   1270, 1271, 89,   121,  1274, 1275,
    1276, 1277, 1278, 1279, 1280, 1281, 1282, 1283, 1284, 1285, 1286, 1287, 1288, 1289, 78,   110,  1292, 1293,
    84,   116,  1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303, 1304, 1305, 81,   113,  87,   119
};
  209 
  210 static inline QString& _to_ascii_transliterate(QString& str)
  211 {
  212     // Strip diacritics, undo ligatures, transliterate
  213     if (str.length() == 0)
  214         return str;
  215     ushort* const c0 = (ushort*)str.data();
  216     ushort* const cn = c0 + str.length();
  217     ushort* c = c0 - 1;
  218     bool do_ligatures_198(false);
  219     bool do_ligatures_223(false);
  220     bool do_ligatures_230(false);
  221     bool do_ligatures_338(false);
  222     bool do_ligatures_339(false);
  223     while (c < cn)
  224     {
  225         ++c;
  226         if (*c < 128)
  227             continue;
  228         if (*c > 1023 && *c < 1310)
  229         {
  230             *c = _cyrillic_to_ascii[*c - 1024];
  231             if (*c == 1236)
  232             {
  233                 *c = 198;
  234                 do_ligatures_198 = true;
  235             }
  236             if (*c == 1237)
  237             {
  238                 *c = 230;
  239                 do_ligatures_230 = true;
  240             }
  241             continue;
  242         }
  243         QChar qc(*c);
  244         if (!qc.isLetter())
  245             continue;
  246         switch (*c)
  247         {
  248         case 216:
  249             *c = QChar('O').unicode();
  250             break;
  251         case 248:
  252             *c = QChar('o').unicode();
  253             break;
  254         case 272:
  255             *c = QChar('D').unicode();
  256             break;
  257         case 273:
  258             *c = QChar('d').unicode();
  259             break;
  260         case 321:
  261             *c = QChar('L').unicode();
  262             break;
  263         case 322:
  264             *c = QChar('l').unicode();
  265             break;
  266         case 198:
  267             do_ligatures_198 = true;
  268             break;
  269         case 223:
  270             do_ligatures_223 = true;
  271             break;
  272         case 230:
  273             do_ligatures_230 = true;
  274             break;
  275         case 338:
  276             do_ligatures_338 = true;
  277             break;
  278         case 339:
  279             do_ligatures_339 = true;
  280             break;
  281         }
  282         if (qc.decompositionTag() == QChar::NoDecomposition)
  283             continue;
  284         qc = qc.decomposition().at(0);
  285         *c = qc.unicode();
  286         if (qc.decompositionTag() == QChar::NoDecomposition)
  287             continue;
  288         qc = qc.decomposition().at(0);
  289         *c = qc.unicode();
  290     }
  291     if (do_ligatures_198)
  292         str.replace(QChar(198), "AE", Qt::CaseSensitive);
  293     if (do_ligatures_223)
  294         str.replace(QChar(223), "ss", Qt::CaseSensitive);
  295     if (do_ligatures_230)
  296         str.replace(QChar(230), "ae", Qt::CaseSensitive);
  297     if (do_ligatures_338)
  298         str.replace(QChar(338), "OE", Qt::CaseSensitive);
  299     if (do_ligatures_339)
  300         str.replace(QChar(339), "oe", Qt::CaseSensitive);
  301     return str;
  302 }
  303 
  304 static inline QString& _to_ascii_keep_words(QString& str)
  305 {
  306     // Do:
  307     // const QRegExp nonAsciiWords("[^A-Za-z0-9\\+\\- ]");
  308     // str.replace(nonAsciiWords, " ");
  309     // str = str.simplified();
  310     if (str.length() == 0)
  311         return str;
  312     const ushort dash(QChar('-').unicode());
  313     const ushort la(QChar('a').unicode());
  314     const ushort lz(QChar('z').unicode());
  315     const ushort n0(QChar('0').unicode());
  316     const ushort n9(QChar('9').unicode());
  317     const ushort plus(QChar('+').unicode());
  318     const ushort space(QChar(' ').unicode());
  319     const ushort ua(QChar('A').unicode());
  320     const ushort uz(QChar('Z').unicode());
  321 
  322     ushort* const c0 = (ushort*)str.data();
  323     ushort* const cn = c0 + str.length();
  324     ushort* c = c0;
  325     ushort* o = c0;
  326 
  327     while (c < cn)
  328     {
  329         const ushort ch = *c;
  330         if ((ch >= la && ch <= lz) || (ch >= ua && ch <= uz) || (ch >= n0 && ch <= n9) || ch == dash || ch == plus)
  331             *o++ = ch;
  332         else if (o > c0 && *(o - 1) != space)
  333             *o++ = space;
  334         ++c;
  335     }
  336     if (o > c0 && *(o - 1) == space)
  337         --o;
  338     str.truncate(int(o - c0));
  339     return str;
  340 }
  341 
  342 QString toAscii(const QString& str, const AsciiConversion type)
  343 {
  344     QString ascii(str);
  345     if (type == FromBibTeX)
  346         cleanEquations(ascii);
  347     _to_ascii_transliterate(ascii);
  348     if (type == Collation)
  349     {
  350         for (int i = 0; i < ascii.length(); ++i)
  351             if (ascii.at(i).category() == QChar::Punctuation_Dash)
  352                 ascii[i] = QLatin1Char(' ');
  353         return ascii.toCaseFolded();
  354     }
  355     if (type == KeepWords || type == FromBibTeX)
  356         _to_ascii_keep_words(ascii);
  357     else if (type == Cleanup)
  358         ascii.remove(nonAsciiLetter);
  359     return ascii;
  360 }
  361 
  362 QString& stripDiacritics(QString& str)
  363 {
  364     _to_ascii_transliterate(str);
  365     return str;
  366 }
  367 
  368 QString& c2bToBib(QString& str)
  369 {
  370     // Escape common Extended Latin characters
  371     str.replace(QLatin1String(" &"), QLatin1String(" \\&"));
  372     str.replace(QChar(192), QLatin1String("{\\`A}"));
  373     str.replace(QChar(193), QLatin1String("{\\'A}"));
  374     str.replace(QChar(194), QLatin1String("{\\^A}"));
  375     str.replace(QChar(195), QLatin1String("{\\~A}"));
  376     str.replace(QChar(196), QLatin1String("{\\\"A}"));
  377     str.replace(QChar(197), QLatin1String("{\\AA}"));
  378     str.replace(QChar(198), QLatin1String("{\\AE}"));
  379     str.replace(QChar(199), QLatin1String("{\\c C}"));
  380     str.replace(QChar(200), QLatin1String("{\\`E}"));
  381     str.replace(QChar(201), QLatin1String("{\\'E}"));
  382     str.replace(QChar(202), QLatin1String("{\\^E}"));
  383     str.replace(QChar(203), QLatin1String("{\\\"E}"));
  384     str.replace(QChar(204), QLatin1String("{\\`I}"));
  385     str.replace(QChar(205), QLatin1String("{\\'I}"));
  386     str.replace(QChar(206), QLatin1String("{\\^I}"));
  387     str.replace(QChar(207), QLatin1String("{\\\"I}"));
  388     str.replace(QChar(209), QLatin1String("{\\~N}"));
  389     str.replace(QChar(210), QLatin1String("{\\`O}"));
  390     str.replace(QChar(211), QLatin1String("{\\'O}"));
  391     str.replace(QChar(212), QLatin1String("{\\^O}"));
  392     str.replace(QChar(213), QLatin1String("{\\~O}"));
  393     str.replace(QChar(214), QLatin1String("{\\\"O}"));
  394     str.replace(QChar(216), QLatin1String("{\\O}"));
  395     str.replace(QChar(217), QLatin1String("{\\`U}"));
  396     str.replace(QChar(218), QLatin1String("{\\'U}"));
  397     str.replace(QChar(219), QLatin1String("{\\^U}"));
  398     str.replace(QChar(220), QLatin1String("{\\\"U}"));
  399     str.replace(QChar(221), QLatin1String("{\\'Y}"));
  400     str.replace(QChar(223), QLatin1String("{\\ss}"));
  401     str.replace(QChar(224), QLatin1String("{\\`a}"));
  402     str.replace(QChar(225), QLatin1String("{\\'a}"));
  403     str.replace(QChar(226), QLatin1String("{\\^a}"));
  404     str.replace(QChar(227), QLatin1String("{\\~a}"));
  405     str.replace(QChar(228), QLatin1String("{\\\"a}"));
  406     str.replace(QChar(229), QLatin1String("{\\aa}"));
  407     str.replace(QChar(230), QLatin1String("{\\ae}"));
  408     str.replace(QChar(231), QLatin1String("{\\c c}"));
  409     str.replace(QChar(232), QLatin1String("{\\`e}"));
  410     str.replace(QChar(233), QLatin1String("{\\'e}"));
  411     str.replace(QChar(234), QLatin1String("{\\^e}"));
  412     str.replace(QChar(235), QLatin1String("{\\\"e}"));
  413     str.replace(QChar(236), QLatin1String("{\\`i}"));
  414     str.replace(QChar(237), QLatin1String("{\\'i}"));
  415     str.replace(QChar(238), QLatin1String("{\\^i}"));
  416     str.replace(QChar(239), QLatin1String("{\\\"i}"));
  417     str.replace(QChar(241), QLatin1String("{\\~n}"));
  418     str.replace(QChar(242), QLatin1String("{\\`o}"));
  419     str.replace(QChar(243), QLatin1String("{\\'o}"));
  420     str.replace(QChar(244), QLatin1String("{\\^o}"));
  421     str.replace(QChar(245), QLatin1String("{\\~o}"));
  422     str.replace(QChar(246), QLatin1String("{\\\"o}"));
  423     str.replace(QChar(248), QLatin1String("{\\o}"));
  424     str.replace(QChar(249), QLatin1String("{\\`u}"));
  425     str.replace(QChar(250), QLatin1String("{\\'u}"));
  426     str.replace(QChar(251), QLatin1String("{\\^u}"));
  427     str.replace(QChar(252), QLatin1String("{\\\"u}"));
  428     str.replace(QChar(253), QLatin1String("{\\'y}"));
  429     str.replace(QChar(255), QLatin1String("{\\\"y}"));
  430     str.replace(QChar(256), QLatin1String("{\\=A}"));
  431     str.replace(QChar(257), QLatin1String("{\\=a}"));
  432     str.replace(QChar(258), QLatin1String("{\\u A}"));
  433     str.replace(QChar(259), QLatin1String("{\\u a}"));
  434     str.replace(QChar(260), QLatin1String("{\\c A}"));
  435     str.replace(QChar(261), QLatin1String("{\\c a}"));
  436     str.replace(QChar(262), QLatin1String("{\\'C}"));
  437     str.replace(QChar(263), QLatin1String("{\\'c}"));
  438     str.replace(QChar(264), QLatin1String("{\\^C}"));
  439     str.replace(QChar(265), QLatin1String("{\\^c}"));
  440     str.replace(QChar(266), QLatin1String("{\\.C}"));
  441     str.replace(QChar(267), QLatin1String("{\\.c}"));
  442     str.replace(QChar(268), QLatin1String("{\\v C}"));
  443     str.replace(QChar(269), QLatin1String("{\\v c}"));
  444     str.replace(QChar(270), QLatin1String("{\\v D}"));
  445     str.replace(QChar(271), QLatin1String("{\\v d}"));
  446     str.replace(QChar(272), QLatin1String("{\\DJ}"));
  447     str.replace(QChar(273), QLatin1String("{\\dj}"));
  448     str.replace(QChar(274), QLatin1String("{\\=E}"));
  449     str.replace(QChar(275), QLatin1String("{\\=e}"));
  450     str.replace(QChar(276), QLatin1String("{\\u E}"));
  451     str.replace(QChar(277), QLatin1String("{\\u e}"));
  452     str.replace(QChar(278), QLatin1String("{\\.E}"));
  453     str.replace(QChar(279), QLatin1String("{\\.e}"));
  454     str.replace(QChar(280), QLatin1String("{\\c E}"));
  455     str.replace(QChar(281), QLatin1String("{\\c e}"));
  456     str.replace(QChar(282), QLatin1String("{\\v E}"));
  457     str.replace(QChar(283), QLatin1String("{\\v e}"));
  458     str.replace(QChar(284), QLatin1String("{\\^G}"));
  459     str.replace(QChar(285), QLatin1String("{\\^g}"));
  460     str.replace(QChar(286), QLatin1String("{\\u G}"));
  461     str.replace(QChar(287), QLatin1String("{\\u g}"));
  462     str.replace(QChar(288), QLatin1String("{\\.G}"));
  463     str.replace(QChar(289), QLatin1String("{\\.g}"));
  464     str.replace(QChar(290), QLatin1String("{\\c G}"));
  465     str.replace(QChar(291), QLatin1String("{\\c g}"));
  466     str.replace(QChar(292), QLatin1String("{\\^H}"));
  467     str.replace(QChar(293), QLatin1String("{\\^h}"));
  468     str.replace(QChar(294), QLatin1String("{\\H}"));
  469     str.replace(QChar(295), QLatin1String("{\\h}"));
  470     str.replace(QChar(296), QLatin1String("{\\~I}"));
  471     str.replace(QChar(297), QLatin1String("{\\~i}"));
  472     str.replace(QChar(298), QLatin1String("{\\=I}"));
  473     str.replace(QChar(299), QLatin1String("{\\=i}"));
  474     str.replace(QChar(300), QLatin1String("{\\u I}"));
  475     str.replace(QChar(301), QLatin1String("{\\u i}"));
  476     str.replace(QChar(302), QLatin1String("{\\c I}"));
  477     str.replace(QChar(303), QLatin1String("{\\c i}"));
  478     str.replace(QChar(304), QLatin1String("{\\.I}"));
  479     str.replace(QChar(305), QLatin1String("{\\i}"));
  480     str.replace(QChar(321), QLatin1String("{\\L}"));
  481     str.replace(QChar(322), QLatin1String("{\\l}"));
  482     str.replace(QChar(323), QLatin1String("{\\'N}"));
  483     str.replace(QChar(324), QLatin1String("{\\'n}"));
  484     str.replace(QChar(325), QLatin1String("{\\c N}"));
  485     str.replace(QChar(326), QLatin1String("{\\c n}"));
  486     str.replace(QChar(327), QLatin1String("{\\v N}"));
  487     str.replace(QChar(328), QLatin1String("{\\v n}"));
  488     str.replace(QChar(332), QLatin1String("{\\=O}"));
  489     str.replace(QChar(333), QLatin1String("{\\=o}"));
  490     str.replace(QChar(334), QLatin1String("{\\u O}"));
  491     str.replace(QChar(335), QLatin1String("{\\u o}"));
  492     str.replace(QChar(336), QLatin1String("{\\H O}"));
  493     str.replace(QChar(337), QLatin1String("{\\H o}"));
  494     str.replace(QChar(338), QLatin1String("{\\OE}"));
  495     str.replace(QChar(339), QLatin1String("{\\oe}"));
  496     str.replace(QChar(340), QLatin1String("{\\'R}"));
  497     str.replace(QChar(341), QLatin1String("{\\'r}"));
  498     str.replace(QChar(342), QLatin1String("{\\c R}"));
  499     str.replace(QChar(343), QLatin1String("{\\c r}"));
  500     str.replace(QChar(344), QLatin1String("{\\v R}"));
  501     str.replace(QChar(345), QLatin1String("{\\v r}"));
  502     str.replace(QChar(346), QLatin1String("{\\'S}"));
  503     str.replace(QChar(347), QLatin1String("{\\'s}"));
  504     str.replace(QChar(348), QLatin1String("{\\^S}"));
  505     str.replace(QChar(349), QLatin1String("{\\^s}"));
  506     str.replace(QChar(350), QLatin1String("{\\c S}"));
  507     str.replace(QChar(351), QLatin1String("{\\c s}"));
  508     str.replace(QChar(352), QLatin1String("{\\v S}"));
  509     str.replace(QChar(353), QLatin1String("{\\v s}"));
  510     str.replace(QChar(354), QLatin1String("{\\c T}"));
  511     str.replace(QChar(355), QLatin1String("{\\c t}"));
  512     str.replace(QChar(356), QLatin1String("{\\v T}"));
  513     str.replace(QChar(357), QLatin1String("{\\v t}"));
  514     str.replace(QChar(374), QLatin1String("{\\^Y}"));
  515     str.replace(QChar(375), QLatin1String("{\\^y}"));
  516     str.replace(QChar(376), QLatin1String("{\\\"Y}"));
  517     str.replace(QChar(377), QLatin1String("{\\'Z}"));
  518     str.replace(QChar(378), QLatin1String("{\\'z}"));
  519     str.replace(QChar(379), QLatin1String("{\\.Z}"));
  520     str.replace(QChar(380), QLatin1String("{\\.z}"));
  521     str.replace(QChar(381), QLatin1String("{\\v Z}"));
  522     str.replace(QChar(382), QLatin1String("{\\v z}"));
  523     // Escape common Greek and math
  524     // Some uppercases might require engrec package
  525     str.replace(QChar(181), QLatin1String("$\\mu$"));
  526     str.replace(QChar(183), QLatin1String("$\\cdot$"));
  527     str.replace(QChar(913), QLatin1String("$\\Alpha$"));
  528     str.replace(QChar(914), QLatin1String("$\\Beta$"));
  529     str.replace(QChar(915), QLatin1String("$\\Gamma$"));
  530     str.replace(QChar(916), QLatin1String("$\\Delta$"));
  531     str.replace(QChar(917), QLatin1String("$\\Epsilon$"));
  532     str.replace(QChar(918), QLatin1String("$\\Zeta$"));
  533     str.replace(QChar(919), QLatin1String("$\\Eta$"));
  534     str.replace(QChar(920), QLatin1String("$\\Theta$"));
  535     str.replace(QChar(921), QLatin1String("$\\Iota$"));
  536     str.replace(QChar(922), QLatin1String("$\\Kappa$"));
  537     str.replace(QChar(923), QLatin1String("$\\Lambda$"));
  538     str.replace(QChar(924), QLatin1String("$\\Mu$"));
  539     str.replace(QChar(925), QLatin1String("$\\Nu$"));
  540     str.replace(QChar(926), QLatin1String("$\\Xi$"));
  541     str.replace(QChar(927), QLatin1String("$\\Omicron$"));
  542     str.replace(QChar(928), QLatin1String("$\\Pi$"));
  543     str.replace(QChar(929), QLatin1String("$\\Rho$"));
  544     str.replace(QChar(931), QLatin1String("$\\Sigma$"));
  545     str.replace(QChar(932), QLatin1String("$\\Tau$"));
  546     str.replace(QChar(933), QLatin1String("$\\Upsilon$"));
  547     str.replace(QChar(934), QLatin1String("$\\Phi$"));
  548     str.replace(QChar(935), QLatin1String("$\\Chi$"));
  549     str.replace(QChar(936), QLatin1String("$\\Psi$"));
  550     str.replace(QChar(937), QLatin1String("$\\Omega$"));
  551     str.replace(QChar(945), QLatin1String("$\\alpha$"));
  552     str.replace(QChar(946), QLatin1String("$\\beta$"));
  553     str.replace(QChar(947), QLatin1String("$\\gamma$"));
  554     str.replace(QChar(948), QLatin1String("$\\delta$"));
  555     str.replace(QChar(949), QLatin1String("$\\varepsilon$"));
  556     str.replace(QChar(950), QLatin1String("$\\zeta$"));
  557     str.replace(QChar(951), QLatin1String("$\\eta$"));
  558     str.replace(QChar(952), QLatin1String("$\\theta$"));
  559     str.replace(QChar(953), QLatin1String("$\\iota$"));
  560     str.replace(QChar(954), QLatin1String("$\\kappa$"));
  561     str.replace(QChar(955), QLatin1String("$\\lambda$"));
  562     str.replace(QChar(956), QLatin1String("$\\mu$"));
  563     str.replace(QChar(957), QLatin1String("$\\nu$"));
  564     str.replace(QChar(958), QLatin1String("$\\xi$"));
  565     str.replace(QChar(959), QLatin1String("$\\omicron$"));
  566     str.replace(QChar(960), QLatin1String("$\\pi$"));
  567     str.replace(QChar(961), QLatin1String("$\\rho$"));
  568     str.replace(QChar(962), QLatin1String("$\\varsigma$"));
  569     str.replace(QChar(963), QLatin1String("$\\sigma$"));
  570     str.replace(QChar(964), QLatin1String("$\\tau$"));
  571     str.replace(QChar(965), QLatin1String("$\\upsilon$"));
  572     str.replace(QChar(966), QLatin1String("$\\phi$"));
  573     str.replace(QChar(967), QLatin1String("$\\chi$"));
  574     str.replace(QChar(968), QLatin1String("$\\psi$"));
  575     str.replace(QChar(969), QLatin1String("$\\omega$"));
  576     str.replace(QChar(977), QLatin1String("$\\vartheta$"));
  577     str.replace(QChar(981), QLatin1String("$\\varphi$"));
  578     str.replace(QChar(982), QLatin1String("$\\varpi$"));
  579     str.replace(QChar(989), QLatin1String("$\\digamma$"));
  580     str.replace(QChar(1008), QLatin1String("$\\varkappa$"));
  581     str.replace(QChar(1009), QLatin1String("$\\varrho$"));
  582     str.replace(QChar(1013), QLatin1String("$\\epsilon$"));
  583     str.replace(QChar(8211), '-');
  584     str.replace(QChar(8462), QLatin1String("$\\hbar$"));
  585     str.replace(QChar(8463), QLatin1String("$\\hslash$"));
  586     str.replace(QChar(8467), QLatin1String("$\\ell$"));
  587     str.replace(QChar(8476), QLatin1String("$\\Re$"));
  588     str.replace(QChar(8706), QLatin1String("$\\partial$"));
  589     str.replace(QChar(8722), '-');
  590     str.replace(QChar(8734), QLatin1String("$\\infty$"));
  591     str.replace(QChar(8764), QLatin1String("$\\sim$"));
  592     str.replace(QChar(8943), QLatin1String("$\\cdots$"));
  593 
  594     return str;
  595 }
  596 
  597 QHash<QString, QChar> latex_to_unicode()
  598 {
  599     QHash<QString, QChar> lu;
  600 
  601     lu.insert(QLatin1String("{\\&}"), QChar(38));
  602     lu.insert(QLatin1String("$\\cdot$"), QChar(183));
  603     lu.insert(QLatin1String("{\\`A}"), QChar(192));
  604     lu.insert(QLatin1String("{\\'A}"), QChar(193));
  605     lu.insert(QLatin1String("{\\^A}"), QChar(194));
  606     lu.insert(QLatin1String("{\\~A}"), QChar(195));
  607     lu.insert(QLatin1String("{\\\"A}"), QChar(196));
  608     lu.insert(QLatin1String("{\\AA}"), QChar(197));
  609     lu.insert(QLatin1String("{\\AE}"), QChar(198));
  610     lu.insert(QLatin1String("{\\c C}"), QChar(199));
  611     lu.insert(QLatin1String("{\\`E}"), QChar(200));
  612     lu.insert(QLatin1String("{\\'E}"), QChar(201));
  613     lu.insert(QLatin1String("{\\^E}"), QChar(202));
  614     lu.insert(QLatin1String("{\\\"E}"), QChar(203));
  615     lu.insert(QLatin1String("{\\`I}"), QChar(204));
  616     lu.insert(QLatin1String("{\\'I}"), QChar(205));
  617     lu.insert(QLatin1String("{\\^I}"), QChar(206));
  618     lu.insert(QLatin1String("{\\\"I}"), QChar(207));
  619     lu.insert(QLatin1String("{\\~N}"), QChar(209));
  620     lu.insert(QLatin1String("{\\`O}"), QChar(210));
  621     lu.insert(QLatin1String("{\\'O}"), QChar(211));
  622     lu.insert(QLatin1String("{\\^O}"), QChar(212));
  623     lu.insert(QLatin1String("{\\~O}"), QChar(213));
  624     lu.insert(QLatin1String("{\\\"O}"), QChar(214));
  625     lu.insert(QLatin1String("{\\O}"), QChar(216));
  626     lu.insert(QLatin1String("{\\`U}"), QChar(217));
  627     lu.insert(QLatin1String("{\\'U}"), QChar(218));
  628     lu.insert(QLatin1String("{\\^U}"), QChar(219));
  629     lu.insert(QLatin1String("{\\\"U}"), QChar(220));
  630     lu.insert(QLatin1String("{\\'Y}"), QChar(221));
  631     lu.insert(QLatin1String("{\\ss}"), QChar(223));
  632     lu.insert(QLatin1String("{\\`a}"), QChar(224));
  633     lu.insert(QLatin1String("{\\'a}"), QChar(225));
  634     lu.insert(QLatin1String("{\\^a}"), QChar(226));
  635     lu.insert(QLatin1String("{\\~a}"), QChar(227));
  636     lu.insert(QLatin1String("{\\\"a}"), QChar(228));
  637     lu.insert(QLatin1String("{\\aa}"), QChar(229));
  638     lu.insert(QLatin1String("{\\ae}"), QChar(230));
  639     lu.insert(QLatin1String("{\\c c}"), QChar(231));
  640     lu.insert(QLatin1String("{\\`e}"), QChar(232));
  641     lu.insert(QLatin1String("{\\'e}"), QChar(233));
  642     lu.insert(QLatin1String("{\\^e}"), QChar(234));
  643     lu.insert(QLatin1String("{\\\"e}"), QChar(235));
  644     lu.insert(QLatin1String("{\\`\\i}"), QChar(236));
  645     lu.insert(QLatin1String("{\\`i}"), QChar(236));
  646     lu.insert(QLatin1String("{\\'\\i}"), QChar(237));
  647     lu.insert(QLatin1String("{\\'i}"), QChar(237));
  648     lu.insert(QLatin1String("{\\^\\i}"), QChar(238));
  649     lu.insert(QLatin1String("{\\^i}"), QChar(238));
  650     lu.insert(QLatin1String("{\\\"\\i}"), QChar(239));
  651     lu.insert(QLatin1String("{\\\"i}"), QChar(239));
  652     lu.insert(QLatin1String("{\\~n}"), QChar(241));
  653     lu.insert(QLatin1String("{\\`o}"), QChar(242));
  654     lu.insert(QLatin1String("{\\'o}"), QChar(243));
  655     lu.insert(QLatin1String("{\\^o}"), QChar(244));
  656     lu.insert(QLatin1String("{\\~o}"), QChar(245));
  657     lu.insert(QLatin1String("{\\\"o}"), QChar(246));
  658     lu.insert(QLatin1String("{\\o}"), QChar(248));
  659     lu.insert(QLatin1String("{\\`u}"), QChar(249));
  660     lu.insert(QLatin1String("{\\'u}"), QChar(250));
  661     lu.insert(QLatin1String("{\\^u}"), QChar(251));
  662     lu.insert(QLatin1String("{\\\"u}"), QChar(252));
  663     lu.insert(QLatin1String("{\\'y}"), QChar(253));
  664     lu.insert(QLatin1String("{\\\"y}"), QChar(255));
  665     lu.insert(QLatin1String("{\\=A}"), QChar(256));
  666     lu.insert(QLatin1String("{\\=a}"), QChar(257));
  667     lu.insert(QLatin1String("{\\u A}"), QChar(258));
  668     lu.insert(QLatin1String("{\\u a}"), QChar(259));
  669     lu.insert(QLatin1String("{\\c A}"), QChar(260));
  670     lu.insert(QLatin1String("{\\c a}"), QChar(261));
  671     lu.insert(QLatin1String("{\\'C}"), QChar(262));
  672     lu.insert(QLatin1String("{\\'c}"), QChar(263));
  673     lu.insert(QLatin1String("{\\^C}"), QChar(264));
  674     lu.insert(QLatin1String("{\\^c}"), QChar(265));
  675     lu.insert(QLatin1String("{\\.C}"), QChar(266));
  676     lu.insert(QLatin1String("{\\.c}"), QChar(267));
  677     lu.insert(QLatin1String("{\\v C}"), QChar(268));
  678     lu.insert(QLatin1String("{\\v c}"), QChar(269));
  679     lu.insert(QLatin1String("{\\v D}"), QChar(270));
  680     lu.insert(QLatin1String("{\\v d}"), QChar(271));
  681     lu.insert(QLatin1String("{\\DJ}"), QChar(272));
  682     lu.insert(QLatin1String("{\\dj}"), QChar(273));
  683     lu.insert(QLatin1String("{\\=E}"), QChar(274));
  684     lu.insert(QLatin1String("{\\=e}"), QChar(275));
  685     lu.insert(QLatin1String("{\\u E}"), QChar(276));
  686     lu.insert(QLatin1String("{\\u e}"), QChar(277));
  687     lu.insert(QLatin1String("{\\.E}"), QChar(278));
  688     lu.insert(QLatin1String("{\\.e}"), QChar(279));
  689     lu.insert(QLatin1String("{\\c E}"), QChar(280));
  690     lu.insert(QLatin1String("{\\c e}"), QChar(281));
  691     lu.insert(QLatin1String("{\\v E}"), QChar(282));
  692     lu.insert(QLatin1String("{\\v e}"), QChar(283));
  693     lu.insert(QLatin1String("{\\^G}"), QChar(284));
  694     lu.insert(QLatin1String("{\\^g}"), QChar(285));
  695     lu.insert(QLatin1String("{\\u G}"), QChar(286));
  696     lu.insert(QLatin1String("{\\u g}"), QChar(287));
  697     lu.insert(QLatin1String("{\\.G}"), QChar(288));
  698     lu.insert(QLatin1String("{\\.g}"), QChar(289));
  699     lu.insert(QLatin1String("{\\c G}"), QChar(290));
  700     lu.insert(QLatin1String("{\\c g}"), QChar(291));
  701     lu.insert(QLatin1String("{\\^H}"), QChar(292));
  702     lu.insert(QLatin1String("{\\^h}"), QChar(293));
  703     lu.insert(QLatin1String("{\\H}"), QChar(294));
  704     lu.insert(QLatin1String("{\\h}"), QChar(295));
  705     lu.insert(QLatin1String("{\\~I}"), QChar(296));
  706     lu.insert(QLatin1String("{\\~i}"), QChar(297));
  707     lu.insert(QLatin1String("{\\=I}"), QChar(298));
  708     lu.insert(QLatin1String("{\\=i}"), QChar(299));
  709     lu.insert(QLatin1String("{\\u I}"), QChar(300));
  710     lu.insert(QLatin1String("{\\u i}"), QChar(301));
  711     lu.insert(QLatin1String("{\\c I}"), QChar(302));
  712     lu.insert(QLatin1String("{\\c i}"), QChar(303));
  713     lu.insert(QLatin1String("{\\.I}"), QChar(304));
  714     lu.insert(QLatin1String("{\\i}"), QChar(305));
  715     lu.insert(QLatin1String("{\\L}"), QChar(321));
  716     lu.insert(QLatin1String("{\\l}"), QChar(322));
  717     lu.insert(QLatin1String("{\\'N}"), QChar(323));
  718     lu.insert(QLatin1String("{\\'n}"), QChar(324));
  719     lu.insert(QLatin1String("{\\c N}"), QChar(325));
  720     lu.insert(QLatin1String("{\\c n}"), QChar(326));
  721     lu.insert(QLatin1String("{\\v N}"), QChar(327));
  722     lu.insert(QLatin1String("{\\v n}"), QChar(328));
  723     lu.insert(QLatin1String("{\\=O}"), QChar(332));
  724     lu.insert(QLatin1String("{\\=o}"), QChar(333));
  725     lu.insert(QLatin1String("{\\u O}"), QChar(334));
  726     lu.insert(QLatin1String("{\\u o}"), QChar(335));
  727     lu.insert(QLatin1String("{\\H O}"), QChar(336));
  728     lu.insert(QLatin1String("{\\H o}"), QChar(337));
  729     lu.insert(QLatin1String("{\\OE}"), QChar(338));
  730     lu.insert(QLatin1String("{\\oe}"), QChar(339));
  731     lu.insert(QLatin1String("{\\'R}"), QChar(340));
  732     lu.insert(QLatin1String("{\\'r}"), QChar(341));
  733     lu.insert(QLatin1String("{\\c R}"), QChar(342));
  734     lu.insert(QLatin1String("{\\c r}"), QChar(343));
  735     lu.insert(QLatin1String("{\\v R}"), QChar(344));
  736     lu.insert(QLatin1String("{\\v r}"), QChar(345));
  737     lu.insert(QLatin1String("{\\'S}"), QChar(346));
  738     lu.insert(QLatin1String("{\\'s}"), QChar(347));
  739     lu.insert(QLatin1String("{\\^S}"), QChar(348));
  740     lu.insert(QLatin1String("{\\^s}"), QChar(349));
  741     lu.insert(QLatin1String("{\\c S}"), QChar(350));
  742     lu.insert(QLatin1String("{\\c s}"), QChar(351));
  743     lu.insert(QLatin1String("{\\v S}"), QChar(352));
  744     lu.insert(QLatin1String("{\\v s}"), QChar(353));
  745     lu.insert(QLatin1String("{\\c T}"), QChar(354));
  746     lu.insert(QLatin1String("{\\c t}"), QChar(355));
  747     lu.insert(QLatin1String("{\\v T}"), QChar(356));
  748     lu.insert(QLatin1String("{\\v t}"), QChar(357));
  749     lu.insert(QLatin1String("{\\^Y}"), QChar(374));
  750     lu.insert(QLatin1String("{\\^y}"), QChar(375));
  751     lu.insert(QLatin1String("{\\\"Y}"), QChar(376));
  752     lu.insert(QLatin1String("{\\'Z}"), QChar(377));
  753     lu.insert(QLatin1String("{\\'z}"), QChar(378));
  754     lu.insert(QLatin1String("{\\.Z}"), QChar(379));
  755     lu.insert(QLatin1String("{\\.z}"), QChar(380));
  756     lu.insert(QLatin1String("{\\v Z}"), QChar(381));
  757     lu.insert(QLatin1String("{\\v z}"), QChar(382));
  758     lu.insert(QLatin1String("$\\Alpha$"), QChar(913));
  759     lu.insert(QLatin1String("$\\Beta$"), QChar(914));
  760     lu.insert(QLatin1String("$\\Gamma$"), QChar(915));
  761     lu.insert(QLatin1String("$\\Delta$"), QChar(916));
  762     lu.insert(QLatin1String("$\\Epsilon$"), QChar(917));
  763     lu.insert(QLatin1String("$\\Zeta$"), QChar(918));
  764     lu.insert(QLatin1String("$\\Eta$"), QChar(919));
  765     lu.insert(QLatin1String("$\\Theta$"), QChar(920));
  766     lu.insert(QLatin1String("$\\Iota$"), QChar(921));
  767     lu.insert(QLatin1String("$\\Kappa$"), QChar(922));
  768     lu.insert(QLatin1String("$\\Lambda$"), QChar(923));
  769     lu.insert(QLatin1String("$\\Mu$"), QChar(924));
  770     lu.insert(QLatin1String("$\\Nu$"), QChar(925));
  771     lu.insert(QLatin1String("$\\Xi$"), QChar(926));
  772     lu.insert(QLatin1String("$\\Omicron$"), QChar(927));
  773     lu.insert(QLatin1String("$\\Pi$"), QChar(928));
  774     lu.insert(QLatin1String("$\\Rho$"), QChar(929));
  775     lu.insert(QLatin1String("$\\Sigma$"), QChar(931));
  776     lu.insert(QLatin1String("$\\Tau$"), QChar(932));
  777     lu.insert(QLatin1String("$\\Upsilon$"), QChar(933));
  778     lu.insert(QLatin1String("$\\Phi$"), QChar(934));
  779     lu.insert(QLatin1String("$\\Chi$"), QChar(935));
  780     lu.insert(QLatin1String("$\\Psi$"), QChar(936));
  781     lu.insert(QLatin1String("$\\Omega$"), QChar(937));
  782     lu.insert(QLatin1String("$\\alpha$"), QChar(945));
  783     lu.insert(QLatin1String("$\\beta$"), QChar(946));
  784     lu.insert(QLatin1String("$\\gamma$"), QChar(947));
  785     lu.insert(QLatin1String("$\\delta$"), QChar(948));
  786     lu.insert(QLatin1String("$\\varepsilon$"), QChar(949));
  787     lu.insert(QLatin1String("$\\zeta$"), QChar(950));
  788     lu.insert(QLatin1String("$\\eta$"), QChar(951));
  789     lu.insert(QLatin1String("$\\theta$"), QChar(952));
  790     lu.insert(QLatin1String("$\\iota$"), QChar(953));
  791     lu.insert(QLatin1String("$\\kappa$"), QChar(954));
  792     lu.insert(QLatin1String("$\\lambda$"), QChar(955));
  793     lu.insert(QLatin1String("$\\mu$"), QChar(956));
  794     lu.insert(QLatin1String("$\\nu$"), QChar(957));
  795     lu.insert(QLatin1String("$\\xi$"), QChar(958));
  796     lu.insert(QLatin1String("$\\omicron$"), QChar(959));
  797     lu.insert(QLatin1String("$\\pi$"), QChar(960));
  798     lu.insert(QLatin1String("$\\rho$"), QChar(961));
  799     lu.insert(QLatin1String("$\\sigmaf$"), QChar(962));
  800     lu.insert(QLatin1String("$\\varsigma$"), QChar(962));
  801     lu.insert(QLatin1String("$\\sigma$"), QChar(963));
  802     lu.insert(QLatin1String("$\\tau$"), QChar(964));
  803     lu.insert(QLatin1String("$\\upsilon$"), QChar(965));
  804     lu.insert(QLatin1String("$\\phi$"), QChar(966));
  805     lu.insert(QLatin1String("$\\chi$"), QChar(967));
  806     lu.insert(QLatin1String("$\\psi$"), QChar(968));
  807     lu.insert(QLatin1String("$\\omega$"), QChar(969));
  808     lu.insert(QLatin1String("$\\vartheta$"), QChar(977));
  809     lu.insert(QLatin1String("$\\varphi$"), QChar(981));
  810     lu.insert(QLatin1String("$\\varpi$"), QChar(982));
  811     lu.insert(QLatin1String("$\\digamma$"), QChar(989));
  812     lu.insert(QLatin1String("$\\varkappa$"), QChar(1008));
  813     lu.insert(QLatin1String("$\\varrho$"), QChar(1009));
  814     lu.insert(QLatin1String("$\\epsilon$"), QChar(1013));
  815     lu.insert(QLatin1String("$\\hbar$"), QChar(8462));
  816     lu.insert(QLatin1String("$\\hslash$"), QChar(8463));
  817     lu.insert(QLatin1String("$\\ell$"), QChar(8467));
  818     lu.insert(QLatin1String("$\\Re$"), QChar(8476));
  819     lu.insert(QLatin1String("$\\partial$"), QChar(8706));
  820     lu.insert(QLatin1String("$\\infty$"), QChar(8734));
  821     lu.insert(QLatin1String("$\\sim$"), QChar(8764));
  822     lu.insert(QLatin1String("$\\cdots$"), QChar(8943));
  823 
  824     return lu;
  825 }
  826 
// Lookup table from LaTeX markup, such as {\'e} or $\alpha$, to the
// corresponding Unicode character. It is filled once from latex_to_unicode().
const QHash<QString, QChar> latex_to_unicode_hash(latex_to_unicode());

// Matcher for a single backslash; bibToC2b uses it to locate the positions
// where a LaTeX command may start.
const txtmatcher hasLatex("\\", Qt::CaseSensitive, 0);
  830 
  831 QString& bibToC2b(QString& str)
  832 {
  833     const int length(str.length());
  834     if (length < 2)
  835         return str;
  836     int cursor(hasLatex.indexIn(str));
  837     if (cursor < 0)
  838         return str;
  839 
  840     QChar* const s(str.data());
  841     QString latex;
  842     bool modified(false);
  843     const QChar amp('&');
  844     const QChar close('}');
  845     const QChar del(0);
  846     const QChar dollar('$');
  847     const QChar open('{');
  848     const int mblen(7), mdlen(14);
  849 
  850     if (cursor == 0)
  851     {
  852         if (s[cursor + 1] == amp)
  853         {
  854             s[cursor] = del;
  855             modified = true;
  856         }
  857         ++cursor;
  858     }
  859     while (true)
  860     {
  861         cursor = hasLatex.indexIn(s, length, cursor);
  862         if (cursor < 1)
  863             break;
  864 
  865         if (s[cursor - 1] == open)
  866         {
  867             const int ls(cursor - 1);
  868             const int m(std::min(cursor + mblen, length));
  869             for (; cursor < m; ++cursor)
  870                 if (s[cursor] == close)
  871                 {
  872                     latex.setRawData(s + ls, cursor - ls + 1);
  873                     if (latex_to_unicode_hash.contains(latex))
  874                     {
  875                         s[ls] = latex_to_unicode_hash.value(latex);
  876                         for (int l = ls + 1; l <= cursor; ++l)
  877                             s[l] = del;
  878                         modified = true;
  879                     }
  880                     break;
  881                 }
  882             cursor = ls + 2;
  883             continue;
  884         }
  885         if (s[cursor - 1] == dollar)
  886         {
  887             const int ls(cursor - 1);
  888             const int m(std::min(cursor + mdlen, length));
  889             for (; cursor < m; ++cursor)
  890                 if (s[cursor] == dollar)
  891                 {
  892                     latex.setRawData(s + ls, cursor - ls + 1);
  893                     if (latex_to_unicode_hash.contains(latex))
  894                     {
  895                         s[ls] = latex_to_unicode_hash.value(latex);
  896                         for (int l = ls + 1; l <= cursor; ++l)
  897                             s[l] = del;
  898                         modified = true;
  899                     }
  900                     break;
  901                 }
  902             cursor = ls + 2;
  903             continue;
  904         }
  905         if (s[cursor + 1] == amp)
  906         {
  907             s[cursor] = del;
  908             modified = true;
  909             cursor += 2;
  910             continue;
  911         }
  912         cursor += 1;
  913         continue;
  914     }
  915     if (modified)
  916     {
  917         int c(0);
  918         for (int l = 0; l < length; ++l)
  919             if (s[l] != del)
  920                 s[c++] = s[l];
  921         str.truncate(c);
  922     }
  923     return str;
  924 }
  925 
// Pre-check used by fullBibToC2b: matches any single one of the characters
// '{', '[', '$', '(', backslash, '<' or '>'.
const QRegExp isBibToC2b("[\\{\\[\\$\\(\\\\<>]");
// Matches, ignoring case, one of the listed Greek letter names enclosed by an
// opening '{', '[' or '(' and a closing '}', ']' or ')'. The pattern does not
// require the two delimiters to be of the same kind. The name is captured as
// group 1.
const QRegExp pnasGreekLetters("[\\{\\[\\(](alpha|beta|gamma|delta|varepsilon|"
                               "zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|"
                               "omicron|pi|rho|sigmaf|varsigma|sigma|tau|"
                               "upsilon|phi|chi|psi|omega|vartheta|varphi|"
                               "varpi|digamma|varkappa|varrho|epsilon)[\\}\\]\\)]",
                               Qt::CaseInsensitive);
  933 
  934 QString& fullBibToC2b(QString& str)
  935 {
  936     // Escape TeX and other special characters to Unicode
  937     if (!str.contains(isBibToC2b))
  938         return str;
  939     // PNAS Style
  940     str.replace(pnasGreekLetters, "$\\\\1$");
  941     str.replace(QLatin1String("{micro}"), QLatin1String("$\\mu$"));
  942     str.replace(QLatin1String("{middle dot}"), QLatin1String("$\\cdot$"));
  943     // Sub and superscripts
  944     str.replace(QRegExp("<sub>([^<]*)</sub>"), "$_{\\1}$");
  945     str.replace(QRegExp("<sup>([^<]*)</sup>"), "$^{\\1}$");
  946     str.replace(QRegExp("\\[sub ([^\\]]*)\\]"), "$_{\\1}$");
  947     str.replace(QRegExp("\\[sup ([^\\]]*)\\]"), "$^{\\1}$");
  948     str.replace(QRegExp("\\}\\$\\$([_\\^])\\{"), "}\\1{"); // merge if both
  949     // Normalize TeX
  950     str.replace(QRegExp("\\{\\\\(\\W)\\{([\\\\]{0,1}\\w)\\}\\}"), "{\\\\1\\2}");  // {\'{A}} to {\'A}
  951     str.replace(QRegExp("\\{\\\\(\\w)\\{([\\\\]{0,1}\\w)\\}\\}"), "{\\\\1 \\2}"); // {\c{C}} to {\c C}
  952     str.replace(QRegExp("\\\\(\\W)\\{([\\\\]{0,1}\\w)\\}"), "{\\\\1\\2}");        // \'{A} to {\'A}
  953     str.replace(QRegExp("\\\\(\\w)\\{([\\\\]{0,1}\\w)\\}"), "{\\\\1 \\2}");       // \c{C} to {\c C}
  954     str.replace(QRegExp("\\{\\\\(.{1,2})\\{(\\w{0,2})\\}\\}"), "{\\\\1\\2}");     // {\\AA{}} to {\\AA}
  955     str.replace(QRegExp("\\\\\\{([A-Z]\\w{1,7})\\\\\\}"), "\\1");                 // \{NaCl\} to NaCl
  956     // TeX to Unicode
  957     bibToC2b(str);
  958     return str;
  959 }
  960 
  961 QString fromUtf8(const QByteArray& ba)
  962 {
  963     // Based on Qt's QString::fromUtf8 function. Input ba must be an UTF-8
  964     // encoded array produced by QString::toUtf8. Encoding correctness is
  965     // assumed and checking omitted for efficiency.
  966 
  967     const char* b = ba.constData();
  968     const char* const bn = b + ba.length();
  969     QString output;
  970     output.resize(ba.length());
  971     ushort* o = (ushort*)output.unicode();
  972     uint c;
  973     while (b < bn)
  974     {
  975         c = uint(*b);
  976         if (c & 0x80)
  977         {
  978             if ((c & 0xe0) == 0xc0)
  979             {
  980                 c &= 0x1f;
  981                 c = (uint)((c << 6) | (*++b & 0x3f));
  982             }
  983             else if ((c & 0xf0) == 0xe0)
  984             {
  985                 c &= 0x0f;
  986                 c = (uint)((c << 6) | (b[1] & 0x3f));
  987                 c = (uint)((c << 6) | (b[2] & 0x3f));
  988                 b += 2;
  989             }
  990             else if ((c & 0xf8) == 0xf0)
  991             {
  992                 c &= 0x07;
  993                 c = (uint)((c << 6) | (b[1] & 0x3f));
  994                 c = (uint)((c << 6) | (b[2] & 0x3f));
  995                 c = (uint)((c << 6) | (b[3] & 0x3f));
  996                 *o++ = QChar::highSurrogate(c);
  997                 c = QChar::lowSurrogate(c);
  998                 b += 3;
  999             }
 1000             else
 1001                 c = QChar::ReplacementCharacter;
 1002         }
 1003         *o++ = ushort(c);
 1004         ++b;
 1005     }
 1006     output.truncate(int(o - (ushort*)output.unicode()));
 1007     // QString check = QString::fromUtf8(ba);
 1008     // qDebug() << (check == output);
 1009     return output;
 1010 }
 1011 
 1012 #if defined(C2B_USE_LZSSE)
 1013 namespace lzsse
 1014 {
 1015 static const int HEADER_LENGTH = 10;
 1016 
 1017 QByteArray compress(const QByteArray& decompressed)
 1018 {
 1019     const int sdecompressed(decompressed.size());
 1020     if (sdecompressed == 0)
 1021         return QByteArray();
 1022 
 1023     LZSSE4_OptimalParseState* state(LZSSE4_MakeOptimalParseState(static_cast<size_t>(sdecompressed)));
 1024     if (!state)
 1025     {
 1026         warn("compress: lzsse: internal error - compression failed");
 1027         return QByteArray();
 1028     }
 1029     QByteArray compressed;
 1030     const int sdst(sdecompressed);
 1031     const int scompressed(HEADER_LENGTH + sdst);
 1032     compressed.resize(scompressed);
 1033     char* dst(compressed.data() + HEADER_LENGTH);
 1034     const unsigned int clevel(17);
 1035     const int sactual(LZSSE4_CompressOptimalParse(state, decompressed.constData(), sdecompressed, dst, sdst, clevel));
 1036     LZSSE4_FreeOptimalParseState(state);
 1037     if (sactual == 0)
 1038     {
 1039         warn("compress: lzsse: internal error - compression failed");
 1040         return QByteArray();
 1041     }
 1042     compressed.resize(sactual + HEADER_LENGTH);
 1043 
 1044     QByteArray header(QByteArray::number(qulonglong(sdecompressed)));
 1045     for (int i = 0; i < std::min(header.length(), HEADER_LENGTH); ++i)
 1046         compressed[i] = header[i];
 1047     for (int i = std::min(header.length(), HEADER_LENGTH); i < HEADER_LENGTH; ++i)
 1048         compressed[i] = ' ';
 1049     return compressed;
 1050 }
 1051 
 1052 QByteArray decompress(const QByteArray& compressed)
 1053 {
 1054     const int scompressed(compressed.size());
 1055     if (scompressed <= HEADER_LENGTH)
 1056         return QByteArray();
 1057 
 1058     const int sdecompressed(compressed.left(HEADER_LENGTH).trimmed().toULong());
 1059     QByteArray decompressed;
 1060     decompressed.resize(sdecompressed);
 1061     const int ssrc(scompressed - HEADER_LENGTH);
 1062     const char* src(compressed.constData() + HEADER_LENGTH);
 1063 
 1064     const int sactual(LZSSE4_Decompress(src, ssrc, decompressed.data(), sdecompressed));
 1065     if (sactual != sdecompressed)
 1066     {
 1067         warn("decompress: lzsse: internal error - decompression failed");
 1068         return QByteArray();
 1069     }
 1070     return decompressed;
 1071 }
 1072 
 1073 } // namespace lzsse
 1074 
 1075 QByteArray compressString(const QString& decompressed)
 1076 {
 1077     return lzsse::compress(
 1078                QByteArray(reinterpret_cast<const char*>(decompressed.unicode()), decompressed.size() * sizeof(ushort)));
 1079 }
 1080 
 1081 QString decompressString(const QByteArray& compressed)
 1082 {
 1083     const int scompressed(compressed.size());
 1084     if (scompressed <= lzsse::HEADER_LENGTH)
 1085         return QString();
 1086 
 1087     const int sdecompressedchars(compressed.left(lzsse::HEADER_LENGTH).trimmed().toULong());
 1088     const int sdecompressed(sdecompressedchars / sizeof(ushort));
 1089     QString decompressed;
 1090     decompressed.resize(sdecompressed);
 1091     const int ssrc(scompressed - lzsse::HEADER_LENGTH);
 1092     const char* src(compressed.constData() + lzsse::HEADER_LENGTH);
 1093 
 1094     const int sactual(LZSSE4_Decompress(src, ssrc, reinterpret_cast<char*>(decompressed.data()), sdecompressedchars));
 1095     if (sactual != sdecompressedchars)
 1096     {
 1097         warn("decompress: lzsse: internal error - decompression failed");
 1098         return QString();
 1099     }
 1100     return decompressed;
 1101 }
 1102 #elif defined(C2B_USE_LZ4)
 1103 namespace lz4
 1104 {
 1105 static const int HEADER_LENGTH = 10;
 1106 
 1107 QByteArray compress(const QByteArray& decompressed)
 1108 {
 1109     const int sdecompressed(decompressed.size());
 1110     if (sdecompressed == 0)
 1111         return QByteArray();
 1112 
 1113     QByteArray compressed;
 1114     const int sdst(LZ4_compressBound(sdecompressed));
 1115     const int scompressed(HEADER_LENGTH + sdst);
 1116     compressed.resize(scompressed);
 1117     char* dst(compressed.data() + HEADER_LENGTH);
 1118 
 1119     const int sactual(LZ4_compress_HC(decompressed.constData(), dst, sdecompressed, sdst, LZ4HC_CLEVEL_DEFAULT));
 1120     if (sactual <= 0)
 1121     {
 1122         warn("compress: lz4: internal error - compression failed");
 1123         return QByteArray();
 1124     }
 1125     compressed.resize(sactual + HEADER_LENGTH);
 1126 
 1127     QByteArray header(QByteArray::number(qulonglong(sdecompressed)));
 1128     for (int i = 0; i < std::min(header.length(), HEADER_LENGTH); ++i)
 1129         compressed[i] = header[i];
 1130     for (int i = std::min(header.length(), HEADER_LENGTH); i < HEADER_LENGTH; ++i)
 1131         compressed[i] = ' ';
 1132     return compressed;
 1133 }
 1134 
 1135 QByteArray decompress(const QByteArray& compressed)
 1136 {
 1137     const int scompressed(compressed.size());
 1138     if (scompressed <= HEADER_LENGTH)
 1139         return QByteArray();
 1140 
 1141     const int sdecompressed(compressed.left(HEADER_LENGTH).trimmed().toULong());
 1142     QByteArray decompressed;
 1143     decompressed.resize(sdecompressed);
 1144     const int ssrc(scompressed - HEADER_LENGTH);
 1145     const char* src(compressed.constData() + HEADER_LENGTH);
 1146 
 1147     const int sactual(LZ4_decompress_safe(src, decompressed.data(), ssrc, sdecompressed));
 1148     if (sactual != sdecompressed)
 1149     {
 1150         warn("decompress: lz4: internal error - decompression failed");
 1151         return QByteArray();
 1152     }
 1153     return decompressed;
 1154 }
 1155 
 1156 } // namespace lz4
 1157 
 1158 QByteArray compressString(const QString& decompressed)
 1159 {
 1160     return lz4::compress(
 1161                QByteArray(reinterpret_cast<const char*>(decompressed.unicode()), decompressed.size() * sizeof(ushort)));
 1162 }
 1163 
 1164 QString decompressString(const QByteArray& compressed)
 1165 {
 1166     const int scompressed(compressed.size());
 1167     if (scompressed <= lz4::HEADER_LENGTH)
 1168         return QString();
 1169 
 1170     const int sdecompressedchars(compressed.left(lz4::HEADER_LENGTH).trimmed().toULong());
 1171     const int sdecompressed(sdecompressedchars / sizeof(ushort));
 1172     QString decompressed;
 1173     decompressed.resize(sdecompressed);
 1174     const int ssrc(scompressed - lz4::HEADER_LENGTH);
 1175     const char* src(compressed.constData() + lz4::HEADER_LENGTH);
 1176 
 1177     const int sactual(LZ4_decompress_safe(src, reinterpret_cast<char*>(decompressed.data()), ssrc, sdecompressedchars));
 1178     if (sactual != sdecompressedchars)
 1179     {
 1180         warn("decompress: lz4: internal error - decompression failed");
 1181         return QString();
 1182     }
 1183     return decompressed;
 1184 }
 1185 #elif defined(C2B_USE_LZO)
 1186 namespace lzo
 1187 {
 1188 static const int HEADER_LENGTH = 10;
 1189 
 1190 QByteArray compress(const QByteArray& data)
 1191 {
 1192     const lzo_uint src_s(data.size());
 1193     if (src_s == 0)
 1194         return QByteArray();
 1195     if (lzo_init() != LZO_E_OK)
 1196     {
 1197         warn("compress: lzo: internal error - initialization failed");
 1198         return QByteArray();
 1199     }
 1200     const lzo_bytep src_p = reinterpret_cast<const lzo_bytep>(data.constData());
 1201 
 1202     QByteArray compressed;
 1203     const lzo_uint dest_s = HEADER_LENGTH + (src_s + src_s / 16 + 64 + 3);
 1204     compressed.resize((int)dest_s);
 1205     lzo_bytep compressed_p = reinterpret_cast<lzo_bytep>(compressed.data());
 1206     lzo_bytep dest_p = compressed_p + HEADER_LENGTH;
 1207     lzo_uint enc_dest_s = dest_s - HEADER_LENGTH;
 1208 
 1209     QByteArray wrkmem;
 1210     wrkmem.resize(LZO1X_999_MEM_COMPRESS);
 1211     lzo_bytep wrkmem_p = reinterpret_cast<lzo_bytep>(wrkmem.data());
 1212 
 1213     if (lzo1x_999_compress(src_p, src_s, dest_p, &enc_dest_s, wrkmem_p) == LZO_E_OK)
 1214     {
 1215         compressed.resize((int)enc_dest_s + HEADER_LENGTH);
 1216         QByteArray header(QByteArray::number((qulonglong)src_s));
 1217         for (int i = 0; i < std::min(header.length(), HEADER_LENGTH); ++i)
 1218             compressed[i] = header[i];
 1219         for (int i = std::min(header.length(), HEADER_LENGTH); i < HEADER_LENGTH; ++i)
 1220             compressed[i] = ' ';
 1221         return compressed;
 1222     }
 1223     else
 1224     {
 1225         warn("compress: lzo: internal error - compression failed");
 1226         return QByteArray();
 1227     }
 1228 }
 1229 
 1230 QByteArray decompress(const QByteArray& data)
 1231 {
 1232     const lzo_uint data_s(data.size());
 1233     if (data_s <= (unsigned int)HEADER_LENGTH)
 1234         return QByteArray();
 1235     if (lzo_init() != LZO_E_OK)
 1236     {
 1237         warn("decompress: lzo: internal error - initialization failed");
 1238         return QByteArray();
 1239     }
 1240     const lzo_bytep data_p = reinterpret_cast<const lzo_bytep>(data.constData());
 1241     const lzo_bytep src_p = data_p + HEADER_LENGTH;
 1242     const lzo_uint src_s = data_s - HEADER_LENGTH;
 1243 
 1244     QByteArray decompressed;
 1245     const ulong expected_dest_s = data.left(HEADER_LENGTH).trimmed().toULong();
 1246     lzo_uint dest_s = std::max(expected_dest_s, lzo_uint(1));
 1247     decompressed.resize((int)dest_s);
 1248     lzo_bytep dest_p = reinterpret_cast<lzo_bytep>(decompressed.data());
 1249 
 1250     if (lzo1x_decompress(src_p, src_s, dest_p, &dest_s, NULL) == LZO_E_OK && (ulong)dest_s == expected_dest_s)
 1251         return decompressed;
 1252     else
 1253     {
 1254         warn("decompress: lzo: internal error - decompression failed");
 1255         return QByteArray();
 1256     }
 1257 }
 1258 
 1259 } // namespace lzo
 1260 
 1261 QByteArray compressString(const QString& decompressed)
 1262 {
 1263     return lzo::compress(decompressed.toUtf8());
 1264 }
 1265 
 1266 QString decompressString(const QByteArray& compressed)
 1267 {
 1268     return fromUtf8(lzo::decompress(compressed));
 1269 }
 1270 #else
 1271 QByteArray compressString(const QString& decompressed)
 1272 {
 1273     return qCompress(decompressed.toUtf8());
 1274 }
 1275 
 1276 QString decompressString(const QByteArray& compressed)
 1277 {
 1278     return fromUtf8(qUncompress(compressed));
 1279 }
 1280 #endif
 1281 
 1282 } // namespace c2bUtils