"Fossies" - the Fresh Open Source Software Archive

Member "regexxer-0.10/src/stringutils.cc" (6 Oct 2011, 17978 Bytes) of package /linux/privat/old/regexxer-0.10.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "stringutils.cc", see the Fossies "Dox" file reference documentation.

    1 /*
    2  * Copyright (c) 2002-2007  Daniel Elstner  <daniel.kitta@gmail.com>
    3  *
    4  * This file is part of regexxer.
    5  *
    6  * regexxer is free software; you can redistribute it and/or modify
    7  * it under the terms of the GNU General Public License as published by
    8  * the Free Software Foundation; either version 2 of the License, or
    9  * (at your option) any later version.
   10  *
   11  * regexxer is distributed in the hope that it will be useful,
   12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   14  * GNU General Public License for more details.
   15  *
   16  * You should have received a copy of the GNU General Public License
   17  * along with regexxer; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   19  */
   20 
#include "stringutils.h"

#include <glib.h>
#include <glib-object.h>
#include <glibmm.h>
#include <gdkmm/color.h>

#include <algorithm>
#include <iomanip>
#include <limits>
#include <locale>
#include <sstream>
#include <stdexcept>
#include <utility>
#include <vector>
   35 
   36 namespace
   37 {
   38 
// A case modifier recorded while expanding a substitution string: the byte
// offset in the expanded text at which it takes effect (first) and the
// modifier letter, one of 'L', 'U', 'l', 'u' or 'E' (second).
typedef std::pair<int, char> ModPos;
   40 
   41 class ScopedTypeClass
   42 {
   43 private:
   44   void* class_;
   45 
   46   ScopedTypeClass(const ScopedTypeClass&);
   47   ScopedTypeClass& operator=(const ScopedTypeClass&);
   48 
   49 public:
   50   explicit ScopedTypeClass(GType type)
   51     : class_ (g_type_class_ref(type)) {}
   52 
   53   ~ScopedTypeClass() { g_type_class_unref(class_); }
   54 
   55   void* get() const { return class_; }
   56 };
   57 
   58 static inline
   59 bool is_significant_encoding_char(char c)
   60 {
   61   switch (c)
   62   {
   63     case ' ': case '-': case '_': case '.': case ':':
   64       return false;
   65   }
   66 
   67   return true;
   68 }
   69 
   70 static inline
   71 unsigned int scale_to_8bit(unsigned int value)
   72 {
   73   return (value & 0xFF00) >> 8;
   74 }
   75 
   76 static inline
   77 bool ascii_isodigit(char c)
   78 {
   79   return (c >= '0' && c <= '7');
   80 }
   81 
   82 static
   83 std::string apply_modifiers(const std::string& subject, const std::vector<ModPos>& modifiers)
   84 {
   85   std::string result;
   86   result.reserve(subject.size());
   87 
   88   int idx = 0;
   89 
   90   const std::vector<ModPos>::const_iterator pend = modifiers.end();
   91   std::vector<ModPos>::const_iterator       p    = modifiers.begin();
   92 
   93   while (p != pend)
   94   {
   95     const int start = p->first;
   96     result.append(subject, idx, start - idx);
   97     idx = start;
   98 
   99     const char mod = p->second;
  100     ++p;
  101 
  102     switch (mod)
  103     {
  104       case 'L': case 'U':
  105       {
  106         while (p != pend && (p->second == 'l' || p->second == 'u'))
  107           ++p;
  108 
  109         const int stop = (p == pend) ? subject.size() : p->first;
  110         const Glib::ustring slice (subject.begin() + start, subject.begin() + stop);
  111         const Glib::ustring str = (mod == 'L') ? slice.lowercase() : slice.uppercase();
  112 
  113         result.append(str.raw());
  114         idx = stop;
  115         break;
  116       }
  117       case 'l': case 'u': // TODO: Simplify.  This code is way too complicated.
  118       {
  119         if (unsigned(start) < subject.size())
  120         {
  121           while (p != pend && p->first == start && p->second != 'L' && p->second != 'U')
  122             ++p;
  123 
  124           if (p != pend && p->first == start)
  125           {
  126             const char submod = p->second;
  127 
  128             do
  129               ++p;
  130             while (p != pend && (p->second == 'l' || p->second == 'u'));
  131 
  132             const int stop = (p == pend) ? subject.size() : p->first;
  133             const Glib::ustring slice (subject.begin() + start, subject.begin() + stop);
  134             const Glib::ustring str = (submod == 'L') ? slice.lowercase() : slice.uppercase();
  135 
  136             if (!str.empty())
  137             {
  138               Glib::ustring::const_iterator cpos = str.begin();
  139               gunichar uc = *cpos++;
  140               uc = (mod == 'l') ? Glib::Unicode::tolower(uc) : Glib::Unicode::totitle(uc);
  141 
  142               if (Glib::Unicode::validate(uc))
  143                 result.append(Glib::ustring(1, uc).raw());
  144 
  145               result.append(cpos.base(), str.end().base());
  146             }
  147             idx = stop;
  148           }
  149           else
  150           {
  151             Glib::ustring::const_iterator cpos (subject.begin() + start);
  152             gunichar uc = *cpos++;
  153             uc = (mod == 'l') ? Glib::Unicode::tolower(uc) : Glib::Unicode::totitle(uc);
  154 
  155             if (Glib::Unicode::validate(uc))
  156               result.append(Glib::ustring(1, uc).raw());
  157 
  158             idx = cpos.base() - subject.begin();
  159           }
  160         }
  161         break;
  162       }
  163       case 'E':
  164       {
  165         break;
  166       }
  167       default:
  168       {
  169         g_assert_not_reached();
  170         break;
  171       }
  172     }
  173   }
  174 
  175   result.append(subject, idx, std::string::npos);
  176 
  177   return result;
  178 }
  179 
  180 static
  181 void parse_control_char(std::string::const_iterator& p, std::string::const_iterator pend,
  182                         std::string& dest)
  183 {
  184   const std::string::const_iterator pnext = p + 1;
  185 
  186   if (pnext != pend && (static_cast<unsigned char>(*pnext) & 0x80U) == 0)
  187   {
  188     p = pnext;
  189 
  190     // Flip bit 6 of the upcased character.
  191     const char c = static_cast<unsigned char>(Glib::Ascii::toupper(*pnext)) ^ 0x40U;
  192 
  193     // TextBuffer can't handle NUL; interpret it as empty string instead.
  194     if (c != '\0')
  195       dest += c;
  196   }
  197   else
  198     dest += 'c';
  199 }
  200 
  201 static
  202 void parse_hex_unichar(std::string::const_iterator& p, std::string::const_iterator pend,
  203                        std::string& dest)
  204 {
  205   using namespace Glib;
  206 
  207   std::string::const_iterator pstart = p + 1;
  208 
  209   if (pstart != pend)
  210   {
  211     if (*pstart == '{')
  212     {
  213       const std::string::const_iterator pstop = std::find(++pstart, pend, '}');
  214 
  215       if (pstop != pend)
  216       {
  217         p = pstop;
  218         gunichar uc = 0;
  219 
  220         for (; pstart != pstop; ++pstart)
  221         {
  222           if (!Ascii::isxdigit(*pstart))
  223             return;
  224 
  225           uc *= 0x10;
  226           uc += Ascii::xdigit_value(*pstart);
  227         }
  228 
  229         if (uc != 0 && Unicode::validate(uc))
  230           dest += ustring(1, uc).raw();
  231 
  232         return;
  233       }
  234     }
  235     else if (pstart + 1 != pend && Ascii::isxdigit(pstart[0]) && Ascii::isxdigit(pstart[1]))
  236     {
  237       p = pstart + 1;
  238       const gunichar uc = 0x10 * Ascii::xdigit_value(pstart[0]) + Ascii::xdigit_value(pstart[1]);
  239 
  240       if (uc != 0 && Unicode::validate(uc))
  241         dest += ustring(1, uc).raw();
  242 
  243       return;
  244     }
  245   }
  246 
  247   dest += 'x';
  248 }
  249 
  250 static
  251 void parse_oct_unichar(std::string::const_iterator& p, std::string::const_iterator pend,
  252                        std::string& dest)
  253 {
  254   gunichar uc = 0;
  255   std::string::const_iterator pnum = p;
  256 
  257   for (; pnum != pend && (pnum - p) < 3; ++pnum)
  258   {
  259     if (!ascii_isodigit(*pnum))
  260       break;
  261 
  262     uc *= 010;
  263     uc += Glib::Ascii::digit_value(*pnum);
  264   }
  265 
  266   if (pnum > p)
  267   {
  268     p = pnum - 1;
  269 
  270     if (uc != 0 && Glib::Unicode::validate(uc))
  271       dest += Glib::ustring(1, uc).raw();
  272   }
  273   else
  274     dest += *p;
  275 }
  276 
  277 /*
  278  * On entry, p _must_ point to either a digit or a starting bracket '{'.  Also,
  279  * if p points to '{' the closing bracket '}' is assumed to follow before pend.
  280  */
  281 static
  282 int parse_capture_index(std::string::const_iterator& p, std::string::const_iterator pend)
  283 {
  284   std::string::const_iterator pnum = p;
  285 
  286   if (*pnum == '{' && *++pnum == '}')
  287   {
  288     p = pnum;
  289     return -1;
  290   }
  291 
  292   int result = 0;
  293 
  294   while (pnum != pend && Glib::Ascii::isdigit(*pnum))
  295   {
  296     result *= 10;
  297     result += Glib::Ascii::digit_value(*pnum++);
  298   }
  299 
  300   if (*p != '{') // case "$digits": set position to last digit
  301   {
  302     p = pnum - 1;
  303   }
  304   else if (*pnum == '}') // case "${digits}": set position to '}'
  305   {
  306     p = pnum;
  307   }
  308   else // case "${invalid}": return -1 but still skip until '}'
  309   {
  310     p = std::find(pnum, pend, '}');
  311     return -1;
  312   }
  313 
  314   return result;
  315 }
  316 
  317 } // anonymous namespace
  318 
  319 /*
  320  * Convert the content of an std::wstring to UTF-8.  Using wide strings is
  321  * necessary when dealing with localized stream formatting, for the reasons
  322  * outlined here:  http://bugzilla.gnome.org/show_bug.cgi?id=399216
  323  *
  324  * Direct use of wide strings in regexxer is a temporary measure.  Thus,
  325  * this function should be removed once Glib::compose() and Glib::format()
  326  * are available in glibmm.
  327  */
  328 Glib::ustring Util::wstring_to_utf8(const std::wstring& str)
  329 {
  330   class ScopedCharArray
  331   {
  332   private:
  333     char* ptr_;
  334 
  335     ScopedCharArray(const ScopedCharArray&);
  336     ScopedCharArray& operator=(const ScopedCharArray&);
  337 
  338   public:
  339     explicit ScopedCharArray(char* ptr) : ptr_ (ptr) {}
  340     ~ScopedCharArray() { g_free(ptr_); }
  341 
  342     char* get() const { return ptr_; }
  343   };
  344 
  345   GError* error = 0;
  346 
  347 #ifdef __STDC_ISO_10646__
  348   // Avoid going through iconv if wchar_t always contains UCS-4.
  349   glong n_bytes = 0;
  350   const ScopedCharArray buf (g_ucs4_to_utf8(reinterpret_cast<const gunichar*>(str.data()),
  351                                             str.size(), 0, &n_bytes, &error));
  352 #else
  353   gsize n_bytes = 0;
  354   const ScopedCharArray buf (g_convert(reinterpret_cast<const char*>(str.data()),
  355                                        str.size() * sizeof(std::wstring::value_type),
  356                                        "UTF-8", "WCHAR_T", 0, &n_bytes, &error));
  357 #endif /* !__STDC_ISO_10646__ */
  358 
  359   if (G_UNLIKELY(error))
  360   {
  361     g_warning("%s", error->message);
  362     g_error_free(error);
  363     return Glib::ustring();
  364   }
  365 
  366   return Glib::ustring(buf.get(), buf.get() + n_bytes);
  367 }
  368 
  369 bool Util::validate_encoding(const std::string& encoding)
  370 {
  371   // GLib just ignores some characters that aren't used in encoding names,
  372   // so we have to parse the string for invalid characters ourselves.
  373 
  374   if (encoding.empty() || !Glib::Ascii::isalnum(*encoding.begin())
  375                        || !Glib::Ascii::isalnum(*encoding.rbegin()))
  376     return false;
  377 
  378   for (std::string::const_iterator p = encoding.begin() + 1; p != encoding.end(); ++p)
  379   {
  380     if (!Glib::Ascii::isalnum(*p) && is_significant_encoding_char(*p))
  381       return false;
  382   }
  383 
  384   // Better don't try to call Glib::convert() with identical input and output
  385   // encodings.  I heard the iconv on Solaris doesn't like that idea at all.
  386 
  387   if (!Util::encodings_equal(encoding, "UTF-8"))
  388     try
  389     {
  390       Glib::convert(std::string(), "UTF-8", encoding);
  391     }
  392     catch (const Glib::ConvertError& error)
  393     {
  394       if (error.code() == Glib::ConvertError::NO_CONVERSION)
  395         return false;
  396       throw;
  397     }
  398 
  399   return true;
  400 }
  401 
  402 /*
  403  * Test lhs and rhs for equality while ignoring case
  404  * and several separation characters used in encoding names.
  405  */
  406 bool Util::encodings_equal(const std::string& lhs, const std::string& rhs)
  407 {
  408   typedef std::string::const_iterator Iterator;
  409 
  410   Iterator       lhs_pos = lhs.begin();
  411   Iterator       rhs_pos = rhs.begin();
  412   const Iterator lhs_end = lhs.end();
  413   const Iterator rhs_end = rhs.end();
  414 
  415   for (;;)
  416   {
  417     while (lhs_pos != lhs_end && !is_significant_encoding_char(*lhs_pos))
  418       ++lhs_pos;
  419     while (rhs_pos != rhs_end && !is_significant_encoding_char(*rhs_pos))
  420       ++rhs_pos;
  421 
  422     if (lhs_pos == lhs_end || rhs_pos == rhs_end)
  423       break;
  424 
  425     if (Glib::Ascii::toupper(*lhs_pos) != Glib::Ascii::toupper(*rhs_pos))
  426       return false;
  427 
  428     ++lhs_pos;
  429     ++rhs_pos;
  430   }
  431 
  432   return (lhs_pos == lhs_end && rhs_pos == rhs_end);
  433 }
  434 
  435 Glib::ustring Util::shell_pattern_to_regex(const Glib::ustring& pattern)
  436 {
  437   // Don't use Glib::ustring to accumulate the result since we might append
  438   // partial UTF-8 characters during processing.  Although this would work with
  439   // the current Glib::ustring implementation, it's definitely not a good idea.
  440   std::string result;
  441   result.reserve(std::max<std::string::size_type>(32, 2 * pattern.raw().size()));
  442 
  443   result.append("\\A", 2);
  444 
  445   int brace_level = 0;
  446 
  447   const std::string::const_iterator pend = pattern.raw().end();
  448   std::string::const_iterator       p    = pattern.raw().begin();
  449   std::string::const_iterator       pcc  = pend; // start of character class
  450 
  451   for (; p != pend; ++p)
  452   {
  453     if (*p == '\\')
  454     {
  455       // Always escape a single trailing '\' to avoid mangling the "\z"
  456       // terminator.  Never escape multi-byte or alpha-numeric characters.
  457 
  458       if (p + 1 == pend || Glib::Ascii::ispunct(*++p))
  459         result += '\\';
  460 
  461       result += *p;
  462     }
  463     else if (pcc == pend)
  464     {
  465       switch (*p)
  466       {
  467         case '*':
  468           result.append(".*", 2);
  469           break;
  470 
  471         case '?':
  472           result += '.';
  473           break;
  474 
  475         case '[':
  476           result += '[';
  477           pcc = p + 1;
  478           break;
  479 
  480         case '{':
  481           result.append("(?:", 3);
  482           ++brace_level;
  483           break;
  484 
  485         case '}':
  486           result += ')';
  487           --brace_level;
  488           break;
  489 
  490         case ',':
  491           result += (brace_level > 0) ? '|' : ',';
  492           break;
  493 
  494         case '^': case '$': case '.': case '+': case '(': case ')': case '|':
  495           result += '\\';
  496           // fallthrough
  497 
  498         default:
  499           result += *p;
  500           break;
  501       }
  502     }
  503     else // pcc != pend
  504     {
  505       switch (*p)
  506       {
  507         case ']':
  508           result += ']';
  509           if (p != pcc && !(p == pcc + 1 && (*pcc == '!' || *pcc == '^')))
  510             pcc = pend;
  511           break;
  512 
  513         case '!':
  514           result += (p == pcc) ? '^' : '!';
  515           break;
  516 
  517         default:
  518           result += *p;
  519           break;
  520       }
  521     }
  522   }
  523 
  524   result.append("\\z", 2);
  525 
  526   return result;
  527 }
  528 
  529 Glib::ustring Util::substitute_references(const Glib::ustring& substitution,
  530                                           const Glib::ustring& subject,
  531                                           const CaptureVector& captures)
  532 {
  533   std::string result;
  534   result.reserve(2 * std::max(substitution.raw().size(), subject.raw().size()));
  535 
  536   std::vector<ModPos> modifiers;
  537 
  538   const std::string::const_iterator pend = substitution.raw().end();
  539   std::string::const_iterator       p    = substitution.raw().begin();
  540 
  541   for (; p != pend; ++p)
  542   {
  543     if (*p == '\\' && p + 1 != pend)
  544     {
  545       switch (*++p)
  546       {
  547         case 'L': case 'U': case 'l': case 'u': case 'E':
  548           modifiers.push_back(ModPos(result.size(), *p));
  549           break;
  550 
  551         case 'a':
  552           result += '\a';
  553           break;
  554 
  555         case 'e':
  556           result += '\033';
  557           break;
  558 
  559         case 'f':
  560           result += '\f';
  561           break;
  562 
  563         case 'n':
  564           result += '\n';
  565           break;
  566 
  567         case 'r':
  568           result += '\r';
  569           break;
  570 
  571         case 't':
  572           result += '\t';
  573           break;
  574 
  575         case 'c':
  576           parse_control_char(p, pend, result);
  577           break;
  578 
  579         case 'x':
  580           parse_hex_unichar(p, pend, result);
  581           break;
  582 
  583         case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
  584           parse_oct_unichar(p, pend, result);
  585           break;
  586 
  587         default:
  588           result += *p;
  589           break;
  590       }
  591     }
  592     else if (*p == '$' && p + 1 != pend)
  593     {
  594       std::pair<int, int> bounds;
  595 
  596       if (Glib::Ascii::isdigit(*++p) || (*p == '{' && std::find(p + 1, pend, '}') != pend))
  597       {
  598         const int index = parse_capture_index(p, pend);
  599 
  600         if (index >= 0 && unsigned(index) < captures.size())
  601           bounds = captures[index];
  602         else
  603           continue;
  604       }
  605       else switch (*p)
  606       {
  607         case '+':
  608           if (captures.size() > 1)
  609             bounds = captures.back();
  610           break;
  611 
  612         case '&':
  613           bounds = captures.front();
  614           break;
  615 
  616         case '`':
  617           bounds.first  = 0;
  618           bounds.second = captures.front().first;
  619           break;
  620 
  621         case '\'':
  622           bounds.first  = captures.front().second;
  623           bounds.second = subject.raw().size();
  624           break;
  625 
  626         default:
  627           result += '$';
  628           result += *p;
  629           continue;
  630       }
  631 
  632       if (bounds.first >= 0 && bounds.second > bounds.first)
  633         result.append(subject.raw(), bounds.first, bounds.second - bounds.first);
  634     }
  635     else // (*p != '\\' && *p != '$') || (p + 1 == pend)
  636     {
  637       result += *p;
  638     }
  639   }
  640 
  641   if (!modifiers.empty())
  642     result = apply_modifiers(result, modifiers);
  643 
  644   return result;
  645 }
  646 
  647 Glib::ustring Util::int_to_string(int number)
  648 {
  649   std::wostringstream output;
  650 
  651   try // don't abort if the user-specified locale doesn't exist
  652   {
  653     output.imbue(std::locale(""));
  654   }
  655   catch (const std::runtime_error& error)
  656   {
  657     g_warning("%s", error.what());
  658   }
  659 
  660   output << number;
  661 
  662   return Util::wstring_to_utf8(output.str());
  663 }
  664 
  665 Glib::ustring Util::filename_short_display_name(const std::string& filename)
  666 {
  667   const std::string homedir = Glib::get_home_dir();
  668   const std::string::size_type len = homedir.length();
  669 
  670   if (filename.length() >= len
  671       && (filename.length() == len || G_IS_DIR_SEPARATOR(filename[len]))
  672       && filename.compare(0, len, homedir) == 0)
  673   {
  674     std::string short_name (1, '~');
  675     short_name.append(filename, len, std::string::npos);
  676 
  677     return Glib::filename_display_name(short_name);
  678   }
  679 
  680   return Glib::filename_display_name(filename);
  681 }
  682 
  683 Glib::ustring Util::color_to_string(const Gdk::Color& color)
  684 {
  685   std::ostringstream output;
  686 
  687   output.imbue(std::locale::classic());
  688   output.setf(std::ios::hex, std::ios::basefield);
  689   output.setf(std::ios::uppercase);
  690   output.fill('0');
  691 
  692   output << '#' << std::setw(2) << scale_to_8bit(color.get_red())
  693                 << std::setw(2) << scale_to_8bit(color.get_green())
  694                 << std::setw(2) << scale_to_8bit(color.get_blue());
  695 
  696   return output.str();
  697 }
  698 
  699 int Util::enum_from_nick_impl(GType type, const Glib::ustring& nick)
  700 {
  701   const ScopedTypeClass type_class (type);
  702 
  703   GEnumClass *const enum_class = G_ENUM_CLASS(type_class.get());
  704   GEnumValue *const enum_value = g_enum_get_value_by_nick(enum_class, nick.c_str());
  705 
  706   g_return_val_if_fail(enum_value != 0, enum_class->minimum);
  707 
  708   return enum_value->value;
  709 }
  710 
  711 Glib::ustring Util::enum_to_nick_impl(GType type, int value)
  712 {
  713   const ScopedTypeClass type_class (type);
  714 
  715   GEnumClass *const enum_class = G_ENUM_CLASS(type_class.get());
  716   GEnumValue *const enum_value = g_enum_get_value(enum_class, value);
  717 
  718   g_return_val_if_fail(enum_value != 0, "");
  719 
  720   return enum_value->value_nick;
  721 }