"Fossies" - the Fresh Open Source Software Archive 
Member "regexxer-0.10/src/stringutils.cc" (6 Oct 2011, 17978 Bytes) of package /linux/privat/old/regexxer-0.10.tar.gz:
As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style:
standard), with prefixed line numbers and
a code folding option.
Alternatively, you can
view or
download the uninterpreted source code file here.
For more information about "stringutils.cc" see the
Fossies "Dox" file reference documentation.
1 /*
2 * Copyright (c) 2002-2007 Daniel Elstner <daniel.kitta@gmail.com>
3 *
4 * This file is part of regexxer.
5 *
6 * regexxer is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * regexxer is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with regexxer; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "stringutils.h"
22
23 #include <glib.h>
24 #include <glib-object.h>
25 #include <glibmm.h>
26 #include <gdkmm/color.h>
27
28 #include <algorithm>
29 #include <iomanip>
30 #include <locale>
31 #include <sstream>
32 #include <stdexcept>
33 #include <utility>
34 #include <vector>
35
36 namespace
37 {
38
39 typedef std::pair<int, char> ModPos;
40
41 class ScopedTypeClass
42 {
43 private:
44 void* class_;
45
46 ScopedTypeClass(const ScopedTypeClass&);
47 ScopedTypeClass& operator=(const ScopedTypeClass&);
48
49 public:
50 explicit ScopedTypeClass(GType type)
51 : class_ (g_type_class_ref(type)) {}
52
53 ~ScopedTypeClass() { g_type_class_unref(class_); }
54
55 void* get() const { return class_; }
56 };
57
/*
 * Return false for the separator characters ' ', '-', '_', '.' and ':',
 * and true for every other character.
 */
static inline
bool is_significant_encoding_char(char c)
{
  const bool is_separator = (c == ' ' || c == '-' || c == '_' || c == '.' || c == ':');

  return !is_separator;
}
69
/*
 * Return bits 8 to 15 of value as a number in the range 0 to 255.
 * Any bits above bit 15 are discarded.
 */
static inline
unsigned int scale_to_8bit(unsigned int value)
{
  return (value >> 8) & 0xFFu;
}
75
/*
 * Return true if c is one of the ASCII octal digits '0' to '7'.
 */
static inline
bool ascii_isodigit(char c)
{
  switch (c)
  {
    case '0': case '1': case '2': case '3':
    case '4': case '5': case '6': case '7':
      return true;
  }

  return false;
}
81
82 static
83 std::string apply_modifiers(const std::string& subject, const std::vector<ModPos>& modifiers)
84 {
85 std::string result;
86 result.reserve(subject.size());
87
88 int idx = 0;
89
90 const std::vector<ModPos>::const_iterator pend = modifiers.end();
91 std::vector<ModPos>::const_iterator p = modifiers.begin();
92
93 while (p != pend)
94 {
95 const int start = p->first;
96 result.append(subject, idx, start - idx);
97 idx = start;
98
99 const char mod = p->second;
100 ++p;
101
102 switch (mod)
103 {
104 case 'L': case 'U':
105 {
106 while (p != pend && (p->second == 'l' || p->second == 'u'))
107 ++p;
108
109 const int stop = (p == pend) ? subject.size() : p->first;
110 const Glib::ustring slice (subject.begin() + start, subject.begin() + stop);
111 const Glib::ustring str = (mod == 'L') ? slice.lowercase() : slice.uppercase();
112
113 result.append(str.raw());
114 idx = stop;
115 break;
116 }
117 case 'l': case 'u': // TODO: Simplify. This code is way too complicated.
118 {
119 if (unsigned(start) < subject.size())
120 {
121 while (p != pend && p->first == start && p->second != 'L' && p->second != 'U')
122 ++p;
123
124 if (p != pend && p->first == start)
125 {
126 const char submod = p->second;
127
128 do
129 ++p;
130 while (p != pend && (p->second == 'l' || p->second == 'u'));
131
132 const int stop = (p == pend) ? subject.size() : p->first;
133 const Glib::ustring slice (subject.begin() + start, subject.begin() + stop);
134 const Glib::ustring str = (submod == 'L') ? slice.lowercase() : slice.uppercase();
135
136 if (!str.empty())
137 {
138 Glib::ustring::const_iterator cpos = str.begin();
139 gunichar uc = *cpos++;
140 uc = (mod == 'l') ? Glib::Unicode::tolower(uc) : Glib::Unicode::totitle(uc);
141
142 if (Glib::Unicode::validate(uc))
143 result.append(Glib::ustring(1, uc).raw());
144
145 result.append(cpos.base(), str.end().base());
146 }
147 idx = stop;
148 }
149 else
150 {
151 Glib::ustring::const_iterator cpos (subject.begin() + start);
152 gunichar uc = *cpos++;
153 uc = (mod == 'l') ? Glib::Unicode::tolower(uc) : Glib::Unicode::totitle(uc);
154
155 if (Glib::Unicode::validate(uc))
156 result.append(Glib::ustring(1, uc).raw());
157
158 idx = cpos.base() - subject.begin();
159 }
160 }
161 break;
162 }
163 case 'E':
164 {
165 break;
166 }
167 default:
168 {
169 g_assert_not_reached();
170 break;
171 }
172 }
173 }
174
175 result.append(subject, idx, std::string::npos);
176
177 return result;
178 }
179
180 static
181 void parse_control_char(std::string::const_iterator& p, std::string::const_iterator pend,
182 std::string& dest)
183 {
184 const std::string::const_iterator pnext = p + 1;
185
186 if (pnext != pend && (static_cast<unsigned char>(*pnext) & 0x80U) == 0)
187 {
188 p = pnext;
189
190 // Flip bit 6 of the upcased character.
191 const char c = static_cast<unsigned char>(Glib::Ascii::toupper(*pnext)) ^ 0x40U;
192
193 // TextBuffer can't handle NUL; interpret it as empty string instead.
194 if (c != '\0')
195 dest += c;
196 }
197 else
198 dest += 'c';
199 }
200
201 static
202 void parse_hex_unichar(std::string::const_iterator& p, std::string::const_iterator pend,
203 std::string& dest)
204 {
205 using namespace Glib;
206
207 std::string::const_iterator pstart = p + 1;
208
209 if (pstart != pend)
210 {
211 if (*pstart == '{')
212 {
213 const std::string::const_iterator pstop = std::find(++pstart, pend, '}');
214
215 if (pstop != pend)
216 {
217 p = pstop;
218 gunichar uc = 0;
219
220 for (; pstart != pstop; ++pstart)
221 {
222 if (!Ascii::isxdigit(*pstart))
223 return;
224
225 uc *= 0x10;
226 uc += Ascii::xdigit_value(*pstart);
227 }
228
229 if (uc != 0 && Unicode::validate(uc))
230 dest += ustring(1, uc).raw();
231
232 return;
233 }
234 }
235 else if (pstart + 1 != pend && Ascii::isxdigit(pstart[0]) && Ascii::isxdigit(pstart[1]))
236 {
237 p = pstart + 1;
238 const gunichar uc = 0x10 * Ascii::xdigit_value(pstart[0]) + Ascii::xdigit_value(pstart[1]);
239
240 if (uc != 0 && Unicode::validate(uc))
241 dest += ustring(1, uc).raw();
242
243 return;
244 }
245 }
246
247 dest += 'x';
248 }
249
250 static
251 void parse_oct_unichar(std::string::const_iterator& p, std::string::const_iterator pend,
252 std::string& dest)
253 {
254 gunichar uc = 0;
255 std::string::const_iterator pnum = p;
256
257 for (; pnum != pend && (pnum - p) < 3; ++pnum)
258 {
259 if (!ascii_isodigit(*pnum))
260 break;
261
262 uc *= 010;
263 uc += Glib::Ascii::digit_value(*pnum);
264 }
265
266 if (pnum > p)
267 {
268 p = pnum - 1;
269
270 if (uc != 0 && Glib::Unicode::validate(uc))
271 dest += Glib::ustring(1, uc).raw();
272 }
273 else
274 dest += *p;
275 }
276
277 /*
278 * On entry, p _must_ point to either a digit or a starting bracket '{'. Also,
279 * if p points to '{' the closing bracket '}' is assumed to follow before pend.
280 */
281 static
282 int parse_capture_index(std::string::const_iterator& p, std::string::const_iterator pend)
283 {
284 std::string::const_iterator pnum = p;
285
286 if (*pnum == '{' && *++pnum == '}')
287 {
288 p = pnum;
289 return -1;
290 }
291
292 int result = 0;
293
294 while (pnum != pend && Glib::Ascii::isdigit(*pnum))
295 {
296 result *= 10;
297 result += Glib::Ascii::digit_value(*pnum++);
298 }
299
300 if (*p != '{') // case "$digits": set position to last digit
301 {
302 p = pnum - 1;
303 }
304 else if (*pnum == '}') // case "${digits}": set position to '}'
305 {
306 p = pnum;
307 }
308 else // case "${invalid}": return -1 but still skip until '}'
309 {
310 p = std::find(pnum, pend, '}');
311 return -1;
312 }
313
314 return result;
315 }
316
317 } // anonymous namespace
318
319 /*
320 * Convert the content of an std::wstring to UTF-8. Using wide strings is
321 * necessary when dealing with localized stream formatting, for the reasons
322 * outlined here: http://bugzilla.gnome.org/show_bug.cgi?id=399216
323 *
324 * Direct use of wide strings in regexxer is a temporary measure. Thus,
325 * this function should be removed once Glib::compose() and Glib::format()
326 * are available in glibmm.
327 */
328 Glib::ustring Util::wstring_to_utf8(const std::wstring& str)
329 {
330 class ScopedCharArray
331 {
332 private:
333 char* ptr_;
334
335 ScopedCharArray(const ScopedCharArray&);
336 ScopedCharArray& operator=(const ScopedCharArray&);
337
338 public:
339 explicit ScopedCharArray(char* ptr) : ptr_ (ptr) {}
340 ~ScopedCharArray() { g_free(ptr_); }
341
342 char* get() const { return ptr_; }
343 };
344
345 GError* error = 0;
346
347 #ifdef __STDC_ISO_10646__
348 // Avoid going through iconv if wchar_t always contains UCS-4.
349 glong n_bytes = 0;
350 const ScopedCharArray buf (g_ucs4_to_utf8(reinterpret_cast<const gunichar*>(str.data()),
351 str.size(), 0, &n_bytes, &error));
352 #else
353 gsize n_bytes = 0;
354 const ScopedCharArray buf (g_convert(reinterpret_cast<const char*>(str.data()),
355 str.size() * sizeof(std::wstring::value_type),
356 "UTF-8", "WCHAR_T", 0, &n_bytes, &error));
357 #endif /* !__STDC_ISO_10646__ */
358
359 if (G_UNLIKELY(error))
360 {
361 g_warning("%s", error->message);
362 g_error_free(error);
363 return Glib::ustring();
364 }
365
366 return Glib::ustring(buf.get(), buf.get() + n_bytes);
367 }
368
369 bool Util::validate_encoding(const std::string& encoding)
370 {
371 // GLib just ignores some characters that aren't used in encoding names,
372 // so we have to parse the string for invalid characters ourselves.
373
374 if (encoding.empty() || !Glib::Ascii::isalnum(*encoding.begin())
375 || !Glib::Ascii::isalnum(*encoding.rbegin()))
376 return false;
377
378 for (std::string::const_iterator p = encoding.begin() + 1; p != encoding.end(); ++p)
379 {
380 if (!Glib::Ascii::isalnum(*p) && is_significant_encoding_char(*p))
381 return false;
382 }
383
384 // Better don't try to call Glib::convert() with identical input and output
385 // encodings. I heard the iconv on Solaris doesn't like that idea at all.
386
387 if (!Util::encodings_equal(encoding, "UTF-8"))
388 try
389 {
390 Glib::convert(std::string(), "UTF-8", encoding);
391 }
392 catch (const Glib::ConvertError& error)
393 {
394 if (error.code() == Glib::ConvertError::NO_CONVERSION)
395 return false;
396 throw;
397 }
398
399 return true;
400 }
401
402 /*
403 * Test lhs and rhs for equality while ignoring case
404 * and several separation characters used in encoding names.
405 */
406 bool Util::encodings_equal(const std::string& lhs, const std::string& rhs)
407 {
408 typedef std::string::const_iterator Iterator;
409
410 Iterator lhs_pos = lhs.begin();
411 Iterator rhs_pos = rhs.begin();
412 const Iterator lhs_end = lhs.end();
413 const Iterator rhs_end = rhs.end();
414
415 for (;;)
416 {
417 while (lhs_pos != lhs_end && !is_significant_encoding_char(*lhs_pos))
418 ++lhs_pos;
419 while (rhs_pos != rhs_end && !is_significant_encoding_char(*rhs_pos))
420 ++rhs_pos;
421
422 if (lhs_pos == lhs_end || rhs_pos == rhs_end)
423 break;
424
425 if (Glib::Ascii::toupper(*lhs_pos) != Glib::Ascii::toupper(*rhs_pos))
426 return false;
427
428 ++lhs_pos;
429 ++rhs_pos;
430 }
431
432 return (lhs_pos == lhs_end && rhs_pos == rhs_end);
433 }
434
435 Glib::ustring Util::shell_pattern_to_regex(const Glib::ustring& pattern)
436 {
437 // Don't use Glib::ustring to accumulate the result since we might append
438 // partial UTF-8 characters during processing. Although this would work with
439 // the current Glib::ustring implementation, it's definitely not a good idea.
440 std::string result;
441 result.reserve(std::max<std::string::size_type>(32, 2 * pattern.raw().size()));
442
443 result.append("\\A", 2);
444
445 int brace_level = 0;
446
447 const std::string::const_iterator pend = pattern.raw().end();
448 std::string::const_iterator p = pattern.raw().begin();
449 std::string::const_iterator pcc = pend; // start of character class
450
451 for (; p != pend; ++p)
452 {
453 if (*p == '\\')
454 {
455 // Always escape a single trailing '\' to avoid mangling the "\z"
456 // terminator. Never escape multi-byte or alpha-numeric characters.
457
458 if (p + 1 == pend || Glib::Ascii::ispunct(*++p))
459 result += '\\';
460
461 result += *p;
462 }
463 else if (pcc == pend)
464 {
465 switch (*p)
466 {
467 case '*':
468 result.append(".*", 2);
469 break;
470
471 case '?':
472 result += '.';
473 break;
474
475 case '[':
476 result += '[';
477 pcc = p + 1;
478 break;
479
480 case '{':
481 result.append("(?:", 3);
482 ++brace_level;
483 break;
484
485 case '}':
486 result += ')';
487 --brace_level;
488 break;
489
490 case ',':
491 result += (brace_level > 0) ? '|' : ',';
492 break;
493
494 case '^': case '$': case '.': case '+': case '(': case ')': case '|':
495 result += '\\';
496 // fallthrough
497
498 default:
499 result += *p;
500 break;
501 }
502 }
503 else // pcc != pend
504 {
505 switch (*p)
506 {
507 case ']':
508 result += ']';
509 if (p != pcc && !(p == pcc + 1 && (*pcc == '!' || *pcc == '^')))
510 pcc = pend;
511 break;
512
513 case '!':
514 result += (p == pcc) ? '^' : '!';
515 break;
516
517 default:
518 result += *p;
519 break;
520 }
521 }
522 }
523
524 result.append("\\z", 2);
525
526 return result;
527 }
528
529 Glib::ustring Util::substitute_references(const Glib::ustring& substitution,
530 const Glib::ustring& subject,
531 const CaptureVector& captures)
532 {
533 std::string result;
534 result.reserve(2 * std::max(substitution.raw().size(), subject.raw().size()));
535
536 std::vector<ModPos> modifiers;
537
538 const std::string::const_iterator pend = substitution.raw().end();
539 std::string::const_iterator p = substitution.raw().begin();
540
541 for (; p != pend; ++p)
542 {
543 if (*p == '\\' && p + 1 != pend)
544 {
545 switch (*++p)
546 {
547 case 'L': case 'U': case 'l': case 'u': case 'E':
548 modifiers.push_back(ModPos(result.size(), *p));
549 break;
550
551 case 'a':
552 result += '\a';
553 break;
554
555 case 'e':
556 result += '\033';
557 break;
558
559 case 'f':
560 result += '\f';
561 break;
562
563 case 'n':
564 result += '\n';
565 break;
566
567 case 'r':
568 result += '\r';
569 break;
570
571 case 't':
572 result += '\t';
573 break;
574
575 case 'c':
576 parse_control_char(p, pend, result);
577 break;
578
579 case 'x':
580 parse_hex_unichar(p, pend, result);
581 break;
582
583 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
584 parse_oct_unichar(p, pend, result);
585 break;
586
587 default:
588 result += *p;
589 break;
590 }
591 }
592 else if (*p == '$' && p + 1 != pend)
593 {
594 std::pair<int, int> bounds;
595
596 if (Glib::Ascii::isdigit(*++p) || (*p == '{' && std::find(p + 1, pend, '}') != pend))
597 {
598 const int index = parse_capture_index(p, pend);
599
600 if (index >= 0 && unsigned(index) < captures.size())
601 bounds = captures[index];
602 else
603 continue;
604 }
605 else switch (*p)
606 {
607 case '+':
608 if (captures.size() > 1)
609 bounds = captures.back();
610 break;
611
612 case '&':
613 bounds = captures.front();
614 break;
615
616 case '`':
617 bounds.first = 0;
618 bounds.second = captures.front().first;
619 break;
620
621 case '\'':
622 bounds.first = captures.front().second;
623 bounds.second = subject.raw().size();
624 break;
625
626 default:
627 result += '$';
628 result += *p;
629 continue;
630 }
631
632 if (bounds.first >= 0 && bounds.second > bounds.first)
633 result.append(subject.raw(), bounds.first, bounds.second - bounds.first);
634 }
635 else // (*p != '\\' && *p != '$') || (p + 1 == pend)
636 {
637 result += *p;
638 }
639 }
640
641 if (!modifiers.empty())
642 result = apply_modifiers(result, modifiers);
643
644 return result;
645 }
646
647 Glib::ustring Util::int_to_string(int number)
648 {
649 std::wostringstream output;
650
651 try // don't abort if the user-specified locale doesn't exist
652 {
653 output.imbue(std::locale(""));
654 }
655 catch (const std::runtime_error& error)
656 {
657 g_warning("%s", error.what());
658 }
659
660 output << number;
661
662 return Util::wstring_to_utf8(output.str());
663 }
664
665 Glib::ustring Util::filename_short_display_name(const std::string& filename)
666 {
667 const std::string homedir = Glib::get_home_dir();
668 const std::string::size_type len = homedir.length();
669
670 if (filename.length() >= len
671 && (filename.length() == len || G_IS_DIR_SEPARATOR(filename[len]))
672 && filename.compare(0, len, homedir) == 0)
673 {
674 std::string short_name (1, '~');
675 short_name.append(filename, len, std::string::npos);
676
677 return Glib::filename_display_name(short_name);
678 }
679
680 return Glib::filename_display_name(filename);
681 }
682
683 Glib::ustring Util::color_to_string(const Gdk::Color& color)
684 {
685 std::ostringstream output;
686
687 output.imbue(std::locale::classic());
688 output.setf(std::ios::hex, std::ios::basefield);
689 output.setf(std::ios::uppercase);
690 output.fill('0');
691
692 output << '#' << std::setw(2) << scale_to_8bit(color.get_red())
693 << std::setw(2) << scale_to_8bit(color.get_green())
694 << std::setw(2) << scale_to_8bit(color.get_blue());
695
696 return output.str();
697 }
698
699 int Util::enum_from_nick_impl(GType type, const Glib::ustring& nick)
700 {
701 const ScopedTypeClass type_class (type);
702
703 GEnumClass *const enum_class = G_ENUM_CLASS(type_class.get());
704 GEnumValue *const enum_value = g_enum_get_value_by_nick(enum_class, nick.c_str());
705
706 g_return_val_if_fail(enum_value != 0, enum_class->minimum);
707
708 return enum_value->value;
709 }
710
711 Glib::ustring Util::enum_to_nick_impl(GType type, int value)
712 {
713 const ScopedTypeClass type_class (type);
714
715 GEnumClass *const enum_class = G_ENUM_CLASS(type_class.get());
716 GEnumValue *const enum_value = g_enum_get_value(enum_class, value);
717
718 g_return_val_if_fail(enum_value != 0, "");
719
720 return enum_value->value_nick;
721 }