geany  1.38
About: Geany is a text editor (using GTK2) with basic features of an integrated development environment (syntax highlighting, code folding, symbol name auto-completion, ...). F: office T: editor programming GTK+ IDE
  Fossies Dox: geany-1.38.tar.bz2  ("unofficial" and yet experimental doxygen-generated source code documentation)  

UniConversion.h
Go to the documentation of this file.
1// Scintilla source code edit control
2/** @file UniConversion.h
3 ** Functions to handle UTF-8 and UTF-16 strings.
4 **/
5// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6// The License.txt file describes the conditions under which this software may be distributed.
7
8#ifndef UNICONVERSION_H
9#define UNICONVERSION_H
10
11namespace Scintilla {
12
// Upper bound on the byte length of one UTF-8 sequence; this matches the longest
// (4-byte) form that UnicodeFromUTF8 in this header decodes.
13constexpr int UTF8MaxBytes = 4;
14
// U+FFFD, the Unicode REPLACEMENT CHARACTER. How it is used is not visible in this
// header - presumably substituted for input that cannot be decoded; confirm in callers.
15constexpr int unicodeReplacementChar = 0xFFFD;
16
// Conversions and length calculations between UTF-8 text held in char buffers,
// UTF-16 text held in wchar_t buffers and UTF-32 values held in int / unsigned int.
// Only the declarations are visible in this header, so the per-function notes
// below are inferred from names and signatures - confirm against the definitions.
//
// Declared noexcept: UTF8Length, UTF8FromUTF16, UTF8FromUTF32Character, UTF16Length,
// UTF32Length, UTF16FromUTF32Character, UTF8IsValid.
// Not declared noexcept (so permitted to throw): UTF16FromUTF8, UTF32FromUTF8,
// WStringFromUTF8, FixInvalidUTF8.

// Presumably the number of UTF-8 bytes needed to encode the tlen wchar_t units at uptr.
17size_t UTF8Length(const wchar_t *uptr, size_t tlen) noexcept;
// Presumably encodes tlen units from uptr as UTF-8 into putf, whose capacity is len bytes.
18void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) noexcept;
// Presumably writes the UTF-8 form of the single character uch to putf.
// NOTE(review): no buffer size is passed - check how many bytes putf must provide.
19void UTF8FromUTF32Character(int uch, char *putf) noexcept;
// Presumably the number of UTF-16 units needed for the len bytes of UTF-8 at s.
20size_t UTF16Length(const char *s, size_t len) noexcept;
// Presumably converts into tbuf (capacity tlen units) and returns the units written.
// NOTE(review): not noexcept, unlike its neighbours - check how a too-small tbuf is reported.
21size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen);
// Presumably the UTF-32 counterparts of UTF16Length and UTF16FromUTF8 (same caveats).
22size_t UTF32Length(const char *s, size_t len) noexcept;
23size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen);
24// WStringFromUTF8 does the right thing when wchar_t is 2 or 4 bytes so
25// works on both Windows and Unix.
26std::wstring WStringFromUTF8(const char *s, size_t len);
// Presumably stores val as one or two UTF-16 units in tbuf and returns how many were stored.
27unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept;
// Presumably reports whether the len bytes at s are well-formed UTF-8.
28bool UTF8IsValid(const char *s, size_t len) noexcept;
// Presumably returns a copy of text with invalid UTF-8 repaired; the repair
// strategy is not visible here.
29std::string FixInvalidUTF8(const std::string &text);
30
// Lookup table with one entry per possible byte value; only declared here, the
// definition lives elsewhere. UnicodeFromUTF8 indexes it with the first byte of a
// sequence and treats entries 1, 2 and 3 as that sequence's byte count, anything
// else as 4.
31extern const unsigned char UTF8BytesOfLead[256];
32
33inline int UnicodeFromUTF8(const unsigned char *us) noexcept {
34 switch (UTF8BytesOfLead[us[0]]) {
35 case 1:
36 return us[0];
37 case 2:
38 return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F);
39 case 3:
40 return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F);
41 default:
42 return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F);
43 }
44}
45
// True when ch has the bit pattern 10xxxxxx, that is a value in 0x80..0xBF.
inline constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {
	return (ch & 0xC0) == 0x80;
}
49
// Reports whether ch is an ASCII value, that is a value in the range 0..0x7F.
// ch is an int, so a byte held in a plain (signed) char arrives here as a
// negative number when its top bit is set. Such a byte belongs to a multi-byte
// UTF-8 sequence, so negative values are rejected instead of passing the
// upper-bound test.
inline constexpr bool UTF8IsAscii(int ch) noexcept {
	return (ch >= 0) && (ch < 0x80);
}
53
// Classifies the UTF-8 sequence starting at us; len is presumably the number of
// bytes available there. Going by the note on UTF8DrawBytes below, the int result
// can carry an "invalid" flag as well as a length - the exact bit layout is not
// visible in this listing, so confirm it against the definition.
55int UTF8Classify(const unsigned char *us, size_t len) noexcept;
56
57// Similar to UTF8Classify but returns a length of 1 for invalid bytes
58// instead of setting the invalid flag
// NOTE(review): len is an int here but a size_t in UTF8Classify.
59int UTF8DrawBytes(const unsigned char *us, int len) noexcept;
60
// U+2028 LINE SEPARATOR is encoded as \xe2\x80\xa8 and
// U+2029 PARAGRAPH SEPARATOR as \xe2\x80\xa9, so both take 3 bytes.
constexpr int UTF8SeparatorLength = 3;
// True when the bytes at us spell out one of the two separators above.
// Bytes are examined left to right and a later byte is only read once all
// earlier ones have matched.
inline bool UTF8IsSeparator(const unsigned char *us) noexcept {
	if (us[0] != 0xe2) {
		return false;
	}
	if (us[1] != 0x80) {
		return false;
	}
	const unsigned char last = us[2];
	return (last == 0xa8) || (last == 0xa9);
}
67
// U+0085 NEXT LINE (NEL) is encoded as the 2 bytes \xc2\x85.
constexpr int UTF8NELLength = 2;
// True when the two bytes at us are the NEL encoding. The second byte is only
// read when the first one matches.
inline bool UTF8IsNEL(const unsigned char *us) noexcept {
	if (us[0] != 0xc2) {
		return false;
	}
	return us[1] == 0x85;
}
73
// Do the three bytes ch0, ch1, ch2 end with a multi-byte UTF-8 line end?
// - When ch2 is 0x85 the only possible match is NEL (\xc2\x85), which is
//   checked against ch1 and ch2 alone; ch0 is ignored.
// - Otherwise all three bytes must form U+2028 (\xe2\x80\xa8) or
//   U+2029 (\xe2\x80\xa9).
constexpr bool UTF8IsMultibyteLineEnd(unsigned char ch0, unsigned char ch1, unsigned char ch2) noexcept {
	return (ch2 == 0x85)
		? (ch1 == 0xc2)
		: ((ch0 == 0xe2) && (ch1 == 0x80) && ((ch2 == 0xa8) || (ch2 == 0xa9)));
}
80
// Boundaries of the UTF-16 surrogate ranges, plus the first code point that
// lies above the 16-bit range.
enum {
	SURROGATE_LEAD_FIRST = 0xD800,
	SURROGATE_LEAD_LAST = 0xDBFF,
	SURROGATE_TRAIL_FIRST = 0xDC00,
	SURROGATE_TRAIL_LAST = 0xDFFF,
	SUPPLEMENTAL_PLANE_FIRST = 0x10000
};
86
87inline constexpr unsigned int UTF16CharLength(wchar_t uch) noexcept {
88 return ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_LEAD_LAST)) ? 2 : 1;
89}
90
// Maps the byte count of a UTF-8 sequence to a count of UTF-16 units:
// 2 for byte counts of 4 or more, 1 for anything shorter.
inline constexpr unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount) noexcept {
	return (byteCount >= 4) ? 2 : 1;
}
94
95}
96
97#endif
gchar * text
Definition: editor.c:83
Styling buffer using one element for each run rather than using a filled buffer.
Definition: Converter.h:9
constexpr unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount) noexcept
Definition: UniConversion.h:91
size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen)
@ SURROGATE_LEAD_LAST
Definition: UniConversion.h:82
constexpr int UTF8MaxBytes
Definition: UniConversion.h:13
@ SURROGATE_TRAIL_LAST
Definition: UniConversion.h:84
void UTF8FromUTF32Character(int uch, char *putf) noexcept
@ SUPPLEMENTAL_PLANE_FIRST
Definition: UniConversion.h:85
@ SURROGATE_LEAD_FIRST
Definition: UniConversion.h:81
size_t UTF16Length(const char *s, size_t len) noexcept
bool UTF8IsValid(const char *s, size_t len) noexcept
constexpr int unicodeReplacementChar
Definition: UniConversion.h:15
constexpr bool UTF8IsAscii(int ch) noexcept
Definition: UniConversion.h:50
unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept
size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen)
bool UTF8IsSeparator(const unsigned char *us) noexcept
Definition: UniConversion.h:64
void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) noexcept
size_t UTF8Length(const wchar_t *uptr, size_t tlen) noexcept
@ SURROGATE_TRAIL_FIRST
Definition: UniConversion.h:83
bool UTF8IsNEL(const unsigned char *us) noexcept
Definition: UniConversion.h:70
int UTF8DrawBytes(const unsigned char *us, int len) noexcept
size_t UTF32Length(const char *s, size_t len) noexcept
constexpr int UTF8SeparatorLength
Definition: UniConversion.h:63
constexpr int UTF8NELLength
Definition: UniConversion.h:69
int UTF8Classify(const unsigned char *us, size_t len) noexcept
int UnicodeFromUTF8(const unsigned char *us) noexcept
Definition: UniConversion.h:33
constexpr unsigned int UTF16CharLength(wchar_t uch) noexcept
Definition: UniConversion.h:87
std::wstring WStringFromUTF8(const char *s, size_t len)
std::string FixInvalidUTF8(const std::string &text)
constexpr bool UTF8IsMultibyteLineEnd(unsigned char ch0, unsigned char ch1, unsigned char ch2) noexcept
Definition: UniConversion.h:75
constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept
Definition: UniConversion.h:46
const unsigned char UTF8BytesOfLead[256]