"Fossies" - the Fresh Open Source Software Archive

Member "memcached-1.6.15/vendor/lua/src/lutf8lib.c" (1 Oct 2021, 8093 Bytes) of package /linux/www/memcached-1.6.15.tar.gz:


As a special service, "Fossies" has tried to format the requested source page as HTML, using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 /*
    2 ** $Id: lutf8lib.c $
    3 ** Standard library for UTF-8 manipulation
    4 ** See Copyright Notice in lua.h
    5 */
    6 
    7 #define lutf8lib_c
    8 #define LUA_LIB
    9 
   10 #include "lprefix.h"
   11 
   12 
   13 #include <assert.h>
   14 #include <limits.h>
   15 #include <stdlib.h>
   16 #include <string.h>
   17 
   18 #include "lua.h"
   19 
   20 #include "lauxlib.h"
   21 #include "lualib.h"
   22 
   23 
   24 #define MAXUNICODE  0x10FFFFu
   25 
   26 #define MAXUTF      0x7FFFFFFFu
   27 
   28 /*
   29 ** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits.
   30 */
   31 #if (UINT_MAX >> 30) >= 1
   32 typedef unsigned int utfint;
   33 #else
   34 typedef unsigned long utfint;
   35 #endif
   36 
   37 
   38 #define iscont(p)   ((*(p) & 0xC0) == 0x80)
   39 
   40 
   41 /* from strlib */
   42 /* translate a relative string position: negative means back from end */
   43 static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
   44   if (pos >= 0) return pos;
   45   else if (0u - (size_t)pos > len) return 0;
   46   else return (lua_Integer)len + pos + 1;
   47 }
   48 
   49 
   50 /*
   51 ** Decode one UTF-8 sequence, returning NULL if byte sequence is
   52 ** invalid.  The array 'limits' stores the minimum value for each
   53 ** sequence length, to check for overlong representations. Its first
   54 ** entry forces an error for non-ascii bytes with no continuation
   55 ** bytes (count == 0).
   56 */
   57 static const char *utf8_decode (const char *s, utfint *val, int strict) {
   58   static const utfint limits[] =
   59         {~(utfint)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u};
   60   unsigned int c = (unsigned char)s[0];
   61   utfint res = 0;  /* final result */
   62   if (c < 0x80)  /* ascii? */
   63     res = c;
   64   else {
   65     int count = 0;  /* to count number of continuation bytes */
   66     for (; c & 0x40; c <<= 1) {  /* while it needs continuation bytes... */
   67       unsigned int cc = (unsigned char)s[++count];  /* read next byte */
   68       if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
   69         return NULL;  /* invalid byte sequence */
   70       res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
   71     }
   72     res |= ((utfint)(c & 0x7F) << (count * 5));  /* add first byte */
   73     if (count > 5 || res > MAXUTF || res < limits[count])
   74       return NULL;  /* invalid byte sequence */
   75     s += count;  /* skip continuation bytes read */
   76   }
   77   if (strict) {
   78     /* check for invalid code points; too large or surrogates */
   79     if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu))
   80       return NULL;
   81   }
   82   if (val) *val = res;
   83   return s + 1;  /* +1 to include first byte */
   84 }
   85 
   86 
   87 /*
   88 ** utf8len(s [, i [, j [, lax]]]) --> number of characters that
   89 ** start in the range [i,j], or nil + current position if 's' is not
   90 ** well formed in that interval
   91 */
   92 static int utflen (lua_State *L) {
   93   lua_Integer n = 0;  /* counter for the number of characters */
   94   size_t len;  /* string length in bytes */
   95   const char *s = luaL_checklstring(L, 1, &len);
   96   lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
   97   lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
   98   int lax = lua_toboolean(L, 4);
   99   luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
  100                    "initial position out of bounds");
  101   luaL_argcheck(L, --posj < (lua_Integer)len, 3,
  102                    "final position out of bounds");
  103   while (posi <= posj) {
  104     const char *s1 = utf8_decode(s + posi, NULL, !lax);
  105     if (s1 == NULL) {  /* conversion error? */
  106       luaL_pushfail(L);  /* return fail ... */
  107       lua_pushinteger(L, posi + 1);  /* ... and current position */
  108       return 2;
  109     }
  110     posi = s1 - s;
  111     n++;
  112   }
  113   lua_pushinteger(L, n);
  114   return 1;
  115 }
  116 
  117 
  118 /*
  119 ** codepoint(s, [i, [j [, lax]]]) -> returns codepoints for all
  120 ** characters that start in the range [i,j]
  121 */
  122 static int codepoint (lua_State *L) {
  123   size_t len;
  124   const char *s = luaL_checklstring(L, 1, &len);
  125   lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
  126   lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
  127   int lax = lua_toboolean(L, 4);
  128   int n;
  129   const char *se;
  130   luaL_argcheck(L, posi >= 1, 2, "out of bounds");
  131   luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of bounds");
  132   if (posi > pose) return 0;  /* empty interval; return no values */
  133   if (pose - posi >= INT_MAX)  /* (lua_Integer -> int) overflow? */
  134     return luaL_error(L, "string slice too long");
  135   n = (int)(pose -  posi) + 1;  /* upper bound for number of returns */
  136   luaL_checkstack(L, n, "string slice too long");
  137   n = 0;  /* count the number of returns */
  138   se = s + pose;  /* string end */
  139   for (s += posi - 1; s < se;) {
  140     utfint code;
  141     s = utf8_decode(s, &code, !lax);
  142     if (s == NULL)
  143       return luaL_error(L, "invalid UTF-8 code");
  144     lua_pushinteger(L, code);
  145     n++;
  146   }
  147   return n;
  148 }
  149 
  150 
  151 static void pushutfchar (lua_State *L, int arg) {
  152   lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg);
  153   luaL_argcheck(L, code <= MAXUTF, arg, "value out of range");
  154   lua_pushfstring(L, "%U", (long)code);
  155 }
  156 
  157 
  158 /*
  159 ** utfchar(n1, n2, ...)  -> char(n1)..char(n2)...
  160 */
  161 static int utfchar (lua_State *L) {
  162   int n = lua_gettop(L);  /* number of arguments */
  163   if (n == 1)  /* optimize common case of single char */
  164     pushutfchar(L, 1);
  165   else {
  166     int i;
  167     luaL_Buffer b;
  168     luaL_buffinit(L, &b);
  169     for (i = 1; i <= n; i++) {
  170       pushutfchar(L, i);
  171       luaL_addvalue(&b);
  172     }
  173     luaL_pushresult(&b);
  174   }
  175   return 1;
  176 }
  177 
  178 
  179 /*
  180 ** offset(s, n, [i])  -> index where n-th character counting from
  181 **   position 'i' starts; 0 means character at 'i'.
  182 */
  183 static int byteoffset (lua_State *L) {
  184   size_t len;
  185   const char *s = luaL_checklstring(L, 1, &len);
  186   lua_Integer n  = luaL_checkinteger(L, 2);
  187   lua_Integer posi = (n >= 0) ? 1 : len + 1;
  188   posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
  189   luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
  190                    "position out of bounds");
  191   if (n == 0) {
  192     /* find beginning of current byte sequence */
  193     while (posi > 0 && iscont(s + posi)) posi--;
  194   }
  195   else {
  196     if (iscont(s + posi))
  197       return luaL_error(L, "initial position is a continuation byte");
  198     if (n < 0) {
  199        while (n < 0 && posi > 0) {  /* move back */
  200          do {  /* find beginning of previous character */
  201            posi--;
  202          } while (posi > 0 && iscont(s + posi));
  203          n++;
  204        }
  205      }
  206      else {
  207        n--;  /* do not move for 1st character */
  208        while (n > 0 && posi < (lua_Integer)len) {
  209          do {  /* find beginning of next character */
  210            posi++;
  211          } while (iscont(s + posi));  /* (cannot pass final '\0') */
  212          n--;
  213        }
  214      }
  215   }
  216   if (n == 0)  /* did it find given character? */
  217     lua_pushinteger(L, posi + 1);
  218   else  /* no such character */
  219     luaL_pushfail(L);
  220   return 1;
  221 }
  222 
  223 
  224 static int iter_aux (lua_State *L, int strict) {
  225   size_t len;
  226   const char *s = luaL_checklstring(L, 1, &len);
  227   lua_Integer n = lua_tointeger(L, 2) - 1;
  228   if (n < 0)  /* first iteration? */
  229     n = 0;  /* start from here */
  230   else if (n < (lua_Integer)len) {
  231     n++;  /* skip current byte */
  232     while (iscont(s + n)) n++;  /* and its continuations */
  233   }
  234   if (n >= (lua_Integer)len)
  235     return 0;  /* no more codepoints */
  236   else {
  237     utfint code;
  238     const char *next = utf8_decode(s + n, &code, strict);
  239     if (next == NULL)
  240       return luaL_error(L, "invalid UTF-8 code");
  241     lua_pushinteger(L, n + 1);
  242     lua_pushinteger(L, code);
  243     return 2;
  244   }
  245 }
  246 
  247 
  248 static int iter_auxstrict (lua_State *L) {
  249   return iter_aux(L, 1);
  250 }
  251 
  252 static int iter_auxlax (lua_State *L) {
  253   return iter_aux(L, 0);
  254 }
  255 
  256 
  257 static int iter_codes (lua_State *L) {
  258   int lax = lua_toboolean(L, 2);
  259   luaL_checkstring(L, 1);
  260   lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict);
  261   lua_pushvalue(L, 1);
  262   lua_pushinteger(L, 0);
  263   return 3;
  264 }
  265 
  266 
  267 /* pattern to match a single UTF-8 character */
  268 #define UTF8PATT    "[\0-\x7F\xC2-\xFD][\x80-\xBF]*"
  269 
  270 
  271 static const luaL_Reg funcs[] = {
  272   {"offset", byteoffset},
  273   {"codepoint", codepoint},
  274   {"char", utfchar},
  275   {"len", utflen},
  276   {"codes", iter_codes},
  277   /* placeholders */
  278   {"charpattern", NULL},
  279   {NULL, NULL}
  280 };
  281 
  282 
  283 LUAMOD_API int luaopen_utf8 (lua_State *L) {
  284   luaL_newlib(L, funcs);
  285   lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1);
  286   lua_setfield(L, -2, "charpattern");
  287   return 1;
  288 }
  289