"Fossies" - the Fresh Open Source Software Archive

Member "txr-218/regex.c" (20 Jun 2019, 90553 Bytes) of package /linux/misc/txr-218.tar.bz2:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "regex.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 217_vs_218.

    1 /* Copyright 2009-2019
    2  * Kaz Kylheku <kaz@kylheku.com>
    3  * Vancouver, Canada
    4  * All rights reserved.
    5  *
    6  * Redistribution and use in source and binary forms, with or without
    7  * modification, are permitted provided that the following conditions are met:
    8  *
    9  * 1. Redistributions of source code must retain the above copyright notice, this
   10  *    list of conditions and the following disclaimer.
   11  *
   12  * 2. Redistributions in binary form must reproduce the above copyright notice,
   13  *    this list of conditions and the following disclaimer in the documentation
   14  *    and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
   17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   19  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
   20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
   22  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   23  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   26  */
   27 
   28 #include <stdio.h>
   29 #include <stdlib.h>
   30 #include <string.h>
   31 #include <wchar.h>
   32 #include <assert.h>
   33 #include <limits.h>
   34 #include <signal.h>
   35 #include <stdarg.h>
   36 #include "config.h"
   37 #include "alloca.h"
   38 #include "lib.h"
   39 #include "parser.h"
   40 #include "signal.h"
   41 #include "unwind.h"
   42 #include "stream.h"
   43 #include "gc.h"
   44 #include "eval.h"
   45 #include "cadr.h"
   46 #include "itypes.h"
   47 #include "regex.h"
   48 #include "txr.h"
   49 
   50 #if WCHAR_MAX > 65535
   51 #define FULL_UNICODE
   52 #endif
   53 
   54 typedef union nfa_state nfa_state_t;
   55 
   56 typedef struct nfa {
   57   nfa_state_t *start;
   58   nfa_state_t *accept;
   59 } nfa_t;
   60 
   61 typedef enum { REGEX_NFA, REGEX_DV } regex_kind_t;
   62 
   63 typedef struct regex {
   64   regex_kind_t kind;
   65   union {
   66     struct nfa nfa;
   67     val dv;
   68   } r;
   69   int nstates;
   70   val source;
   71 } regex_t;
   72 
   73 /*
   74  * Result from regex_machine_feed.
   75  * These values have two meanings, based on whether
   76  * the matching is still open (characters are being fed)
   77  * or finalized.
   78  *
   79  * When feeding characters:
   80  * REGM_INCOMPLETE: no match at this character, but matching can continue.
   81  * REGM_FAIL: no more state transitions are possible.
   82  * REGM_MATCH: match (accept state) for this character.
   83  *
   84  * When the end of the input is encountered, or a REGM_FAIL,
   85  * then regex_machine_feed is called one more time with
   86  * the null character. It then reports:
   87  * REGM_INCOMPLETE: there was a partial match for the input.
   88  * REGM_FAIL: none of the input matched.
   89  * REGM_MATCH: the input was completely matched
   90  *
   91  * Note that a REGM_FAIL (no transitions) during the character feeding phase
   92  * can turn into REGM_INCOMPLETE (partial match) when the match is sealed with
   93  * the null character signal!
   94  */
   95 typedef enum regm_result {
   96   REGM_INCOMPLETE,
   97   REGM_FAIL,
   98   REGM_MATCH
   99 } regm_result_t;
  100 
  101 typedef union regex_machine regex_machine_t;
  102 
  103 typedef unsigned int bitcell_t;
  104 
  105 #define CHAR_SET_SIZE (256 / (sizeof (bitcell_t) * CHAR_BIT))
  106 
  107 #define BITCELL_BIT (sizeof (bitcell_t) * CHAR_BIT)
  108 
  109 #define CHAR_SET_INDEX(CH) ((CH) / BITCELL_BIT)
  110 #define CHAR_SET_BIT(CH) ((CH) % BITCELL_BIT)
  111 
  112 #define CHAR_SET_L0(CH) ((CH) & 0xFF)
  113 #define CHAR_SET_L1(CH) (((CH) >> 8) & 0xF)
  114 #define CHAR_SET_L2(CH) (((CH) >> 12) & 0xF)
  115 #ifdef FULL_UNICODE
  116 #define CHAR_SET_L3(CH) (((CH) >> 16) & 0x1F)
  117 #endif
  118 
  119 #ifdef FULL_UNICODE
  120 #define CHAR_SET_L2_LO(CH) ((CH) & ~convert(wchar_t, 0xFFFF))
  121 #define CHAR_SET_L2_HI(CH) ((CH) | convert(wchar_t, 0xFFFF))
  122 #endif
  123 
  124 #define CHAR_SET_L1_LO(CH) ((CH) & ~convert(wchar_t, 0xFFF))
  125 #define CHAR_SET_L1_HI(CH) ((CH) | convert(wchar_t, 0xFFF))
  126 
  127 #define CHAR_SET_L0_LO(CH) ((CH) & ~convert(wchar_t, 0xFF))
  128 #define CHAR_SET_L0_HI(CH) ((CH) | convert(wchar_t, 0xFF))
  129 
  130 typedef enum {
  131   CHSET_SMALL, CHSET_DISPLACED, CHSET_LARGE,
  132 #ifdef FULL_UNICODE
  133   CHSET_XLARGE
  134 #endif
  135 } chset_type_t;
  136 
  137 typedef bitcell_t cset_L0_t[CHAR_SET_SIZE];
  138 typedef cset_L0_t *cset_L1_t[16];
  139 typedef cset_L1_t *cset_L2_t[16];
  140 #ifdef FULL_UNICODE
  141 typedef cset_L2_t *cset_L3_t[17];
  142 #endif
  143 
  144 struct any_char_set {
  145   unsigned type : 3;
  146   unsigned comp : 1;
  147   unsigned stat : 1;
  148 };
  149 
  150 struct small_char_set {
  151   unsigned type : 3;
  152   unsigned comp : 1;
  153   unsigned stat : 1;
  154   cset_L0_t bitcell;
  155 };
  156 
  157 struct displaced_char_set {
  158   unsigned type : 3;
  159   unsigned comp : 1;
  160   unsigned stat : 1;
  161   cset_L0_t bitcell;
  162   wchar_t base;
  163 };
  164 
  165 
  166 struct large_char_set {
  167   unsigned type : 3;
  168   unsigned comp : 1;
  169   unsigned stat : 1;
  170   cset_L2_t dir;
  171 };
  172 
  173 #ifdef FULL_UNICODE
  174 struct xlarge_char_set {
  175   unsigned type : 3;
  176   unsigned comp : 1;
  177   unsigned stat : 1;
  178   cset_L3_t dir;
  179 };
  180 #endif
  181 
  182 typedef union char_set {
  183   struct any_char_set any;
  184   struct small_char_set s;
  185   struct displaced_char_set d;
  186   struct large_char_set l;
  187 #ifdef FULL_UNICODE
  188   struct xlarge_char_set xl;
  189 #endif
  190 } char_set_t;
  191 
  192 typedef enum {
  193   nfa_empty, nfa_accept, nfa_wild, nfa_single, nfa_set
  194 } nfa_kind_t;
  195 
  196 struct nfa_state_any {
  197   nfa_kind_t kind;
  198   unsigned visited;
  199 };
  200 
  201 struct nfa_state_empty {
  202   nfa_kind_t kind;
  203   unsigned visited;
  204   nfa_state_t *trans0;
  205   nfa_state_t *trans1;
  206 };
  207 
  208 struct nfa_state_single {
  209   nfa_kind_t kind;
  210   unsigned visited;
  211   nfa_state_t *trans;
  212   wchar_t ch;
  213 };
  214 
  215 struct nfa_state_set {
  216   nfa_kind_t kind;
  217   unsigned visited;
  218   nfa_state_t *trans;
  219   char_set_t *set;
  220 };
  221 
  222 union nfa_state {
  223   struct nfa_state_any a;
  224   struct nfa_state_empty e;
  225   struct nfa_state_single o;
  226   struct nfa_state_set s;
  227 };
  228 
  229 #define nfa_accept_state_p(s) ((s)->a.kind == nfa_accept)
  230 #define nfa_empty_state_p(s) ((s)->a.kind == nfa_accept || \
  231                               (s)->a.kind == nfa_empty)
  232 
  233 struct nfa_machine {
  234   int is_nfa;           /* common member */
  235   cnum last_accept_pos; /* common member */
  236   cnum count;           /* common member */
  237   unsigned visited;
  238   nfa_state_t **set, **stack;
  239   int nclos;
  240   nfa_t nfa;
  241   int nstates;
  242 };
  243 
  244 struct dv_machine {
  245   int is_nfa;           /* common member */
  246   cnum last_accept_pos; /* common member */
  247   cnum count;           /* common member */
  248   val deriv;
  249   val regex;
  250 };
  251 
  252 union regex_machine {
  253   struct nfa_machine n;
  254   struct dv_machine d;
  255 };
  256 
  257 int opt_derivative_regex = 0;
  258 
  259 wchar_t spaces[] = {
  260   0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x0020, 0x00a0, 0x1680, 0x180e,
  261   0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008,
  262   0x2009, 0x200a, 0x2028, 0x2029, 0x205f, 0x3000, 0
  263 };
  264 
  265 static int L0_full(cset_L0_t *L0)
  266 {
  267   int i;
  268 
  269   for (i = 0; i < convert(int, CHAR_SET_SIZE); i++)
  270     if ((*L0)[i] != convert(bitcell_t, -1))
  271       return 0;
  272   return 1;
  273 }
  274 
  275 static void L0_fill_range(cset_L0_t *L0, wchar_t ch0, wchar_t ch1)
  276 {
  277   int i;
  278   int bt0 = CHAR_SET_BIT(ch0);
  279   int bc0 = CHAR_SET_INDEX(ch0);
  280   bitcell_t mask0 = ~((convert(bitcell_t, 1) << bt0) - 1);
  281   int bt1 = CHAR_SET_BIT(ch1);
  282   int bc1 = CHAR_SET_INDEX(ch1);
  283   bitcell_t mask1 = (bt1 == (BITCELL_BIT - 1))
  284                      ? convert(bitcell_t, -1)
  285                      : (convert(bitcell_t, 1) << (bt1 + 1)) - 1;
  286 
  287   if (bc1 == bc0) {
  288     (*L0)[bc0] |= (mask0 & mask1);
  289   } else {
  290     (*L0)[bc0] |= mask0;
  291     (*L0)[bc1] |= mask1;
  292     for (i = bc0 + 1; i < bc1; i++)
  293       (*L0)[i] = convert(bitcell_t, -1);
  294   }
  295 }
  296 
  297 static int L0_contains(cset_L0_t *L0, wchar_t ch)
  298 {
  299   return ((*L0)[CHAR_SET_INDEX(ch)] & (1 << CHAR_SET_BIT(ch))) != 0;
  300 }
  301 
  302 static int L1_full(cset_L1_t *L1)
  303 {
  304   int i;
  305   for (i = 0; i < 16; i++)
  306     if ((*L1)[i] != coerce(cset_L0_t *, -1))
  307       return 0;
  308   return 1;
  309 }
  310 
  311 static void L1_fill_range(cset_L1_t *L1, wchar_t ch0, wchar_t ch1)
  312 {
  313   int i1, i10, i11;
  314 
  315   i10 = CHAR_SET_L1(ch0);
  316   i11 = CHAR_SET_L1(ch1);
  317 
  318   for (i1 = i10; i1 <= i11; i1++) {
  319     wchar_t c0 = 0, c1 = 0;
  320     cset_L0_t *L0;
  321 
  322     if (i1 > i10 && i1 < i11) {
  323       free((*L1)[i1]);
  324       (*L1)[i1] = coerce(cset_L0_t *, -1);
  325       continue;
  326     } else if (i10 == i11) {
  327       c0 = ch0;
  328       c1 = ch1;
  329     } else if (i1 == i10) {
  330       c0 = ch0;
  331       c1 = CHAR_SET_L0_HI(ch0);
  332     } else if (i1 == i11) {
  333       c0 = CHAR_SET_L0_LO(ch1);
  334       c1 = ch1;
  335     }
  336 
  337     if ((L0 = (*L1)[i1]) == coerce(cset_L0_t *, -1))
  338       continue;
  339 
  340     if (L0 == 0) {
  341       static cset_L0_t blank;
  342       L0 = (*L1)[i1] = coerce(cset_L0_t *, chk_malloc(sizeof *L0));
  343       memcpy(L0, &blank, sizeof *L0);
  344     }
  345 
  346     L0_fill_range(L0, CHAR_SET_L0(c0), CHAR_SET_L0(c1));
  347 
  348     if (L0_full(L0)) {
  349       free(L0);
  350       (*L1)[i1] = coerce(cset_L0_t *, -1);
  351     }
  352   }
  353 }
  354 
  355 static int L1_contains(cset_L1_t *L1, wchar_t ch)
  356 {
  357   int i1 = CHAR_SET_L1(ch);
  358   cset_L0_t *L0 = (*L1)[i1];
  359 
  360   if (L0 == 0)
  361     return 0;
  362   else if (L0 == coerce(cset_L0_t *, -1))
  363     return 1;
  364   else
  365     return L0_contains(L0, CHAR_SET_L0(ch));
  366 }
  367 
  368 
  369 static void L1_free(cset_L1_t *L1)
  370 {
  371   int i1;
  372 
  373   if (L1 == coerce(cset_L1_t *, -1))
  374     return;
  375 
  376   for (i1 = 0; i1 < 16; i1++)
  377     if ((*L1)[i1] != coerce(cset_L0_t *, -1))
  378       free((*L1)[i1]);
  379 }
  380 
  381 #ifdef FULL_UNICODE
  382 static int L2_full(cset_L2_t *L2)
  383 {
  384   int i;
  385   for (i = 0; i < 16; i++)
  386     if ((*L2)[i] != coerce(cset_L1_t *, -1))
  387       return 0;
  388   return 1;
  389 }
  390 #endif
  391 
  392 static void L2_fill_range(cset_L2_t *L2, wchar_t ch0, wchar_t ch1)
  393 {
  394   int i2, i20, i21;
  395 
  396   i20 = CHAR_SET_L2(ch0);
  397   i21 = CHAR_SET_L2(ch1);
  398 
  399   for (i2 = i20; i2 <= i21; i2++) {
  400     wchar_t c0 = 0, c1 = 0;
  401     cset_L1_t *L1;
  402 
  403     if (i2 > i20 && i2 < i21) {
  404       free((*L2)[i2]);
  405       (*L2)[i2] = coerce(cset_L1_t *, -1);
  406       continue;
  407     } else if (i20 == i21) {
  408       c0 = ch0;
  409       c1 = ch1;
  410     } else if (i2 == i20) {
  411       c0 = ch0;
  412       c1 = CHAR_SET_L1_HI(ch0);
  413     } else if (i2 == i21) {
  414       c0 = CHAR_SET_L1_LO(ch1);
  415       c1 = ch1;
  416     }
  417 
  418     if ((L1 = (*L2)[i2]) == coerce(cset_L1_t *, -1))
  419       continue;
  420 
  421     if (L1 == 0) {
  422       static cset_L1_t blank;
  423       L1 = (*L2)[i2] = coerce(cset_L1_t *, chk_malloc(sizeof *L1));
  424       memcpy(L1, &blank, sizeof *L1);
  425     }
  426 
  427     L1_fill_range(L1, c0, c1);
  428 
  429     if (L1_full(L1)) {
  430       free(L1);
  431       (*L2)[i2] = coerce(cset_L1_t *, -1);
  432     }
  433   }
  434 }
  435 
  436 static int L2_contains(cset_L2_t *L2, wchar_t ch)
  437 {
  438   int i2 = CHAR_SET_L2(ch);
  439   cset_L1_t *L1 = (*L2)[i2];
  440 
  441   if (L1 == 0)
  442     return 0;
  443   else if (L1 == coerce(cset_L1_t *, -1))
  444     return 1;
  445   else
  446     return L1_contains(L1, ch);
  447 }
  448 
  449 static void L2_free(cset_L2_t *L2)
  450 {
  451   int i2;
  452 
  453   for (i2 = 0; i2 < 16; i2++) {
  454     cset_L1_t *L1 = (*L2)[i2];
  455     if (L1 != 0 && L1 != coerce(cset_L1_t *, -1)) {
  456       L1_free((*L2)[i2]);
  457       free((*L2)[i2]);
  458     }
  459   }
  460 }
  461 
  462 #ifdef FULL_UNICODE
  463 
  464 static void L3_fill_range(cset_L3_t *L3, wchar_t ch0, wchar_t ch1)
  465 {
  466   int i3, i30, i31;
  467 
  468   i30 = CHAR_SET_L3(ch0);
  469   i31 = CHAR_SET_L3(ch1);
  470 
  471   for (i3 = i30; i3 <= i31; i3++) {
  472     wchar_t c0 = 0, c1 = 0;
  473     cset_L2_t *L2;
  474 
  475     if (i3 > i30 && i3 < i31) {
  476       free((*L3)[i3]);
  477       (*L3)[i3] = coerce(cset_L2_t *, -1);
  478       continue;
  479     } else if (i30 == i31) {
  480       c0 = ch0;
  481       c1 = ch1;
  482     } else if (i3 == i30) {
  483       c0 = ch0;
  484       c1 = CHAR_SET_L2_HI(ch0);
  485     } else if (i3 == i31) {
  486       c0 = CHAR_SET_L2_LO(ch1);
  487       c1 = ch1;
  488     }
  489 
  490     if ((L2 = (*L3)[i3]) == coerce(cset_L2_t *, -1))
  491       continue;
  492 
  493     if (L2 == 0) {
  494       static cset_L2_t blank;
  495       L2 = (*L3)[i3] = coerce(cset_L2_t *, chk_malloc(sizeof *L2));
  496       memcpy(L2, &blank, sizeof *L2);
  497     }
  498 
  499     L2_fill_range(L2, c0, c1);
  500     if (L2_full(L2)) {
  501       free(L2);
  502       (*L3)[i3] = coerce(cset_L2_t *, -1);
  503     }
  504   }
  505 }
  506 
  507 
  508 static int L3_contains(cset_L3_t *L3, wchar_t ch)
  509 {
  510   int i3 = CHAR_SET_L3(ch);
  511   cset_L2_t *L2 = (*L3)[i3];
  512 
  513   if (L2 == 0)
  514     return 0;
  515   else if (L2 == coerce(cset_L2_t *, -1))
  516     return 1;
  517   else
  518     return L2_contains(L2, ch);
  519 }
  520 
  521 static void L3_free(cset_L3_t *L3)
  522 {
  523   int i3;
  524 
  525   for (i3 = 0; i3 < 17; i3++) {
  526     cset_L2_t *L2 = (*L3)[i3];
  527     if (L2 != 0 && L2 != coerce(cset_L2_t *, -1)) {
  528       L2_free((*L3)[i3]);
  529       free((*L3)[i3]);
  530     }
  531   }
  532 }
  533 
  534 #endif
  535 
  536 static char_set_t *char_set_create(chset_type_t type, wchar_t base, unsigned st)
  537 {
  538   static char_set_t blank;
  539   char_set_t *cs = coerce(char_set_t *, chk_malloc(sizeof *cs));
  540   *cs = blank;
  541   cs->any.type = type;
  542   cs->any.stat = st;
  543 
  544   if (type == CHSET_DISPLACED)
  545     cs->d.base = base;
  546 
  547   return cs;
  548 }
  549 
  550 static void char_set_destroy(char_set_t *set, int force)
  551 {
  552   if (!set)
  553     return;
  554 
  555   if (set->any.stat && !force)
  556     return;
  557 
  558   switch (set->any.type) {
  559   case CHSET_DISPLACED:
  560   case CHSET_SMALL:
  561     free(set);
  562     break;
  563   case CHSET_LARGE:
  564     L2_free(&set->l.dir);
  565     free(set);
  566     break;
  567 #ifdef FULL_UNICODE
  568   case CHSET_XLARGE:
  569     L3_free(&set->xl.dir);
  570     free(set);
  571     break;
  572 #endif
  573   }
  574 }
  575 
  576 static void char_set_compl(char_set_t *set)
  577 {
  578   set->any.comp = 1;
  579 }
  580 
  581 static void char_set_add(char_set_t *set, wchar_t ch)
  582 {
  583   switch (set->any.type) {
  584   case CHSET_DISPLACED:
  585     assert (ch >= set->d.base && ch < set->d.base + 256);
  586     ch -= set->d.base;
  587     /* fallthrough */
  588   case CHSET_SMALL:
  589     assert (ch < 256);
  590     set->s.bitcell[CHAR_SET_INDEX(ch)] |= (1 << CHAR_SET_BIT(ch));
  591     break;
  592   case CHSET_LARGE:
  593     assert (ch < 0x10000);
  594     L2_fill_range(&set->l.dir, ch, ch);
  595     break;
  596 #ifdef FULL_UNICODE
  597   case CHSET_XLARGE:
  598     assert (ch < 0x110000);
  599     L3_fill_range(&set->xl.dir, ch, ch);
  600     break;
  601 #endif
  602   }
  603 }
  604 
  605 static void char_set_add_range(char_set_t *set, wchar_t ch0, wchar_t ch1)
  606 {
  607   if (ch0 >= ch1)
  608     return;
  609 
  610   switch (set->any.type) {
  611   case CHSET_DISPLACED:
  612     assert (ch0 >= set->d.base && ch1 < set->d.base + 256);
  613     ch0 -= set->d.base;
  614     ch1 -= set->d.base;
  615     /* fallthrough */
  616   case CHSET_SMALL:
  617     assert (ch1 < 256);
  618     L0_fill_range(&set->s.bitcell, ch0, ch1);
  619     break;
  620   case CHSET_LARGE:
  621     assert (ch1 < 0x10000);
  622     L2_fill_range(&set->l.dir, ch0, ch1);
  623     break;
  624 #ifdef FULL_UNICODE
  625   case CHSET_XLARGE:
  626     assert (ch1 < 0x110000);
  627     L3_fill_range(&set->xl.dir, ch0, ch1);
  628     break;
  629 #endif
  630   }
  631 }
  632 
  633 static void char_set_add_str(char_set_t *set, wchar_t *str)
  634 {
  635   while (*str != 0)
  636     char_set_add(set, *str++);
  637 }
  638 
  639 static int char_set_contains(char_set_t *set, wchar_t ch)
  640 {
  641   int result = 0;
  642 
  643   switch (set->any.type) {
  644   case CHSET_DISPLACED:
  645     if (ch < set->d.base)
  646       break;
  647     ch -= set->d.base;
  648     /* fallthrough */
  649   case CHSET_SMALL:
  650     if (ch >= 256)
  651       break;
  652     result = L0_contains(&set->s.bitcell, ch);
  653     break;
  654   case CHSET_LARGE:
  655     if (ch >= 0x10000)
  656       break;
  657     result = L2_contains(&set->l.dir, ch);
  658     break;
  659 #ifdef FULL_UNICODE
  660   case CHSET_XLARGE:
  661     if (ch >= 0x110000)
  662       break;
  663     result = L3_contains(&set->xl.dir, ch);
  664     break;
  665 #endif
  666   }
  667 
  668   return set->any.comp ? !result : result;
  669 }
  670 
  671 static char_set_t *char_set_compile(val args, val comp)
  672 {
  673   val iter;
  674   wchar_t min = WCHAR_MAX;
  675   wchar_t max = 0;
  676   chset_type_t cst;
  677 
  678   for (iter = args; iter; iter = rest(iter)) {
  679     val item = first(iter);
  680 
  681     if (consp(item)) {
  682       val from = car(item);
  683       val to = cdr(item);
  684 
  685       assert (typeof(from) == chr_s && typeof(to) == chr_s);
  686 
  687       if (c_chr(from) < min)
  688         min = c_chr(from);
  689       if (c_chr(from) > max)
  690         max = c_chr(from);
  691 
  692       if (c_chr(to) < min)
  693         min = c_chr(to);
  694       if (c_chr(to) > max)
  695         max = c_chr(to);
  696     } else if (typeof(item) == chr_s) {
  697       if (c_chr(item) < min)
  698         min = c_chr(item);
  699       if (c_chr(item) > max)
  700         max = c_chr(item);
  701     } else if (item == space_k) {
  702       if (max < 0x3000)
  703         max = 0x3000;
  704       if (min > 0x9)
  705         min = 0x9;
  706     } else if (item == digit_k) {
  707       if (max < '9')
  708         max = 9;
  709       if (min > '0')
  710         min = 0;
  711     } else if (item == word_char_k) {
  712       if (min > 'A')
  713         min = 'A';
  714       if (max < 'z')
  715         max = 'z';
  716     } else if (item == cspace_k || item == cdigit_k || item == cword_char_k) {
  717       uw_throwf(error_s, lit("bad object in character class syntax: ~s"),
  718                 item, nao);
  719     } else {
  720       assert(0 && "bad regex set");
  721     }
  722   }
  723 
  724   if (max < 0x100)
  725     cst = CHSET_SMALL;
  726   else if (max - min < 0x100)
  727     cst = CHSET_DISPLACED;
  728   else if (max < 0x10000)
  729     cst = CHSET_LARGE;
  730   else
  731 #ifdef FULL_UNICODE
  732     cst = CHSET_XLARGE;
  733 #else
  734     cst = CHSET_LARGE;
  735 #endif
  736 
  737 
  738   {
  739     char_set_t *set = char_set_create(cst, min, 0);
  740 
  741     for (iter = args; iter; iter = rest(iter)) {
  742       val item = first(iter);
  743 
  744       if (consp(item)) {
  745         val from = car(item);
  746         val to = cdr(item);
  747 
  748         assert (typeof(from) == chr_s && typeof(to) == chr_s);
  749         char_set_add_range(set, c_chr(from), c_chr(to));
  750       } else if (typeof(item) == chr_s) {
  751         char_set_add(set, c_chr(item));
  752       } else if (item == space_k) {
  753         char_set_add_str(set, spaces);
  754       } else if (item == digit_k) {
  755         char_set_add_range(set, '0', '9');
  756       } else if (item == word_char_k) {
  757         char_set_add_range(set, 'A', 'Z');
  758         char_set_add_range(set, 'a', 'z');
  759         char_set_add(set, '_');
  760       } else {
  761         assert(0 && "bad regex set");
  762       }
  763     }
  764 
  765     if (comp)
  766       char_set_compl(set);
  767 
  768     return set;
  769   }
  770 }
  771 
  772 static char_set_t *space_cs, *digit_cs, *word_cs;
  773 static char_set_t *cspace_cs, *cdigit_cs, *cword_cs;
  774 
  775 static void init_special_char_sets(void)
  776 {
  777   space_cs = char_set_create(CHSET_LARGE, 0, 1);
  778   cspace_cs = char_set_create(CHSET_LARGE, 0, 1);
  779   digit_cs = char_set_create(CHSET_SMALL, 0, 1);
  780   cdigit_cs = char_set_create(CHSET_SMALL, 0, 1);
  781   word_cs = char_set_create(CHSET_SMALL, 0, 1);
  782   cword_cs = char_set_create(CHSET_SMALL, 0, 1);
  783 
  784   char_set_compl(cspace_cs);
  785   char_set_compl(cdigit_cs);
  786   char_set_compl(cword_cs);
  787 
  788   char_set_add_str(space_cs, spaces);
  789   char_set_add_str(cspace_cs, spaces);
  790 
  791   char_set_add_range(digit_cs, '0', '9');
  792   char_set_add_range(cdigit_cs, '0', '9');
  793 
  794   char_set_add_range(word_cs, 'A', 'Z');
  795   char_set_add_range(cword_cs, 'A', 'Z');
  796   char_set_add_range(word_cs, 'a', 'z');
  797   char_set_add_range(cword_cs, 'a', 'z');
  798   char_set_add(word_cs, '_');
  799   char_set_add(cword_cs, '_');
  800 }
  801 
  802 static void char_set_cobj_destroy(val chset)
  803 {
  804   char_set_t *set = coerce(char_set_t *, chset->co.handle);
  805   char_set_destroy(set, 0);
  806   chset->co.handle = 0;
  807 }
  808 
  809 static struct cobj_ops char_set_obj_ops = cobj_ops_init(eq,
  810                                                         cobj_print_op,
  811                                                         char_set_cobj_destroy,
  812                                                         cobj_mark_op,
  813                                                         cobj_eq_hash_op);
  814 
  815 static nfa_state_t *nfa_state_accept(void)
  816 {
  817   nfa_state_t *st = coerce(nfa_state_t *, chk_malloc(sizeof *st));
  818   st->e.kind = nfa_accept;
  819   st->e.visited = 0;
  820   st->e.trans0 = st->e.trans1 = 0;
  821   return st;
  822 }
  823 
  824 static nfa_state_t *nfa_state_empty(nfa_state_t *t0, nfa_state_t *t1)
  825 {
  826   nfa_state_t *st = coerce(nfa_state_t *, chk_malloc(sizeof *st));
  827   st->e.kind = nfa_empty;
  828   st->e.visited = 0;
  829   st->e.trans0 = t0;
  830   st->e.trans1 = t1;
  831   return st;
  832 }
  833 
  834 static nfa_state_t *nfa_state_single(nfa_state_t *t, wchar_t ch)
  835 {
  836   nfa_state_t *st = coerce(nfa_state_t *, chk_malloc(sizeof *st));
  837   st->o.kind = nfa_single;
  838   st->o.visited = 0;
  839   st->o.trans = t;
  840   st->o.ch = ch;
  841   return st;
  842 }
  843 
  844 static nfa_state_t *nfa_state_wild(nfa_state_t *t)
  845 {
  846   nfa_state_t *st = coerce(nfa_state_t *, chk_malloc(sizeof *st));
  847   st->o.kind = nfa_wild;
  848   st->o.visited = 0;
  849   st->o.trans = t;
  850   st->o.ch = 0;
  851   return st;
  852 }
  853 
  854 static void nfa_state_free(nfa_state_t *st)
  855 {
  856   if (st->a.kind == nfa_set)
  857     char_set_destroy(st->s.set, 0);
  858   free(st);
  859 }
  860 
  861 static void nfa_state_shallow_free(nfa_state_t *st)
  862 {
  863   free(st);
  864 }
  865 
  866 static nfa_state_t *nfa_state_set(nfa_state_t *t, char_set_t *cs)
  867 {
  868   nfa_state_t *st = coerce(nfa_state_t *, chk_malloc(sizeof *st));
  869   st->s.kind = nfa_set;
  870   st->s.visited = 0;
  871   st->s.trans = t;
  872   st->s.set = cs;
  873   return st;
  874 }
  875 
  876 /*
  877  * An acceptance state is converted to an empty transition
  878  * state with specified transitions. It thereby loses
  879  * its acceptance state status. This is used during
  880  * compilation to hook new output paths into an inner NFA,
  881  * either back to itself, or to a new state in the
  882  * surrounding new NFA.
  883  */
  884 static void nfa_state_empty_convert(nfa_state_t *acc, nfa_state_t *t0,
  885                                     nfa_state_t *t1)
  886 {
  887   assert (nfa_accept_state_p(acc));
  888   acc->e.kind = nfa_empty;
  889   acc->e.trans0 = t0;
  890   acc->e.trans1 = t1;
  891 }
  892 
  893 /*
  894  * Acceptance state takes on the kind of st, and all associated
  895  * data. I.e. we merge the identity of accept,
  896  * with the contents of st, such that the new state has
  897  * all of the outgoing arrows of st, and
  898  * all of the incoming arrows of acc.
  899  * This is easily done with an assignment, provided
  900  * that st doesn't have any incoming arrows.
  901  * We ensure that start states don't have any incoming
  902  * arrows in the compiler, by ensuring that repetition
  903  * operators terminate their backwards arrows on an
  904  * existing start state, and allocate a new start
  905  * state in front of it.
  906  */
  907 static void nfa_state_merge(nfa_state_t *acc, nfa_state_t *st)
  908 {
  909   assert (nfa_accept_state_p(acc));
  910   *acc = *st;
  911 }
  912 
  913 static nfa_t nfa_make(nfa_state_t *s, nfa_state_t *acc)
  914 {
  915   nfa_t ret;
  916   ret.start = s;
  917   ret.accept = acc;
  918   return ret;
  919 }
  920 
  921 /*
  922  * Combine two NFA's representing regexps that are catenated.
  923  * The acceptance state of the predecessor is merged with the start state of
  924  * the successor.
  925  */
  926 static nfa_t nfa_combine(nfa_t pred, nfa_t succ)
  927 {
  928   nfa_t ret;
  929   ret.start = pred.start;
  930   ret.accept = succ.accept;
  931   nfa_state_merge(pred.accept, succ.start);
  932   nfa_state_shallow_free(succ.start); /* No longer needed. */
  933   return ret;
  934 }
  935 
  936 static nfa_t nfa_compile_set(val args, val comp)
  937 {
  938   char_set_t *set = char_set_compile(args, comp);
  939   nfa_state_t *acc = nfa_state_accept();
  940   nfa_state_t *s = nfa_state_set(acc, set);
  941   return nfa_make(s, acc);
  942 }
  943 
  944 static nfa_t nfa_compile_given_set(char_set_t *set)
  945 {
  946   nfa_state_t *acc = nfa_state_accept();
  947   nfa_state_t *s = nfa_state_set(acc, set);
  948   return nfa_make(s, acc);
  949 }
  950 
  951 static nfa_t nfa_compile_regex(val regex);
  952 
  953 /*
  954  * Helper to nfa_compile_regex for compiling the argument list of
  955  * a compound regex.
  956  */
  957 
  958 static nfa_t nfa_compile_list(val exp_list)
  959 {
  960   nfa_t nfa_first = nfa_compile_regex(first(exp_list));
  961 
  962   if (rest(exp_list)) {
  963     nfa_t nfa_rest = nfa_compile_list(rest(exp_list));
  964     return nfa_combine(nfa_first, nfa_rest);
  965   } else {
  966     return nfa_first;
  967   }
  968 }
  969 
  970 /*
  971  * Input is the items from a regex form,
  972  * not including the regex symbol.
  973  * I.e.  (rest '(regex ...)) not '(regex ...).
  974  */
  975 static nfa_t nfa_compile_regex(val exp)
  976 {
  977   if (nilp(exp)) {
  978     nfa_state_t *acc = nfa_state_accept();
  979     nfa_state_t *s = nfa_state_empty(acc, 0);
  980     return nfa_make(s, acc);
  981   } else if (chrp(exp)) {
  982     nfa_state_t *acc = nfa_state_accept();
  983     nfa_state_t *s = nfa_state_single(acc, c_chr(exp));
  984     return nfa_make(s, acc);
  985   } else if (stringp(exp)) {
  986     return nfa_compile_regex(cons(compound_s, list_str(exp)));
  987   } else if (exp == wild_s) {
  988     nfa_state_t *acc = nfa_state_accept();
  989     nfa_state_t *s = nfa_state_wild(acc);
  990     return nfa_make(s, acc);
  991   } else if (exp == space_k) {
  992     return nfa_compile_given_set(space_cs);
  993   } else if (exp == digit_k) {
  994     return nfa_compile_given_set(digit_cs);
  995   } else if (exp == word_char_k) {
  996     return nfa_compile_given_set(word_cs);
  997   } else if (exp == cspace_k) {
  998     return nfa_compile_given_set(cspace_cs);
  999   } else if (exp == cdigit_k) {
 1000     return nfa_compile_given_set(cdigit_cs);
 1001   } else if (exp == cword_char_k) {
 1002     return nfa_compile_given_set(cword_cs);
 1003   } else if (consp(exp)) {
 1004     val sym = first(exp), args = rest(exp);
 1005 
 1006     if (sym == set_s) {
 1007       return nfa_compile_set(args, nil);
 1008     } else if (sym == cset_s) {
 1009       return nfa_compile_set(args, t);
 1010     } else if (sym == compound_s) {
 1011       return nfa_compile_list(args);
 1012     } else if (sym == zeroplus_s) {
 1013       nfa_t nfa_arg = nfa_compile_regex(first(args));
 1014       nfa_state_t *acc = nfa_state_accept();
 1015       /* New start state has empty transitions going through
 1016          the inner NFA, or skipping it right to the new acceptance state. */
 1017       nfa_state_t *s = nfa_state_empty(nfa_arg.start, acc);
 1018       /* Convert acceptance state of inner NFA to one which has
 1019          an empty transition back to the start state, and
 1020          an empty transition to the new acceptance state. */
 1021       nfa_state_empty_convert(nfa_arg.accept, nfa_arg.start, acc);
 1022       return nfa_make(s, acc);
 1023     } else if (sym == oneplus_s) {
 1024       /* One-plus case differs from zero-plus in that the new start state
 1025          does not have an empty transition to the acceptance state.
 1026          So the inner NFA must be traversed once. */
 1027       nfa_t nfa_arg = nfa_compile_regex(first(args));
 1028       nfa_state_t *acc = nfa_state_accept();
 1029       nfa_state_t *s = nfa_state_empty(nfa_arg.start, 0); /* <-- diff */
 1030       nfa_state_empty_convert(nfa_arg.accept, nfa_arg.start, acc);
 1031       return nfa_make(s, acc);
 1032     } else if (sym == optional_s) {
 1033       /* In this case, we can keep the acceptance state of the inner
 1034          NFA as the acceptance state of the new NFA. We simply add
 1035          a new start state which can short-circuit to it via an empty
 1036          transition.  */
 1037       nfa_t nfa_arg = nfa_compile_regex(first(args));
 1038       nfa_state_t *s = nfa_state_empty(nfa_arg.start, nfa_arg.accept);
 1039       return nfa_make(s, nfa_arg.accept);
 1040     } else if (sym == or_s) {
 1041       /* Simple: make a new start and acceptance state, which form
 1042          the ends of a spindle that goes through two branches. */
 1043       nfa_t nfa_first = nfa_compile_regex(first(args));
 1044       nfa_t nfa_second = nfa_compile_regex(second(args));
 1045       nfa_state_t *acc = nfa_state_accept();
 1046       /* New state s has empty transitions into each inner NFA. */
 1047       nfa_state_t *s = nfa_state_empty(nfa_first.start, nfa_second.start);
 1048       /* Acceptance state of each inner NFA converted to empty
 1049          transition to new combined acceptance state. */
 1050       nfa_state_empty_convert(nfa_first.accept, acc, 0);
 1051       nfa_state_empty_convert(nfa_second.accept, acc, 0);
 1052       return nfa_make(s, acc);
 1053     } else {
 1054       uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
 1055     }
 1056   } else if (exp == t) {
 1057     return nfa_make(0, 0);
 1058   } else {
 1059     uw_throwf(error_s, lit("bad object in regex syntax: ~s"), exp, nao);
 1060   }
 1061 }
 1062 
 1063 INLINE int nfa_test_set_visited(nfa_state_t *s, unsigned visited)
 1064 {
 1065   if (s && s->a.visited != visited) {
 1066     s->a.visited = visited;
 1067     return 1;
 1068   }
 1069   return 0;
 1070 }
 1071 
 1072 static void nfa_map_states(nfa_state_t *s,
 1073                            mem_t *ctx, void (*fun)(nfa_state_t *,
 1074                                                    mem_t *ctx),
 1075                            unsigned visited)
 1076 {
 1077   if (nfa_test_set_visited(s, visited)) {
 1078     fun(s, ctx);
 1079 
 1080     switch (s->a.kind) {
 1081     case nfa_empty:
 1082     case nfa_accept:
 1083       nfa_map_states(s->e.trans0, ctx, fun, visited);
 1084       nfa_map_states(s->e.trans1, ctx, fun, visited);
 1085       break;
 1086     case nfa_wild:
 1087     case nfa_single:
 1088     case nfa_set:
 1089       nfa_map_states(s->o.trans, ctx, fun, visited);
 1090       break;
 1091     }
 1092   }
 1093 }
 1094 
 1095 static void nfa_count_one(nfa_state_t *s, mem_t *ctx)
 1096 {
 1097   (void) s;
 1098   int *pcount = coerce(int *, ctx);
 1099   (*pcount)++;
 1100 }
 1101 
 1102 static int nfa_count_states(nfa_state_t *s)
 1103 {
 1104   int count = 0;
 1105   if (s) {
 1106     unsigned visited = s->a.visited + 1;
 1107     nfa_map_states(s, coerce(mem_t *, &count), nfa_count_one, visited);
 1108   }
 1109   return count;
 1110 }
 1111 
 1112 static void nfa_handle_wraparound(nfa_state_t *s, unsigned *pvisited)
 1113 {
 1114   if (s && *pvisited > UINT_MAX - 8) {
 1115     s->a.visited = UINT_MAX - 1;
 1116     (void) nfa_count_states(s);
 1117     s->a.visited = UINT_MAX;
 1118     (void) nfa_count_states(s);
 1119     *pvisited = 1;
 1120   }
 1121 }
 1122 
 1123 static void nfa_collect_one(nfa_state_t *s, mem_t *ctx)
 1124 {
 1125   nfa_state_t ***ppel = coerce(nfa_state_t ***, ctx);
 1126   *(*ppel)++ = s;
 1127 }
 1128 
 1129 static void nfa_free(nfa_t nfa, int nstates)
 1130 {
 1131   nfa_state_t **all = coerce(nfa_state_t **, alloca(nstates * sizeof *all));
 1132   nfa_state_t **pelem = all, *s = nfa.start;
 1133   unsigned visited = s->a.visited + 1;
 1134   int i;
 1135 
 1136   /* We don't care if visited has reached UINT_MAX here, because the regex is
 1137    * going away, so we don't bother with nfa_handle_wraparound.
 1138    */
 1139   nfa_map_states(s, coerce(mem_t *, &pelem), nfa_collect_one, visited);
 1140 
 1141   assert (pelem - all == nstates);
 1142 
 1143   for (i = 0; i < nstates; i++)
 1144     nfa_state_free(all[i]);
 1145 }
 1146 
 1147 static void nfa_thread_epsilons(nfa_state_t **ps, unsigned visited)
 1148 {
 1149   nfa_state_t *s = *ps, **ps0 = 0, **ps1 = 0;
 1150 
 1151   if (!s)
 1152     return;
 1153 
 1154   switch (s->a.kind) {
 1155   case nfa_empty:
 1156     if (s->e.trans0 && s->e.trans1) {
 1157       ps0 = &s->e.trans0;
 1158       ps1 = &s->e.trans1;
 1159     } else if (s->e.trans0) {
 1160       *ps = s->e.trans0;
 1161       ps0 = ps;
 1162     } else if (s->e.trans1) {
 1163       *ps = s->e.trans1;
 1164       ps0 = ps;
 1165     } else {
 1166       *ps = 0;
 1167     }
 1168     break;
 1169   case nfa_accept:
 1170     ps0 = &s->e.trans0;
 1171     ps1 = &s->e.trans1;
 1172     break;
 1173   case nfa_single:
 1174   case nfa_wild:
 1175   case nfa_set:
 1176     ps0 = &s->o.trans;
 1177     break;
 1178   }
 1179 
 1180   if (nfa_test_set_visited(s, visited)) {
 1181     if (ps1)
 1182       nfa_thread_epsilons(ps1, visited);
 1183     if (ps0)
 1184       nfa_thread_epsilons(ps0, visited);
 1185   }
 1186 }
 1187 
 1188 static void nfa_fold_accept(nfa_state_t *s, mem_t *ctx)
 1189 {
 1190   (void) ctx;
 1191 
 1192   if (s->a.kind == nfa_empty) {
 1193     nfa_state_t *e0 = s->e.trans0;
 1194     nfa_state_t *e1 = s->e.trans1;
 1195 
 1196     if (e0 && nfa_accept_state_p(e0)) {
 1197       s->a.kind = nfa_accept;
 1198       s->e.trans0 = 0;
 1199     }
 1200 
 1201     if (e1 && nfa_accept_state_p(e1)) {
 1202       s->a.kind = nfa_accept;
 1203       s->e.trans1 = 0;
 1204     }
 1205   }
 1206 }
 1207 
 1208 static void nfa_noop(nfa_state_t *s, mem_t *ctx)
 1209 {
 1210   (void) s;
 1211   (void) ctx;
 1212 }
 1213 
 1214 static nfa_t nfa_optimize(nfa_t nfa)
 1215 {
 1216   if (nfa.start) {
 1217     int nstates = nfa_count_states(nfa.start), i;
 1218     nfa_state_t **all = coerce(nfa_state_t **, alloca(nstates * sizeof *all));
 1219     nfa_state_t **pelem = all;
 1220     unsigned visited;
 1221 
 1222     /* Get all states in flat array. */
 1223     nfa_map_states(nfa.start, coerce(mem_t *, &pelem), nfa_collect_one, nfa.start->a.visited + 1);
 1224 
 1225     /* Eliminate useless epsilons from graph. */
 1226     nfa_thread_epsilons(&nfa.start, nfa.start->a.visited + 1);
 1227 
 1228     /* Fold accept states into empty transitions which reference them. */
 1229     nfa_map_states(nfa.start, 0, nfa_fold_accept, nfa.start->a.visited + 1);
 1230 
 1231     /* Visit all reachable states. */
 1232     nfa_map_states(nfa.start, 0, nfa_noop, nfa.start->a.visited + 1);
 1233 
 1234     /* Garbage-collect unreachable states. */
 1235     visited = nfa.start->a.visited;
 1236 
 1237     for (i = 0; i < nstates; i++) {
 1238       nfa_state_t *s = all[i];
 1239       if (s->a.visited != visited)
 1240         nfa_state_free(s);
 1241     }
 1242   }
 1243   return nfa;
 1244 }
 1245 
 1246 
 1247 /*
 1248  * Compute the epsilon-closure of the NFA states stored in the set, whose
 1249  * size is given by nin. The results are stored back in the same array, the
 1250  * size of which is returned. The stack parameter provides storage used by the
 1251  * algorithm, so it doesn't have to be allocated and freed repeatedly.
 1252  * The visited parameter is a stamp used for marking states which are added
 1253  * to the epsilon-closure set, so that sets are not added twice.
 1254  * If any of the states added to the closure are acceptance states,
 1255  * the accept parameter is used to store the flag 1.
 1256  *
 1257  * An epsilon-closure is the set of all input states, plus all additional
 1258  * states which are reachable from that set with empty (epsilon) transitions.
 1259  * (Transitions that don't do not consume and match an input character).
 1260  */
 1261 static int nfa_closure(nfa_state_t **stack, nfa_state_t **set, int nin,
 1262                        int nstates, unsigned visited, int *accept)
 1263 {
 1264   int i, nout = 0;
 1265   int stackp = 0;
 1266 
 1267   /* First, add all states in the input set to the closure,
 1268      push them on the stack, and mark them as visited. */
 1269   for (i = 0; i < nin; i++) {
 1270     nfa_state_t *s = set[i];
 1271     bug_unless (stackp < nstates);
 1272 
 1273     s->a.visited = visited;
 1274     stack[stackp++] = s;
 1275     set[nout++] = s;
 1276     if (nfa_accept_state_p(s))
 1277       *accept = 1;
 1278   }
 1279 
 1280   while (stackp) {
 1281     nfa_state_t *top = stack[--stackp];
 1282 
 1283     /* Only states of type nfa_empty are interesting.
 1284        Each such state at most two epsilon transitions. */
 1285 
 1286     if (nfa_empty_state_p(top)) {
 1287       nfa_state_t *e0 = top->e.trans0;
 1288       nfa_state_t *e1 = top->e.trans1;
 1289 
 1290       if (nfa_test_set_visited(e0, visited)) {
 1291         stack[stackp++] = e0;
 1292         set[nout++] = e0;
 1293         if (nfa_accept_state_p(e0))
 1294           *accept = 1;
 1295       }
 1296 
 1297       if (nfa_test_set_visited(e1, visited)) {
 1298         stack[stackp++] = e1;
 1299         set[nout++] = e1;
 1300         if (nfa_accept_state_p(e1))
 1301           *accept = 1;
 1302       }
 1303     }
 1304   }
 1305 
 1306   bug_unless (nout <= nstates);
 1307 
 1308   return nout;
 1309 }
 1310 
 1311 /*
 1312  * The nfa_move_closure function combines nfa_move and nfa_closure into a
 1313  * single operation, elminating an intermediate array.
 1314  */
 1315 static int nfa_move_closure(nfa_state_t **stack, nfa_state_t **set, int nin,
 1316                             int nstates, wchar_t ch,
 1317                             unsigned visited, int *accept)
 1318 {
 1319   int i, nout, stackp;
 1320 
 1321   /*
 1322    * Compute the move set from a given set of NFA states. The move
 1323    * set is the set of states which are reachable from the set of
 1324    * input states on the consumpion of the input character given by ch.
 1325    */
 1326   for (nout = 0, stackp = 0, i = 0; i < nin; i++) {
 1327     nfa_state_t *s = set[i];
 1328 
 1329     switch (s->a.kind) {
 1330     case nfa_wild:
 1331       /* Unconditional match; don't have to look at ch. */
 1332       break;
 1333     case nfa_single:
 1334       if (s->o.ch == ch) /* Character match. */
 1335         break;
 1336       continue; /* no match */
 1337     case nfa_set:
 1338       if (char_set_contains(s->s.set, ch)) /* Set match. */
 1339         break;
 1340       continue; /* no match */
 1341     default:
 1342       /* Epsilon-transition and acceptance states have no character moves. */
 1343       continue;
 1344     }
 1345 
 1346     /* The state matches the character, so add it to the move set.
 1347        C trick: all character-transitioning state types have the
 1348        pointer to the next state in the same position,
 1349        among a common set of leading struct members in the union,
 1350        so we can use s->o.trans. */
 1351     {
 1352       nfa_state_t *mov = s->o.trans;
 1353 
 1354       bug_unless (stackp < nstates);
 1355 
 1356       if (nfa_test_set_visited(mov, visited)) {
 1357         stack[stackp++] = mov;
 1358         set[nout++] = mov;
 1359         if (nfa_accept_state_p(mov))
 1360           *accept = 1;
 1361       }
 1362     }
 1363   }
 1364 
 1365   while (stackp) {
 1366     nfa_state_t *top = stack[--stackp];
 1367 
 1368     /* Only states of type nfa_empty are interesting.
 1369        Each such state at most two epsilon transitions. */
 1370 
 1371     if (nfa_empty_state_p(top)) {
 1372       nfa_state_t *e0 = top->e.trans0;
 1373       nfa_state_t *e1 = top->e.trans1;
 1374 
 1375       if (nfa_test_set_visited(e0, visited)) {
 1376         stack[stackp++] = e0;
 1377         set[nout++] = e0;
 1378         if (nfa_accept_state_p(e0))
 1379           *accept = 1;
 1380       }
 1381 
 1382       if (nfa_test_set_visited(e1, visited)) {
 1383         stack[stackp++] = e1;
 1384         set[nout++] = e1;
 1385         if (nfa_accept_state_p(e1))
 1386           *accept = 1;
 1387       }
 1388     }
 1389   }
 1390 
 1391   bug_unless (nout <= nstates);
 1392 
 1393   return nout;
 1394 }
 1395 
 1396 /*
 1397  * Match regex against the string in. The match is
 1398  * anchored to the front of the string; to search
 1399  * within the string, a .* must be added to the front
 1400  * of the regex.
 1401  *
 1402  * Returns the length of the prefix of the string
 1403  * which matches the regex.  Or, if you will,
 1404  * the position of the first mismatching
 1405  * character.
 1406  *
 1407  * If the regex does not match at all, zero is
 1408  * returned.
 1409  *
 1410  * Matching stops when a state is reached from which
 1411  * there are no transitions on the next input character,
 1412  * or when the string runs out of characters.
 1413  * The most recently visited acceptance state then
 1414  * determines the match length (defaulting to zero
 1415  * if no acceptance states were encountered).
 1416  */
 1417 static cnum nfa_run(nfa_t nfa, int nstates, const wchar_t *str)
 1418 {
 1419   const wchar_t *last_accept_pos = 0, *ptr = str;
 1420   unsigned visited = nfa.start ? nfa.start->a.visited : 0;
 1421   nfa_state_t **set = coerce(nfa_state_t **, alloca(nstates * sizeof *set));
 1422   nfa_state_t **stack = coerce(nfa_state_t **, alloca(nstates * sizeof *stack));
 1423   int nclos = 0;
 1424   int accept = 0;
 1425 
 1426   nfa_handle_wraparound(nfa.start, &visited);
 1427 
 1428   if (nfa.start) {
 1429     set[0] = nfa.start;
 1430     nclos = nfa_closure(stack, set, 1, nstates, ++visited, &accept);
 1431   }
 1432 
 1433   if (accept)
 1434     last_accept_pos = ptr;
 1435 
 1436   for (; *ptr != 0; ptr++) {
 1437     wchar_t ch = *ptr;
 1438 
 1439     accept = 0;
 1440 
 1441     nfa_handle_wraparound(nfa.start, &visited);
 1442 
 1443     nclos = nfa_move_closure(stack, set, nclos,
 1444                              nstates, ch, ++visited, &accept);
 1445 
 1446     if (accept)
 1447       last_accept_pos = ptr + 1;
 1448 
 1449     if (nclos == 0) /* dead end; no match */
 1450       break;
 1451   }
 1452 
 1453   if (nfa.start)
 1454     nfa.start->a.visited = visited;
 1455   return last_accept_pos ? last_accept_pos - str : -1;
 1456 }
 1457 
 1458 static cnum regex_machine_match_span(regex_machine_t *regm)
 1459 {
 1460   return regm->n.last_accept_pos;
 1461 }
 1462 
 1463 static void regex_destroy(val obj)
 1464 {
 1465   regex_t *regex = coerce(regex_t *, obj->co.handle);
 1466   if (regex->kind == REGEX_NFA)
 1467     nfa_free(regex->r.nfa, regex->nstates);
 1468   free(regex);
 1469   obj->co.handle = 0;
 1470 }
 1471 
 1472 static void regex_mark(val obj)
 1473 {
 1474   regex_t *regex = coerce(regex_t *, obj->co.handle);
 1475   if (regex->kind == REGEX_DV)
 1476     gc_mark(regex->r.dv);
 1477   gc_mark(regex->source);
 1478 }
 1479 
 1480 static void regex_print(val obj, val stream, val pretty, struct strm_ctx *);
 1481 
 1482 static struct cobj_ops regex_obj_ops = cobj_ops_init(eq,
 1483                                                      regex_print,
 1484                                                      regex_destroy,
 1485                                                      regex_mark,
 1486                                                      cobj_eq_hash_op);
 1487 
 1488 static val reg_nullable(val);
 1489 
 1490 static val reg_expand_nongreedy(val exp)
 1491 {
 1492   if (atom(exp)) {
 1493     return exp;
 1494   } else if (consp(exp)) {
 1495     val sym = first(exp);
 1496     val args = rest(exp);
 1497 
 1498     if (sym == set_s || sym == cset_s) {
 1499       return exp;
 1500     } else if (sym == compound_s || sym == zeroplus_s || sym == oneplus_s ||
 1501                sym == optional_s || sym == compl_s ||
 1502                sym == or_s || sym == and_s)
 1503     {
 1504       list_collect_decl (out, iter);
 1505       iter = list_collect(iter, sym);
 1506       for (; args; args = cdr(args))
 1507         iter = list_collect(iter, reg_expand_nongreedy(first(args)));
 1508       return out;
 1509     } else if (sym == nongreedy_s) {
 1510       val xfirst = reg_expand_nongreedy(first(args));
 1511       val xsecond = reg_expand_nongreedy(second(args));
 1512       val zplus = cons(zeroplus_s, cons(xfirst, nil));
 1513 
 1514       if (xsecond == nil) {
 1515         return zplus;
 1516       } else {
 1517         val any = list(zeroplus_s, wild_s, nao);
 1518         val notempty = list(oneplus_s, wild_s, nao);
 1519 
 1520         return list(compound_s,
 1521                     list(and_s,
 1522                          zplus,
 1523                          list(compl_s,
 1524                               list(compound_s,
 1525                                    any,
 1526                                    if3(reg_nullable(xsecond),
 1527                                        list(and_s, xsecond, notempty, nao),
 1528                                        xsecond),
 1529                                    any, nao),
 1530                               nao),
 1531                          nao),
 1532                     xsecond, nao);
 1533       }
 1534     } else {
 1535       uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
 1536     }
 1537   } else {
 1538     uw_throwf(error_s, lit("bad object in regex syntax: ~s"), exp, nao);
 1539   }
 1540 }
 1541 
 1542 static val reg_nary_to_bin(val regex);
 1543 
 1544 static val reg_nary_unfold(val sym, val args, val orig)
 1545 {
 1546   if (atom(args)) {
 1547     return t; /* Nullary intersection and union are both empty. */
 1548   } else if (!cdr(args)) {
 1549     return reg_nary_to_bin(car(args));
 1550   } else if (!cddr(args)) {
 1551     val rx1 = pop(&args);
 1552     val rx2 = pop(&args);
 1553     val newrx1 = reg_nary_to_bin(rx1);
 1554     val newrx2 = reg_nary_to_bin(rx2);
 1555     if (!orig || rx1 != newrx1 || rx2 != newrx2)
 1556       return list(sym, newrx1, newrx2, nao);
 1557     return orig;
 1558   } else {
 1559     return list(sym,
 1560                 reg_nary_to_bin(car(args)),
 1561                 reg_nary_unfold(sym, cdr(args), nil), nao);
 1562   }
 1563 }
 1564 
 1565 static val reg_nary_to_bin(val regex)
 1566 {
 1567   if (atom(regex)) {
 1568     return regex;
 1569   } else {
 1570     val sym = first(regex);
 1571     val args = rest(regex);
 1572 
 1573     if (sym == or_s || sym == and_s) {
 1574       return reg_nary_unfold(sym, args, regex);
 1575     } else if (sym == compound_s || sym == zeroplus_s || sym == oneplus_s ||
 1576                sym == optional_s || sym == compl_s || sym == nongreedy_s)
 1577     {
 1578      list_collect_decl (out, ptail);
 1579       val args_orig = args;
 1580       val nochange = t;
 1581 
 1582       for (; args; args = cdr(args))  {
 1583         val rx = car(args);
 1584         val newrx = reg_nary_to_bin(car(args));
 1585 
 1586         if (nochange && rx != newrx) {
 1587           ptail = list_collect_nconc(ptail, ldiff(args_orig, args));
 1588           nochange = nil;
 1589         }
 1590 
 1591         if (!nochange)
 1592           ptail = list_collect(ptail, newrx);
 1593       }
 1594 
 1595       if (!nochange)
 1596         return cons(sym, out);
 1597     }
 1598 
 1599     return regex;
 1600   }
 1601 }
 1602 
 1603 static val reg_compile_csets(val exp)
 1604 {
 1605   if (exp == space_k) {
 1606     return cobj(coerce(mem_t *, space_cs), chset_s, &char_set_obj_ops);
 1607   } else if (exp == digit_k) {
 1608     return cobj(coerce(mem_t *, digit_cs), chset_s, &char_set_obj_ops);
 1609   } else if (exp == word_char_k) {
 1610     return cobj(coerce(mem_t *, word_cs), chset_s, &char_set_obj_ops);
 1611   } else if (exp == cspace_k) {
 1612     return cobj(coerce(mem_t *, cspace_cs), chset_s, &char_set_obj_ops);
 1613   } else if (exp == cdigit_k) {
 1614     return cobj(coerce(mem_t *, cdigit_cs), chset_s, &char_set_obj_ops);
 1615   } else if (exp == cword_char_k) {
 1616     return cobj(coerce(mem_t *, cword_cs), chset_s, &char_set_obj_ops);
 1617   } else if (symbolp(exp) || chrp(exp)) {
 1618     return exp;
 1619   } else if (stringp(exp)) {
 1620     return cons(compound_s, list_str(exp));
 1621   } else if (consp(exp)) {
 1622     val sym = first(exp);
 1623     val args = rest(exp);
 1624 
 1625     if (sym == set_s || sym == cset_s) {
 1626       char_set_t *set = char_set_compile(args, eq(sym, cset_s));
 1627       return cobj(coerce(mem_t *, set), chset_s, &char_set_obj_ops);
 1628     } else if (sym == compound_s || sym == zeroplus_s || sym == oneplus_s ||
 1629                sym == optional_s || sym == compl_s || sym == nongreedy_s ||
 1630                sym == or_s || sym == and_s)
 1631     {
 1632       list_collect_decl (out, iter);
 1633       iter = list_collect(iter, sym);
 1634       for (; args; args = cdr(args))
 1635         iter = list_collect(iter, reg_compile_csets(first(args)));
 1636       return out;
 1637     } else {
 1638       uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
 1639     }
 1640   } else {
 1641     uw_throwf(error_s, lit("bad object in regex syntax: ~s"), exp, nao);
 1642   }
 1643 }
 1644 
 1645 /*
 1646  * Helper to reg_nullable for recursing over
 1647  * contents of a compound expression.
 1648  */
 1649 static val reg_nullable_list(val exp_list)
 1650 {
 1651   if (rest(exp_list)) {
 1652     return if2(reg_nullable(first(exp_list)) &&
 1653                reg_nullable_list(rest(exp_list)),
 1654                t);
 1655   } else {
 1656     return reg_nullable(first(exp_list));
 1657   }
 1658 }
 1659 
 1660 /*
 1661  * Determine whether the given regular expression is nullable: that is
 1662  * to say, can the regular expression match the empty string?
 1663  */
 1664 static val reg_nullable(val exp)
 1665 {
 1666   if (exp == nil) {
 1667     return t;
 1668   } else if (atom(exp)) {
 1669     return nil;
 1670   } else {
 1671     val sym = first(exp), args = rest(exp);
 1672 
 1673     if (sym == set_s || sym == cset_s) {
 1674       return nil;
 1675     } else if (sym == compound_s) {
 1676       return reg_nullable_list(args);
 1677     } else if (sym == oneplus_s) {
 1678       return reg_nullable(first(args));
 1679     } else if (sym == zeroplus_s || sym == optional_s) {
 1680       return t;
 1681     } else if (sym == compl_s) {
 1682       return if3(reg_nullable(first(args)), nil, t);
 1683     } else if (sym == or_s) {
 1684       return if2((reg_nullable(first(args)) || reg_nullable(second(args))), t);
 1685     } else if (sym == and_s) {
 1686       return if2((reg_nullable(first(args)) && reg_nullable(second(args))), t);
 1687     } else {
 1688       uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
 1689     }
 1690   }
 1691 }
 1692 
 1693 static val reg_matches_all(val exp)
 1694 {
 1695   if (atom(exp)) {
 1696     return nil;
 1697   } else {
 1698     val sym = first(exp), args = rest(exp);
 1699 
 1700     if (sym == set_s || sym == cset_s) {
 1701       return nil;
 1702     } else if (sym == compound_s) {
 1703       val am = nil;
 1704       for (; args; args = cdr(args)) {
 1705         val arg = car(args);
 1706         if (!reg_nullable(arg))
 1707           return nil;
 1708         if (!am && reg_matches_all(arg))
 1709           am = t;
 1710       }
 1711       return am;
 1712     } else if (sym == oneplus_s) {
 1713       return reg_matches_all(car(args));
 1714     } else if (sym == zeroplus_s) {
 1715       val arg = car(args);
 1716       if (arg == wild_s || reg_matches_all(arg))
 1717         return t;
 1718       return nil;
 1719     } else if (sym == optional_s) {
 1720       return reg_matches_all(car(args));
 1721     } else if (sym == compl_s) {
 1722       val arg = car(args);
 1723       return if2(arg == t, t);
 1724     } else if (sym == or_s) {
 1725       return tnil(reg_matches_all(pop(&args)) || reg_matches_all(pop(&args)));
 1726     } else if (sym == and_s) {
 1727       return tnil(reg_matches_all(pop(&args)) && reg_matches_all(pop(&args)));
 1728     } else {
 1729       uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
 1730     }
 1731   }
 1732 }
 1733 
 1734 static val flatten_or(val or_expr)
 1735 {
 1736   if (atom(or_expr) || car(or_expr) != or_s) {
 1737     return cons(or_expr, nil);
 1738   } else {
 1739     val left = second(or_expr);
 1740     val right = third(or_expr);
 1741     return nappend2(flatten_or(left), flatten_or(right));
 1742   }
 1743 }
 1744 
 1745 static val unflatten_or(val exlist)
 1746 {
 1747   val f = first(exlist);
 1748   val r = rest(exlist);
 1749 
 1750   if (r) {
 1751     return cons(or_s, cons(f, cons(unflatten_or(r), nil)));
 1752   } else {
 1753     return f;
 1754   }
 1755 }
 1756 
 1757 static val unique_first(val exlist)
 1758 {
 1759   val f = first(exlist);
 1760   val r = rest(exlist);
 1761 
 1762   if (!memqual(f, r))
 1763     return cons(first(exlist), nil);
 1764   return nil;
 1765 }
 1766 
 1767 static val reduce_or(val or_expr)
 1768 {
 1769   val left = second(or_expr);
 1770   val right = third(or_expr);
 1771 
 1772   /*
 1773    * Do optimization only if this is an or of two or expressions.
 1774    */
 1775 
 1776   if (consp(left) && first(left) == or_s &&
 1777       consp(right) && first(right) == or_s)
 1778   {
 1779     val exlist = flatten_or(or_expr);
 1780     val repeats_removed = mapcon(func_n1(unique_first), exlist);
 1781     return unflatten_or(repeats_removed);
 1782   } else {
 1783     return or_expr;
 1784   }
 1785 }
 1786 
 1787 static val reg_derivative(val, val);
 1788 
 1789 static val reg_derivative_list(val exp_list, val ch)
 1790 {
 1791   if (rest(exp_list)) {
 1792     if (reg_nullable(first(exp_list))) {
 1793       val d_first = reg_derivative(first(exp_list), ch);
 1794       val d_rest = reg_derivative_list(rest(exp_list), ch);
 1795 
 1796       if (d_rest == t && d_first == t)
 1797         return t;
 1798 
 1799       if (d_rest == t)
 1800         return if3(d_first == nil,
 1801                    cons(compound_s, rest(exp_list)),
 1802                    cons(compound_s, cons(d_first, rest(exp_list))));
 1803 
 1804       if (d_first == t)
 1805         return d_rest;
 1806 
 1807       return list(or_s,
 1808                   if3(d_first == nil,
 1809                       cons(compound_s, rest(exp_list)),
 1810                       cons(compound_s, cons(d_first, rest(exp_list)))),
 1811                   d_rest,
 1812                   nao);
 1813     } else {
 1814       val d_first = reg_derivative(first(exp_list), ch);
 1815 
 1816       if (d_first == t)
 1817         return t;
 1818       else if (d_first == nil)
 1819         return cons(compound_s, rest(exp_list));
 1820       else
 1821         return cons(compound_s,
 1822                     cons(d_first, rest(exp_list)));
 1823     }
 1824   } else {
 1825     return reg_derivative(first(exp_list), ch);
 1826   }
 1827 }
 1828 
 1829 /*
 1830  * Determine derivative of regex with respect to character.
 1831  */
 1832 static val reg_derivative(val exp, val ch)
 1833 {
 1834   if (exp == nil || exp == t) {
 1835     return t;
 1836   } else if (chrp(exp)) {
 1837     return null(eq(exp, ch));
 1838   } else if (typeof(exp) == chset_s) {
 1839     char_set_t *set = coerce(char_set_t *, exp->co.handle);
 1840     return if3(char_set_contains(set, c_chr(ch)), nil, t);
 1841   } else if (exp == wild_s) {
 1842     return nil;
 1843   } else {
 1844     val sym = first(exp);
 1845     val args = rest(exp);
 1846 
 1847     if (sym == compound_s) {
 1848       return reg_derivative_list(args, ch);
 1849     } else if (sym == optional_s) {
 1850       return reg_derivative(first(args), ch);
 1851     } else if (sym == oneplus_s) {
 1852       val arg = first(args);
 1853       val d_arg = reg_derivative(arg, ch);
 1854       if (d_arg == t)
 1855         return t;
 1856       if (d_arg == nil)
 1857         return cons(zeroplus_s, cons(arg, nil));
 1858       return cons(compound_s, cons(d_arg,
 1859                                    cons(cons(zeroplus_s,
 1860                                              cons(arg, nil)), nil)));
 1861     } else if (sym == zeroplus_s) {
 1862       val arg = first(args);
 1863       val d_arg = reg_derivative(arg, ch);
 1864       if (d_arg == t)
 1865         return t;
 1866       if (d_arg == nil)
 1867         return exp;
 1868       return cons(compound_s, cons(d_arg, cons(exp, nil)));
 1869     } else if (sym == compl_s) {
 1870       val d_arg = reg_derivative(first(args), ch);
 1871       if (reg_matches_all(d_arg))
 1872         return t;
 1873       if (d_arg == nil)
 1874         return cons(oneplus_s, cons(wild_s, nil));
 1875       if (d_arg == t)
 1876         return cons(zeroplus_s, cons(wild_s, nil));
 1877       return cons(sym, cons(d_arg, nil));
 1878     } else if (sym == or_s) {
 1879       val d_arg1 = reg_derivative(first(args), ch);
 1880       val d_arg2 = reg_derivative(second(args), ch);
 1881 
 1882       if (d_arg1 == t)
 1883         return d_arg2;
 1884 
 1885       if (d_arg2 == t)
 1886         return d_arg1;
 1887 
 1888       return reduce_or(cons(or_s, cons(d_arg1, cons(d_arg2, nil))));
 1889     } else if (sym == and_s) {
 1890       val d_arg1 = reg_derivative(first(args), ch);
 1891       val d_arg2 = nil;
 1892 
 1893       if (d_arg1 == t)
 1894         return t;
 1895 
 1896       d_arg2 = reg_derivative(second(args), ch);
 1897 
 1898       if (d_arg2 == t)
 1899         return t;
 1900 
 1901       return cons(and_s, cons(d_arg1, cons(d_arg2, nil)));
 1902     } else if (sym == set_s || sym == cset_s) {
 1903       uw_throwf(error_s, lit("uncompiled regex passed to reg_derivative"), nao);
 1904     } else {
 1905       uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
 1906     }
 1907   }
 1908 }
 1909 
 1910 static cnum dv_run(val regex, const wchar_t *str)
 1911 {
 1912   const wchar_t *last_accept_pos = 0, *ptr = str;
 1913 
 1914   for (; *ptr != 0; ptr++) {
 1915     wchar_t ch = *ptr;
 1916     val nullable = reg_nullable(regex);
 1917     val deriv = reg_derivative(regex, chr(ch));
 1918 
 1919     if (nullable)
 1920       last_accept_pos = ptr;
 1921 
 1922     if (deriv == t)
 1923       return last_accept_pos ? last_accept_pos - str : -1;
 1924   }
 1925 
 1926   if (reg_nullable(regex))
 1927     return ptr - str;
 1928   return last_accept_pos ? last_accept_pos - str : -1;
 1929 }
 1930 
 1931 static val reg_single_char_p(val exp)
 1932 {
 1933   if (chrp(exp))
 1934     return t;
 1935   if (consp(exp) && (car(exp) == set_s || car(exp) == cset_s))
 1936     return t;
 1937   if (exp == space_k || exp == word_char_k || exp == digit_k)
 1938     return t;
 1939   if (exp == cspace_k || exp == cword_char_k || exp == cdigit_k)
 1940     return t;
 1941   return nil;
 1942 }
 1943 
 1944 static val reg_compl_char_p(val exp)
 1945 {
 1946   if (consp(exp) && car(exp) == cset_s)
 1947     return t;
 1948   if (exp == cspace_k || exp == cword_char_k || exp == cdigit_k)
 1949     return t;
 1950   return nil;
 1951 }
 1952 
 1953 static val reg_optimize(val exp);
 1954 
 1955 static val invert_single(val exp)
 1956 {
 1957   if (chrp(exp))
 1958     return cons(cset_s, cons(exp, nil));
 1959   if (consp(exp)) {
 1960     if (car(exp) == set_s)
 1961       return cons(cset_s, cdr(exp));
 1962     if (car(exp) == cset_s)
 1963       return reg_optimize(cons(set_s, cdr(exp)));
 1964   }
 1965   if (exp == space_k)
 1966     return cspace_k;
 1967   if (exp == cspace_k)
 1968     return space_k;
 1969   if (exp == word_char_k)
 1970     return cword_char_k;
 1971   if (exp == cword_char_k)
 1972     return word_char_k;
 1973   if (exp == digit_k)
 1974     return cdigit_k;
 1975   if (exp == cdigit_k)
 1976     return digit_k;
 1977 
 1978   abort();
 1979 }
 1980 
 1981 static val reg_optimize(val exp)
 1982 {
 1983   if (atom(exp)) {
 1984     return exp;
 1985   } else if (reg_matches_all(exp)) {
 1986     return cons(zeroplus_s, cons(wild_s, nil));
 1987   } else {
 1988     val sym = first(exp);
 1989     val args = rest(exp);
 1990 
 1991     if (sym == set_s) {
 1992       if (!args)
 1993         return t;
 1994       if (!rest(args) && chrp(first(args)))
 1995         return first(args);
 1996       return exp;
 1997     } else if (sym == cset_s) {
 1998       return if3(rest(exp), exp, wild_s);
 1999     } else if (sym == compound_s) {
 2000       val xargs = mapcar(func_n1(reg_optimize), args);
 2001       while (rest(xargs)) {
 2002         if (reg_matches_all(first(xargs)) &&
 2003             reg_matches_all(second(xargs)))
 2004         {
 2005           pop(&xargs);
 2006           continue;
 2007         }
 2008         break;
 2009       }
 2010       if (!xargs)
 2011         return nil;
 2012       if (!cdr(xargs))
 2013         return car(xargs);
 2014       if (memq(t, xargs))
 2015         return t;
 2016       return cons(sym, xargs);
 2017     } else if (sym == zeroplus_s) {
 2018       val arg = reg_optimize(first(args));
 2019       if (consp(arg)) {
 2020         val arg2 = first(arg);
 2021         if (arg2 == zeroplus_s)
 2022           return arg;
 2023         if (arg2 == oneplus_s || arg2 == optional_s)
 2024           return cons(zeroplus_s, cdr(arg));
 2025       }
 2026       if (reg_matches_all(arg))
 2027         return arg;
 2028       return cons(sym, cons(arg, nil));
 2029     } else if (sym == oneplus_s) {
 2030       val arg = reg_optimize(first(args));
 2031       if (reg_matches_all(arg))
 2032         return cons(zeroplus_s, cons(wild_s, nil));
 2033       if (consp(arg)) {
 2034         val arg2 = first(arg);
 2035         if (arg2 == zeroplus_s || arg2 == oneplus_s)
 2036           return arg;
 2037         if (arg2 == optional_s)
 2038           return cons(zeroplus_s, cdr(arg));
 2039       }
 2040       return cons(sym, cons(arg, nil));
 2041     } else if (sym == optional_s) {
 2042       val arg = reg_optimize(first(args));
 2043       if (reg_matches_all(arg))
 2044         return arg;
 2045       if (consp(arg)) {
 2046         val arg2 = first(arg);
 2047         if (arg2 == zeroplus_s || arg2 == optional_s)
 2048           return arg;
 2049         if (arg2 == oneplus_s)
 2050           return cons(zeroplus_s, cdr(arg));
 2051       }
 2052       return cons(sym, cons(arg, nil));
 2053     } else if (sym == compl_s) {
 2054       val arg_unopt = car(args);
 2055 
 2056       if (consp(arg_unopt) && car(arg_unopt) == compl_s) {
 2057         return reg_optimize(cadr(arg_unopt));
 2058       } else {
 2059         val arg = reg_optimize(arg_unopt);
 2060         if (reg_matches_all(arg))
 2061           return t;
 2062         if (arg == nil)
 2063           return cons(oneplus_s, cons(wild_s, nil));
 2064         if (arg == t)
 2065           return cons(zeroplus_s, cons(wild_s, nil));
 2066         if (reg_single_char_p(arg))
 2067           return list(or_s,
 2068                       list(optional_s, invert_single(arg), nao),
 2069                       list(compound_s, wild_s,
 2070                            list(oneplus_s, wild_s, nao), nao), nao);
 2071         if (consp(arg)) {
 2072           val sym2 = first(arg);
 2073           if (sym2 == compound_s) {
 2074             val args2 = rest(arg);
 2075             if (cddr(args2) && !cdddr(args2)) {
 2076               if (reg_matches_all(first(args2)) &&
 2077                   reg_single_char_p(second(args2)) &&
 2078                   reg_matches_all(third(args2)))
 2079               {
 2080                 return cons(zeroplus_s,
 2081                             cons(invert_single(second(args2)), nil));
 2082               }
 2083             }
 2084 
 2085             if (cdr(args2) && !cddr(args2)) {
 2086               if (reg_matches_all(first(args2)) &&
 2087                   reg_single_char_p(second(args2)))
 2088               {
 2089                 return list(optional_s,
 2090                             list(compound_s,
 2091                                  cons(zeroplus_s, cons(wild_s, nil)),
 2092                                  invert_single(second(args2)), nao), nao);
 2093               }
 2094 
 2095               if (reg_single_char_p(first(args2)) &&
 2096                   reg_matches_all(second(args2))) {
 2097                 return list(optional_s,
 2098                             list(compound_s,
 2099                                  invert_single(first(args2)),
 2100                                  cons(zeroplus_s, cons(wild_s, nil)), nao), nao);
 2101               }
 2102             }
 2103           }
 2104         }
 2105         return cons(sym, cons(arg, nil));
 2106       }
 2107     } else if (sym == or_s) {
 2108       val arg1 = reg_optimize(pop(&args));
 2109       val arg2 = reg_optimize(pop(&args));
 2110 
 2111       if (arg1 == t || reg_matches_all(arg2))
 2112         return arg2;
 2113 
 2114       if (arg2 == t || reg_matches_all(arg1))
 2115         return arg1;
 2116 
 2117       if (arg1 == nil)
 2118         return cons(optional_s, cons(arg2, nil));
 2119 
 2120       if (arg2 == nil)
 2121         return cons(optional_s, cons(arg1, nil));
 2122 
 2123       if (reg_single_char_p(arg1) && reg_single_char_p(arg2)) {
 2124         if (!reg_compl_char_p(arg1) && !reg_compl_char_p(arg2)) {
 2125           if (atom(arg1) && atom(arg2))
 2126             return list(set_s, arg1, arg2, nao);
 2127           if (consp(arg1) && car(arg1) == set_s && atom(arg2))
 2128             return cons(set_s, cons(arg2, cdr(arg1)));
 2129           if (consp(arg2) && car(arg2) == set_s && atom(arg1))
 2130             return cons(set_s, cons(arg1, cdr(arg2)));
 2131           if (consp(arg1) && car(arg1) == set_s &&
 2132               consp(arg1) && car(arg1) == set_s)
 2133             return cons(set_s, append2(cdr(arg1), cdr(arg2)));
 2134         }
 2135         if (reg_compl_char_p(arg1) && reg_compl_char_p(arg2)) {
 2136           arg1 = invert_single(arg1);
 2137           arg2 = invert_single(arg2);
 2138 
 2139           if (atom(arg1) && atom(arg2))
 2140             return list(cset_s, arg1, arg2, nao);
 2141           if (consp(arg1) && car(arg1) == set_s && atom(arg2))
 2142             return cons(cset_s, cons(arg2, cdr(arg1)));
 2143           if (consp(arg2) && car(arg2) == set_s && atom(arg1))
 2144             return cons(cset_s, cons(arg1, cdr(arg2)));
 2145           if (consp(arg1) && car(arg1) == set_s &&
 2146               consp(arg1) && car(arg1) == set_s)
 2147             return cons(cset_s, append2(cdr(arg1), cdr(arg2)));
 2148         }
 2149       }
 2150 
 2151       return cons(sym, cons(arg1, cons(arg2, nil)));
 2152     } else if (sym == and_s) {
 2153       val arg1 = reg_optimize(pop(&args));
 2154       val arg2 = reg_optimize(pop(&args));
 2155 
 2156       if (arg1 == t || arg2 == t)
 2157         return t;
 2158 
 2159       if (arg1 == nil)
 2160         return null(reg_nullable(arg2));
 2161 
 2162       if (arg2 == nil)
 2163         return null(reg_nullable(arg1));
 2164 
 2165       if (reg_matches_all(arg1))
 2166         return arg2;
 2167 
 2168       if (reg_matches_all(arg2))
 2169         return arg2;
 2170 
 2171       return cons(sym, cons(arg1, cons(arg2, nil)));
 2172     } else {
 2173       uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
 2174     }
 2175   }
 2176 }
 2177 
 2178 static val regex_requires_dv(val exp)
 2179 {
 2180   if (atom(exp)) {
 2181     return nil;
 2182   } else {
 2183     val sym = first(exp);
 2184     val args = rest(exp);
 2185 
 2186     if (sym == set_s || sym == cset_s) {
 2187       return nil;
 2188     } else if (sym == compound_s) {
 2189       return some_satisfy(args, func_n1(regex_requires_dv), nil);
 2190     } else if (sym == zeroplus_s || sym == oneplus_s ||
 2191                sym == optional_s) {
 2192       return regex_requires_dv(first(args));
 2193     } else if (sym == compl_s) {
 2194       return t;
 2195     } else if (sym == or_s) {
 2196       return if2(regex_requires_dv(first(args)) ||
 2197                  regex_requires_dv(second(args)), t);
 2198     } else if (sym == and_s || sym == nongreedy_s) {
 2199       return t;
 2200     } else {
 2201       uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
 2202     }
 2203   }
 2204 }
 2205 
 2206 val regex_compile(val regex_sexp, val error_stream)
 2207 {
 2208   val regex_source = regex_sexp;
 2209 
 2210   if (stringp(regex_sexp)) {
 2211     regex_sexp = regex_parse(regex_sexp, default_null_arg(error_stream));
 2212     return if2(regex_sexp, regex_compile(regex_sexp, error_stream));
 2213   }
 2214 
 2215   regex_sexp = reg_optimize(reg_expand_nongreedy(reg_nary_to_bin(regex_sexp)));
 2216 
 2217   if (opt_derivative_regex || regex_requires_dv(regex_sexp)) {
 2218     regex_t *regex = coerce(regex_t *, chk_malloc(sizeof *regex));
 2219     val ret;
 2220     val dv = reg_compile_csets(regex_sexp);
 2221     regex->kind = REGEX_DV;
 2222     regex->nstates = 0;
 2223     regex->source = nil;
 2224     ret = cobj(coerce(mem_t *, regex), regex_s, &regex_obj_ops);
 2225     regex->r.dv = dv;
 2226     regex->source = regex_source;
 2227     return ret;
 2228   } else {
 2229     regex_t *regex = coerce(regex_t *, chk_malloc(sizeof *regex));
 2230     val ret;
 2231     regex->kind = REGEX_NFA;
 2232     regex->source = nil;
 2233     ret = cobj(coerce(mem_t *, regex), regex_s, &regex_obj_ops);
 2234     regex->r.nfa = nfa_optimize(nfa_compile_regex(regex_sexp));
 2235     regex->nstates = nfa_count_states(regex->r.nfa.start);
 2236     regex->source = regex_source;
 2237     return ret;
 2238   }
 2239 }
 2240 
 2241 val regexp(val obj)
 2242 {
 2243   return typeof(obj) == regex_s ? t : nil;
 2244 }
 2245 
 2246 val regex_source(val compiled_regex)
 2247 {
 2248   val self = lit("regex-source");
 2249   regex_t *regex = coerce(regex_t *,
 2250                           cobj_handle(self, compiled_regex, regex_s));
 2251   return regex->source;
 2252 }
 2253 
 2254 static void puts_clear_flag(val str, val stream, int *semi_flag)
 2255 {
 2256   *semi_flag = 0;
 2257   put_string(str, stream);
 2258 }
 2259 
 2260 static void putc_clear_flag(val ch, val stream, int *semi_flag)
 2261 {
 2262   *semi_flag = 0;
 2263   put_char(ch, stream);
 2264 }
 2265 
 2266 static void print_class_char(val ch, val first_p, val stream, int *semi_flag)
 2267 {
 2268   wchar_t c = c_chr(ch);
 2269   switch (c) {
 2270   case '^':
 2271     if (!first_p)
 2272       break;
 2273     /* fallthrough */
 2274   case '-': case '[': case ']':
 2275     putc_clear_flag(chr('\\'), stream, semi_flag);
 2276     put_char(ch, stream);
 2277     return;
 2278   }
 2279   out_str_char(c_chr(ch), stream, semi_flag, 1);
 2280 }
 2281 
 2282 static void print_rec(val exp, val stream, int *semi_flag);
 2283 
 2284 static void paren_print_rec(val exp, val stream, int *semi_flag)
 2285 {
 2286   putc_clear_flag(chr('('), stream, semi_flag);
 2287   print_rec(exp, stream, semi_flag);
 2288   putc_clear_flag(chr(')'), stream, semi_flag);
 2289 }
 2290 
 2291 static void print_rec(val exp, val stream, int *semi_flag)
 2292 {
 2293   if (exp == space_k) {
 2294     puts_clear_flag(lit("\\s"), stream, semi_flag);
 2295   } else if (exp == digit_k) {
 2296     puts_clear_flag(lit("\\d"), stream, semi_flag);
 2297   } else if (exp == word_char_k) {
 2298     puts_clear_flag(lit("\\w"), stream, semi_flag);
 2299   } else if (exp == cspace_k) {
 2300     puts_clear_flag(lit("\\S"), stream, semi_flag);
 2301   } else if (exp == cdigit_k) {
 2302     puts_clear_flag(lit("\\D"), stream, semi_flag);
 2303   } else if (exp == cword_char_k) {
 2304     puts_clear_flag(lit("\\W"), stream, semi_flag);
 2305   } else if (exp == wild_s) {
 2306     putc_clear_flag(chr('.'), stream, semi_flag);
 2307   } else if (chrp(exp)) {
 2308     wchar_t ch = c_chr(exp);
 2309     switch (ch) {
 2310     case '?': case '.': case '*': case '+':
 2311     case '(': case ')': case '|': case '~':
 2312     case '&': case '%': case '/':
 2313     case '[': case ']': case '\\':
 2314       putc_clear_flag(chr('\\'), stream, semi_flag);
 2315       put_char(exp, stream);
 2316       break;
 2317     default:
 2318       out_str_char(ch, stream, semi_flag, 1);
 2319     }
 2320   } else if (stringp(exp)) {
 2321     cnum i;
 2322     cnum l = c_num(length(exp));
 2323     for (i = 0; i < l; i++)
 2324       print_rec(chr_str(exp, num(i)), stream, semi_flag);
 2325   } else if (consp(exp)) {
 2326     val sym = first(exp);
 2327     val args = rest(exp);
 2328 
 2329     if (sym == set_s || sym == cset_s) {
 2330       putc_clear_flag(chr('['), stream, semi_flag);
 2331 
 2332       val first_p = t;
 2333 
 2334       if (sym == cset_s) {
 2335         put_char(chr('^'), stream);
 2336         first_p = nil;
 2337       }
 2338 
 2339       while (args) {
 2340         val arg = pop(&args);
 2341         if (consp(arg)) {
 2342           print_class_char(car(arg), first_p, stream, semi_flag);
 2343           putc_clear_flag(chr('-'), stream, semi_flag);
 2344           print_class_char(cdr(arg), nil, stream, semi_flag);
 2345         } else if (symbolp(arg)) {
 2346           print_rec(arg, stream, semi_flag);
 2347         } else {
 2348           print_class_char(arg, first_p, stream, semi_flag);
 2349         }
 2350         first_p = nil;
 2351       }
 2352       putc_clear_flag(chr(']'), stream, semi_flag);
 2353     } else if (sym == compound_s) {
 2354       for (; args; args = cdr(args)) {
 2355         val arg = car(args);
 2356         if (consp(arg) && (car(arg) == compl_s || car(arg) == and_s ||
 2357                            car(arg) == or_s || car(arg) == nongreedy_s))
 2358           paren_print_rec(arg, stream, semi_flag);
 2359         else
 2360           print_rec(arg, stream, semi_flag);
 2361       }
 2362     } else if (sym == zeroplus_s || sym == oneplus_s || sym == optional_s) {
 2363       val arg = pop(&args);
 2364       if (consp(arg) && car(arg) != set_s && car(arg) != cset_s)
 2365         paren_print_rec(arg, stream, semi_flag);
 2366       else
 2367         print_rec(arg, stream, semi_flag);
 2368       if (sym == zeroplus_s)
 2369         putc_clear_flag(chr('*'), stream, semi_flag);
 2370       else if (sym == oneplus_s)
 2371         putc_clear_flag(chr('+'), stream, semi_flag);
 2372       else
 2373         putc_clear_flag(chr('?'), stream, semi_flag);
 2374     } else if (sym == compl_s) {
 2375       val arg = pop(&args);
 2376       putc_clear_flag(chr('~'), stream, semi_flag);
 2377       if (consp(arg) && (car(arg) == or_s || car(arg) == and_s))
 2378         paren_print_rec(arg, stream, semi_flag);
 2379       else
 2380         print_rec(arg, stream, semi_flag);
 2381     } else if (sym == and_s) {
 2382       val arg1 = pop(&args);
 2383       val arg2 = pop(&args);
 2384       if (consp(arg1) && car(arg1) == or_s)
 2385         paren_print_rec(arg1, stream, semi_flag);
 2386       else
 2387         print_rec(arg1, stream, semi_flag);
 2388       putc_clear_flag(chr('&'), stream, semi_flag);
 2389       if (consp(arg2) && car(arg2) == or_s)
 2390         paren_print_rec(arg2, stream, semi_flag);
 2391       else
 2392         print_rec(arg2, stream, semi_flag);
 2393     } else if (sym == or_s) {
 2394       print_rec(pop(&args), stream, semi_flag);
 2395       putc_clear_flag(chr('|'), stream, semi_flag);
 2396       print_rec(pop(&args), stream, semi_flag);
 2397     } else if (sym == nongreedy_s) {
 2398       val arg1 = pop(&args);
 2399       val arg2 = pop(&args);
 2400       if (consp(arg1) && car(arg1) != set_s && car(arg1) != cset_s)
 2401         paren_print_rec(arg1, stream, semi_flag);
 2402       else
 2403         print_rec(arg1, stream, semi_flag);
 2404       putc_clear_flag(chr('%'), stream, semi_flag);
 2405       if (consp(arg2) && (car(arg2) == and_s && car(arg2) == or_s))
 2406         paren_print_rec(arg2, stream, semi_flag);
 2407       else
 2408         print_rec(arg2, stream, semi_flag);
 2409     } else {
 2410       uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
 2411     }
 2412   } else if (exp == t) {
 2413     puts_clear_flag(lit("[]"), stream, semi_flag);
 2414   } else if (exp != nil) {
 2415     uw_throwf(error_s, lit("bad object in regex syntax: ~s"), exp, nao);
 2416   }
 2417 }
 2418 
 2419 static void regex_print(val obj, val stream, val pretty, struct strm_ctx *ctx)
 2420 {
 2421   val self = lit("regex-print");
 2422   regex_t *regex = coerce(regex_t *, cobj_handle(self, obj, regex_s));
 2423   int semi_flag = 0;
 2424 
 2425   (void) pretty;
 2426   (void) ctx;
 2427 
 2428   put_string(lit("#/"), stream);
 2429   print_rec(regex->source, stream, &semi_flag);
 2430   put_char(chr('/'), stream);
 2431 }
 2432 
 2433 static cnum regex_run(val compiled_regex, const wchar_t *str)
 2434 {
 2435   val self = lit("regex-run");
 2436   regex_t *regex = coerce(regex_t *, cobj_handle(self, compiled_regex, regex_s));
 2437 
 2438   return if3(regex->kind == REGEX_DV,
 2439              dv_run(regex->r.dv, str),
 2440              nfa_run(regex->r.nfa, regex->nstates, str));
 2441 }
 2442 
 2443 /*
 2444  * Regex machine: represents the logic of the regex_run function as state
 2445  * machine object which can be fed one character at a time.
 2446  */
 2447 
 2448 static void regex_machine_reset(regex_machine_t *regm)
 2449 {
 2450   int accept = 0;
 2451 
 2452   regm->n.last_accept_pos = -1;
 2453   regm->n.count = 0;
 2454 
 2455   if (regm->n.is_nfa) {
 2456     nfa_state_t *s = regm->n.nfa.start;
 2457 
 2458     if (s) {
 2459       regm->n.visited = s->a.visited + 1;
 2460       nfa_handle_wraparound(s, &regm->n.visited);
 2461       regm->n.set[0] = s;
 2462       regm->n.nclos = nfa_closure(regm->n.stack, regm->n.set, 1,
 2463                                   regm->n.nstates,
 2464                                   regm->n.visited, &accept);
 2465       s->a.visited = regm->n.visited;
 2466     } else {
 2467       regm->n.nclos = 0;
 2468     }
 2469   } else {
 2470     regm->d.deriv = regm->d.regex;
 2471     accept = (reg_nullable(regm->d.regex) != nil);
 2472   }
 2473 
 2474   if (accept)
 2475     regm->n.last_accept_pos = regm->n.count;
 2476 }
 2477 
 2478 static void regex_machine_init(val self, regex_machine_t *regm, val reg)
 2479 {
 2480   regex_t *regex = coerce(regex_t *, cobj_handle(self, reg, regex_s));
 2481 
 2482   if (regex->kind == REGEX_DV) {
 2483     regm->n.is_nfa = 0;
 2484     regm->d.regex = regex->r.dv;
 2485   } else {
 2486     regm->n.is_nfa = 1;
 2487     regm->n.nfa = regex->r.nfa;
 2488     regm->n.nstates = regex->nstates;
 2489     regm->n.visited = 0;
 2490     regm->n.set = coerce(nfa_state_t **,
 2491                           chk_malloc(regex->nstates * sizeof *regm->n.set));
 2492     regm->n.stack = coerce(nfa_state_t **,
 2493                            chk_malloc(regex->nstates * sizeof *regm->n.stack));
 2494   }
 2495 
 2496   regex_machine_reset(regm);
 2497 }
 2498 
 2499 static void regex_machine_cleanup(regex_machine_t *regm)
 2500 {
 2501   if (regm->n.is_nfa) {
 2502     free(regm->n.stack);
 2503     free(regm->n.set);
 2504     regm->n.stack = 0;
 2505     regm->n.set = 0;
 2506     regm->n.nfa.start = 0;
 2507     regm->n.nfa.accept = 0;
 2508   }
 2509 }
 2510 
 2511 static regm_result_t regex_machine_infer_init_state(regex_machine_t *regm)
 2512 {
 2513   if (regm->n.is_nfa)
 2514     return (regm->n.nclos != 0) ? REGM_INCOMPLETE : REGM_FAIL;
 2515   else
 2516     return (regm->d.deriv != t) ? REGM_INCOMPLETE : REGM_FAIL;
 2517 }
 2518 
 2519 static regm_result_t regex_machine_feed(regex_machine_t *regm, wchar_t ch)
 2520 {
 2521   int accept = 0;
 2522 
 2523   if (regm->n.is_nfa) {
 2524     nfa_handle_wraparound(regm->n.nfa.start, &regm->n.visited);
 2525 
 2526     if (ch != 0) {
 2527       regm->n.count++;
 2528 
 2529       regm->n.nclos = nfa_move_closure(regm->n.stack,
 2530                                        regm->n.set, regm->n.nclos,
 2531                                        regm->n.nstates, ch, ++regm->n.visited,
 2532                                        &accept);
 2533 
 2534       if (regm->n.nfa.start)
 2535         regm->n.nfa.start->a.visited = regm->n.visited;
 2536 
 2537       if (accept) {
 2538         regm->n.last_accept_pos = regm->n.count;
 2539         return REGM_MATCH;
 2540       }
 2541 
 2542       return (regm->n.nclos != 0) ? REGM_INCOMPLETE : REGM_FAIL;
 2543     }
 2544   } else {
 2545     val accept = nil;
 2546 
 2547     if (ch != 0) {
 2548       regm->d.count++;
 2549       regm->d.deriv = reg_derivative(regm->d.deriv, chr(ch));
 2550 
 2551       if ((accept = reg_nullable(regm->d.deriv))) {
 2552         regm->d.last_accept_pos = regm->d.count;
 2553         return REGM_MATCH;
 2554       }
 2555 
 2556       return (regm->d.deriv != t) ? REGM_INCOMPLETE : REGM_FAIL;
 2557     }
 2558   }
 2559 
 2560   /* Reached if the null character is
 2561      consumed, or NFA/derivation hit a transition dead end. */
 2562 
 2563   if (regm->n.last_accept_pos == regm->n.count)
 2564     return REGM_MATCH;
 2565   if (regm->n.last_accept_pos == -1)
 2566     return REGM_FAIL;
 2567   return REGM_INCOMPLETE;
 2568 }
 2569 
 2570 val search_regex(val haystack, val needle_regex, val start,
 2571                  val from_end)
 2572 {
 2573   val self = lit("search-regex");
 2574   val slen = nil;
 2575   start = default_arg(start, zero);
 2576   from_end = default_null_arg(from_end);
 2577 
 2578   if (minusp(start)) {
 2579     slen = length_str(haystack);
 2580     start = plus(start, slen);
 2581     if (minusp(start))
 2582       start = zero;
 2583   }
 2584 
 2585   if (from_end) {
 2586     cnum i;
 2587     cnum s = c_num(start);
 2588     const wchar_t *h = c_str(haystack);
 2589 
 2590     slen = (slen ? slen : length_str(haystack));
 2591 
 2592     if (regex_run(needle_regex, L"") >= 0)
 2593       return cons(slen, zero);
 2594 
 2595     for (i = c_num(slen) - 1; i >= s; i--) {
 2596       cnum span = regex_run(needle_regex, h + i);
 2597       if (span >= 0)
 2598         return cons(num(i), num(span));
 2599     }
 2600 
 2601     gc_hint(haystack);
 2602   } else {
 2603     regex_machine_t regm;
 2604     val i, pos = start, retval;
 2605     regm_result_t last_res = REGM_INCOMPLETE;
 2606 
 2607     if (length_str_lt(haystack, pos))
 2608       return nil;
 2609 
 2610     regex_machine_init(self, &regm, needle_regex);
 2611 
 2612 again:
 2613     for (i = pos; length_str_gt(haystack, i); i = plus(i, one)) {
 2614       last_res = regex_machine_feed(&regm, c_chr(chr_str(haystack, i)));
 2615 
 2616       if (last_res == REGM_FAIL) {
 2617         last_res = regex_machine_feed(&regm, 0);
 2618         if (last_res == REGM_FAIL) {
 2619           regex_machine_reset(&regm);
 2620           pos = plus(pos, one);
 2621           goto again;
 2622         }
 2623         break;
 2624       }
 2625     }
 2626 
 2627     last_res = regex_machine_feed(&regm, 0);
 2628 
 2629     switch (last_res) {
 2630     case REGM_INCOMPLETE:
 2631     case REGM_MATCH:
 2632       retval = cons(pos, num(regex_machine_match_span(&regm)));
 2633       regex_machine_cleanup(&regm);
 2634       return retval;
 2635     case REGM_FAIL:
 2636       regex_machine_cleanup(&regm);
 2637       return nil;
 2638     }
 2639   }
 2640 
 2641   return nil;
 2642 }
 2643 
 2644 val range_regex(val haystack, val needle_regex, val start,
 2645                 val from_end)
 2646 {
 2647   val result = search_regex(haystack, needle_regex, start, from_end);
 2648 
 2649   if (result) {
 2650     cons_bind (pos, len, result);
 2651     return rcons(pos, plus(pos, len));
 2652   }
 2653 
 2654   return result;
 2655 }
 2656 
 2657 val range_regex_all(val haystack, val needle_regex, val start, val end)
 2658 {
 2659   list_collect_decl (out, ptail);
 2660   val slen = length_str(haystack);
 2661 
 2662   if (null_or_missing_p(start)) {
 2663     start = zero;
 2664   } else if (minusp(start)) {
 2665     start = minus(start, slen);
 2666     if (minusp(start))
 2667       start = zero;
 2668   }
 2669 
 2670   if (null_or_missing_p(end)) {
 2671     end = slen;
 2672   } else if (minusp(end)) {
 2673     end = minus(start, slen);
 2674     if (minusp(end))
 2675       return nil;
 2676   }
 2677 
 2678   for (;;) {
 2679     val range = range_regex(haystack, needle_regex, start, nil);
 2680 
 2681     if (!range)
 2682       break;
 2683 
 2684     ptail = list_collect(ptail, range);
 2685 
 2686     {
 2687       val rt = to(range);
 2688       start = if3(eql(start, rt), succ(start), rt);
 2689     }
 2690   }
 2691 
 2692   return out;
 2693 }
 2694 
 2695 val match_regex(val str, val reg, val pos)
 2696 {
 2697   val self = lit("match-regex");
 2698   regex_machine_t regm;
 2699   val i, retval;
 2700   regm_result_t last_res = REGM_INCOMPLETE;
 2701 
 2702   if (null_or_missing_p(pos)) {
 2703     pos = zero;
 2704   } else if (minusp(pos)) {
 2705     pos = plus(pos, length_str(str));
 2706     if (minusp(pos))
 2707       return nil;
 2708   } else if (length_str_lt(str, pos)) {
 2709     return nil;
 2710   }
 2711 
 2712   regex_machine_init(self, &regm, reg);
 2713 
 2714   for (i = pos; length_str_gt(str, i); i = plus(i, one)) {
 2715     last_res = regex_machine_feed(&regm, c_chr(chr_str(str, i)));
 2716     if (last_res == REGM_FAIL)
 2717       break;
 2718   }
 2719 
 2720   last_res = regex_machine_feed(&regm, 0);
 2721 
 2722   switch (last_res) {
 2723   case REGM_INCOMPLETE:
 2724   case REGM_MATCH:
 2725     retval = plus(pos, num(regex_machine_match_span(&regm)));
 2726     regex_machine_cleanup(&regm);
 2727     return retval;
 2728   case REGM_FAIL:
 2729     regex_machine_cleanup(&regm);
 2730     return nil;
 2731   }
 2732 
 2733   return nil;
 2734 }
 2735 
 2736 val match_regex_len(val str, val regex, val pos)
 2737 {
 2738   if (null_or_missing_p(pos)) {
 2739     return match_regex(str, regex, pos);
 2740   } else {
 2741     val new_pos = match_regex(str, regex, pos);
 2742     return if2(new_pos, minus(new_pos, pos));
 2743   }
 2744 }
 2745 
 2746 static val match_regex_right_old(val str, val regex, val end)
 2747 {
 2748   val pos = zero;
 2749   val slen = length(str);
 2750 
 2751   if (null_or_missing_p(end) || gt(end, slen))
 2752     end = slen;
 2753   else if (minusp(end))
 2754     end = plus(end, slen);
 2755 
 2756   while (le(pos, end)) {
 2757     cons_bind (from, len, search_regex(str, regex, pos, nil));
 2758 
 2759     if (!from)
 2760       return nil;
 2761 
 2762     if (eql(plus(from, len), end))
 2763       return len;
 2764 
 2765     pos = plus(pos, one);
 2766   }
 2767 
 2768   return nil;
 2769 }
 2770 
 2771 val match_regex_right(val str, val regex, val end)
 2772 {
 2773   val self = lit("match-regex-right");
 2774   val pos = zero;
 2775   val len = length(str);
 2776 
 2777   if (null_or_missing_p(end)) {
 2778     end = len;
 2779   } else if (minusp(end)) {
 2780     end = plus(end, len);
 2781     if (minusp(end))
 2782       return nil;
 2783   } else if (gt(end, len)) {
 2784     return nil;
 2785   }
 2786 
 2787   while (le(pos, end)) {
 2788     regex_machine_t regm;
 2789     val i ;
 2790     regm_result_t last_res = REGM_INCOMPLETE;
 2791 
 2792     regex_machine_init(self, &regm, regex);
 2793 
 2794     for (i = pos; lt(i, end); i = plus(i, one)) {
 2795       last_res = regex_machine_feed(&regm, c_chr(chr_str(str, i)));
 2796       if (last_res == REGM_FAIL)
 2797         break;
 2798     }
 2799 
 2800     last_res = regex_machine_feed(&regm, 0);
 2801 
 2802     switch (last_res) {
 2803     case REGM_MATCH:
 2804       regex_machine_cleanup(&regm);
 2805       return minus(end, pos);
 2806     case REGM_INCOMPLETE:
 2807     case REGM_FAIL:
 2808       regex_machine_cleanup(&regm);
 2809       break;
 2810     }
 2811 
 2812     pos = succ(pos);
 2813   }
 2814 
 2815   return nil;
 2816 }
 2817 
 2818 val regex_prefix_match(val reg, val str, val pos)
 2819 {
 2820   val self = lit("regex-prefix-match");
 2821   regex_machine_t regm;
 2822   val i;
 2823   regm_result_t last_res;
 2824 
 2825   if (null_or_missing_p(pos)) {
 2826     pos = zero;
 2827   } else if (minusp(pos)) {
 2828     pos = plus(pos, length_str(str));
 2829     if (minusp(pos))
 2830       return nil;
 2831   } else if (length_str_lt(str, pos)) {
 2832     return nil;
 2833   }
 2834 
 2835   regex_machine_init(self, &regm, reg);
 2836 
 2837   last_res = regex_machine_infer_init_state(&regm);
 2838 
 2839   for (i = pos; length_str_gt(str, i); i = plus(i, one)) {
 2840     last_res = regex_machine_feed(&regm, c_chr(chr_str(str, i)));
 2841     if (last_res == REGM_FAIL)
 2842       break;
 2843   }
 2844 
 2845   regex_machine_cleanup(&regm);
 2846 
 2847   switch (last_res) {
 2848   case REGM_INCOMPLETE:
 2849   case REGM_MATCH:
 2850     return t;
 2851   default:
 2852   case REGM_FAIL:
 2853     return nil;
 2854   }
 2855 }
 2856 
 2857 val regsub(val regex, val repl, val str)
 2858 {
 2859   val isfunc = functionp(repl);
 2860 
 2861   if (functionp(regex)) {
 2862     val range = funcall1(regex, str);
 2863 
 2864     if (!range)
 2865       return str;
 2866 
 2867     {
 2868       val rf = from(range);
 2869       val rt = to(range);
 2870 
 2871       return replace_str(str, if3(isfunc,
 2872                                   funcall1(repl, sub_str(str, rf, rt)),
 2873                                   repl),
 2874                          rf, rt);
 2875     }
 2876   } else {
 2877     list_collect_decl (out, ptail);
 2878     val pos = zero;
 2879 
 2880     do {
 2881       cons_bind (find, len, search_regex(str, regex, pos, nil));
 2882       if (!find) {
 2883         if (pos == zero)
 2884           return str;
 2885         ptail = list_collect(ptail, sub_str(str, pos, nil));
 2886         break;
 2887       }
 2888       ptail = list_collect(ptail, sub_str(str, pos, find));
 2889       ptail = list_collect(ptail, if3(isfunc,
 2890                                       funcall1(repl, sub_str(str, find,
 2891                                                              plus(find, len))),
 2892                                       repl));
 2893       if (len == zero && eql(find, pos)) {
 2894         if (lt(pos, length_str(str))) {
 2895           ptail = list_collect(ptail, chr_str(str, pos));
 2896           pos = plus(pos, one);
 2897         }
 2898       } else {
 2899         pos = plus(find, len);
 2900       }
 2901     } while (lt(pos, length_str(str)));
 2902 
 2903     return cat_str(out, nil);
 2904   }
 2905 }
 2906 
 2907 val search_regst(val haystack, val needle_regex, val start_num, val from_end)
 2908 {
 2909   val range = range_regex(haystack, needle_regex, start_num, from_end);
 2910   return if2(range, sub_str(haystack, from(range), to(range)));
 2911 }
 2912 
 2913 val match_regst(val str, val regex, val pos_in)
 2914 {
 2915   val pos = if3(null_or_missing_p(pos_in),
 2916                 zero,
 2917                 if3(minusp(pos_in),
 2918                     plus(pos_in, length_str(str)), pos_in));
 2919   val new_pos = if3(minusp(pos), nil, match_regex(str, regex, pos));
 2920   return if2(new_pos, sub_str(str, pos, new_pos));
 2921 }
 2922 
 2923 static val match_regst_right_old(val str, val regex, val end)
 2924 {
 2925   val len = match_regex_right_old(str, regex, end);
 2926   return if2(len, if3(null_or_missing_p(end),
 2927                       sub_str(str, neg(len), zero),
 2928                       sub_str(str, minus(end, len), end)));
 2929 }
 2930 
 2931 val match_regst_right(val str, val regex, val end)
 2932 {
 2933   val len = match_regex_right(str, regex, end);
 2934   return if2(len, if3(null_or_missing_p(end),
 2935                       sub_str(str, neg(len), zero),
 2936                       sub_str(str, minus(end, len), end)));
 2937 }
 2938 static val do_match_full(val regex, val str)
 2939 {
 2940    return if2(eql(match_regex(str, regex, zero), length_str(str)), str);
 2941 }
 2942 
 2943 static val do_match_full_offs(val env, val str)
 2944 {
 2945   cons_bind (regex, pos_in, env);
 2946   val len = length_str(str);
 2947   val pos = if3(minusp(pos_in), plus(pos_in, len), pos_in);
 2948   return if3(minusp(pos),
 2949              nil,
 2950              if2(eql(match_regex(str, regex, pos), len),
 2951                  sub_str(str, pos, t)));
 2952 }
 2953 
 2954 val regex_match_full_fun(val regex, val pos)
 2955 {
 2956   if (null_or_missing_p(pos))
 2957     return func_f1(regex, do_match_full);
 2958   return func_f1(cons(regex, pos), do_match_full_offs);
 2959 }
 2960 
 2961 static val do_match_left(val regex, val str)
 2962 {
 2963   return match_regst(str, regex, zero);
 2964 }
 2965 
 2966 static val do_match_left_offs(val env, val str)
 2967 {
 2968   cons_bind (regex, pos, env);
 2969   return match_regst(str, regex, pos);
 2970 }
 2971 
 2972 val regex_match_left_fun(val regex, val pos)
 2973 {
 2974   if (null_or_missing_p(pos))
 2975     return func_f1(regex, do_match_left);
 2976   return func_f1(cons(regex, pos), do_match_left_offs);
 2977 }
 2978 
 2979 static val do_match_right(val regex, val str)
 2980 {
 2981   return match_regst_right(str, regex, nil);
 2982 }
 2983 
 2984 static val do_match_right_offs(val env, val str)
 2985 {
 2986   cons_bind (regex, end, env);
 2987   return match_regst_right(str, regex, end);
 2988 }
 2989 
 2990 val regex_match_right_fun(val regex, val end)
 2991 {
 2992   if (null_or_missing_p(end))
 2993     return func_f1(regex, do_match_right);
 2994   return func_f1(cons(regex, end), do_match_right_offs);
 2995 }
 2996 
 2997 val regex_match_full(val regex, val arg1, val arg2)
 2998 {
 2999   if (null_or_missing_p(arg2)) {
 3000     val str = arg1;
 3001     return if2(eql(match_regex(arg1, regex, arg2), length_str(str)), str);
 3002   } else {
 3003     val str = arg2;
 3004     val len = length_str(str);
 3005     val pos = if3(minusp(arg1), plus(len, arg1), arg1);
 3006     return if3(minusp(pos),
 3007                nil,
 3008                if2(eql(match_regex(str, regex, pos), len),
 3009                    sub_str(str, pos, t)));
 3010   }
 3011 }
 3012 
 3013 val regex_match_left(val regex, val arg1, val arg2)
 3014 {
 3015   if (null_or_missing_p(arg2))
 3016     return match_regst(arg1, regex, arg2);
 3017   return match_regst(arg2, regex, arg1);
 3018 }
 3019 
 3020 val regex_match_right(val regex, val arg1, val arg2)
 3021 {
 3022   if (null_or_missing_p(arg2))
 3023     return match_regst_right(arg1, regex, arg2);
 3024   return match_regst_right(arg2, regex, arg1);
 3025 }
 3026 
 3027 val regex_range_full(val regex, val arg1, val arg2)
 3028 {
 3029   if (null_or_missing_p(arg2)) {
 3030     val str = arg1;
 3031     val len = length_str(str);
 3032     return if2(eql(match_regex(str, regex, zero), len), rcons(zero, len));
 3033   } else {
 3034     val str = arg2;
 3035     val len = length_str(str);
 3036     val pos = if3(minusp(arg1), plus(len, arg1), arg1);
 3037     return if3(minusp(pos), nil,
 3038                if2(eql(match_regex(str, regex, pos), len), rcons(pos, len)));
 3039   }
 3040 }
 3041 
 3042 val regex_range_left(val regex, val arg1, val arg2)
 3043 {
 3044   if (null_or_missing_p(arg2)) {
 3045     val len = match_regex(arg1, regex, arg2);
 3046     return if2(len, rcons(zero, len));
 3047   } else {
 3048     val pos = if3(minusp(arg1), plus(arg1, length_str(arg2)), arg1);
 3049     val new_pos = if3(minusp(pos), nil, match_regex(arg2, regex, pos));
 3050     return if2(new_pos, rcons(pos, new_pos));
 3051   }
 3052 }
 3053 
 3054 val regex_range_right(val regex, val arg1, val arg2)
 3055 {
 3056   if (null_or_missing_p(arg2)) {
 3057     val len = match_regex_right(arg1, regex, arg2);
 3058     if (len) {
 3059       val slen = length_str(arg1);
 3060       return rcons(minus(slen, len), slen);
 3061     } else {
 3062       return nil;
 3063     }
 3064   } else {
 3065     val end = if3(minusp(arg1), plus(arg1, length_str(arg2)), arg1);
 3066     val len = match_regex_right(arg2, regex, end);
 3067     return if2(len, rcons(minus(end, len), end));
 3068   }
 3069 }
 3070 
 3071 val regex_range_search(val regex, val arg1, val arg2, val arg3)
 3072 {
 3073   if (missingp(arg2)) {
 3074     return range_regex(arg1, regex, zero, nil);
 3075   } else if (missingp(arg3)) {
 3076     return range_regex(arg2, regex, arg1, nil);
 3077   } else {
 3078     return range_regex(arg3, regex, arg1, arg2);
 3079   }
 3080 }
 3081 
 3082 val regex_range_all(val regex, val arg1, val arg2, val arg3)
 3083 {
 3084   if (missingp(arg2)) {
 3085     return range_regex_all(arg1, regex, zero, nil);
 3086   } else if (missingp(arg3)) {
 3087     return range_regex_all(arg2, regex, arg1, nil);
 3088   } else {
 3089     return range_regex_all(arg3, regex, arg1, arg2);
 3090   }
 3091 }
 3092 
 3093 val regex_range_full_fun(val regex, val pos)
 3094 {
 3095   return curry_123_3(func_n3(regex_range_full),
 3096                      regex, default_arg(pos, zero));
 3097 }
 3098 
 3099 val regex_range_left_fun(val regex, val pos)
 3100 {
 3101   return curry_123_3(func_n3(regex_range_left),
 3102                      regex, default_arg(pos, zero));
 3103 }
 3104 
 3105 val regex_range_right_fun(val regex, val end)
 3106 {
 3107   if (null_or_missing_p(end))
 3108     return curry_123_2(func_n3(regex_range_right), regex, end);
 3109   return curry_123_3(func_n3(regex_range_left), regex, end);
 3110 }
 3111 
 3112 val regex_range_search_fun(val regex, val start, val from_end)
 3113 {
 3114   return curry_1234_1(func_n4(range_regex), regex, start, from_end);
 3115 }
 3116 
 3117 static val scan_until_common(val self, val regex, val stream_in,
 3118                              val include_match_in, val accum)
 3119 {
 3120   regex_machine_t regm;
 3121   val out = nil;
 3122   u64_t count = 0;
 3123   val stack = nil;
 3124   val match = nil;
 3125   val stream = default_arg(stream_in, std_input);
 3126   val include_match = default_null_arg(include_match_in);
 3127 
 3128   regex_machine_init(self, &regm, regex);
 3129 
 3130   for (;;) {
 3131     val ch = get_char(stream);
 3132 
 3133     if (!ch) {
 3134       switch (regex_machine_feed(&regm, 0)) {
 3135       case REGM_FAIL:
 3136       case REGM_INCOMPLETE:
 3137         if (match)
 3138           goto out_match;
 3139         break;
 3140       case REGM_MATCH:
 3141         goto out_match;
 3142       }
 3143       break;
 3144     }
 3145 
 3146     switch (regex_machine_feed(&regm, c_chr(ch))) {
 3147     case REGM_FAIL:
 3148       unget_char(ch, stream);
 3149 
 3150       if (match)
 3151         goto out_match;
 3152 
 3153       while (stack)
 3154         unget_char(rcyc_pop(&stack), stream);
 3155 
 3156       ch = get_char(stream);
 3157 
 3158       if (accum) {
 3159         if (!out)
 3160           out = mkstring(one, ch);
 3161         else
 3162           string_extend(out, ch);
 3163       } else {
 3164         count++;
 3165       }
 3166 
 3167       regex_machine_reset(&regm);
 3168       continue;
 3169     case REGM_MATCH:
 3170       push(ch, &stack);
 3171       match = stack;
 3172       continue;
 3173     case REGM_INCOMPLETE:
 3174       push(ch, &stack);
 3175       continue;
 3176     }
 3177 
 3178     break;
 3179   }
 3180 
 3181   if (nil) {
 3182 out_match:
 3183     while (stack && stack != match)
 3184       unget_char(rcyc_pop(&stack), stream);
 3185     if (accum && !out)
 3186       out = null_string;
 3187     if (include_match)
 3188       out = cat_str(cons(out, stack = nreverse(stack)), nil);
 3189     if (!accum && match) {
 3190       val c = unum_64(count);
 3191       out = if3(include_match, cons(c, out), c);
 3192     }
 3193   }
 3194 
 3195   regex_machine_cleanup(&regm);
 3196 
 3197   while (stack)
 3198     rcyc_pop(&stack);
 3199 
 3200   return out;
 3201 }
 3202 
 3203 val read_until_match(val regex, val stream_in, val include_match_in)
 3204 {
 3205   return scan_until_common(lit("read-until-match"), regex, stream_in,
 3206                            include_match_in, t);
 3207 }
 3208 
 3209 val scan_until_match(val regex, val stream_in)
 3210 {
 3211   return scan_until_common(lit("scan-until-match"), regex, stream_in, t, nil);
 3212 }
 3213 
 3214 val count_until_match(val regex, val stream_in)
 3215 {
 3216   return scan_until_common(lit("count-until-match"), regex, stream_in, nil, nil);
 3217 }
 3218 
 3219 static char_set_t *create_wide_cs(void)
 3220 {
 3221 #ifdef FULL_UNICODE
 3222   chset_type_t cst = CHSET_XLARGE;
 3223 #else
 3224   chset_type_t cst = CHSET_LARGE;
 3225 #endif
 3226 
 3227   char_set_t *cs = char_set_create(cst, 0, 1);
 3228 
 3229   char_set_add_range(cs, 0x1100, 0x115F);
 3230   char_set_add_range(cs, 0x2329, 0x232A);
 3231   char_set_add_range(cs, 0x2E80, 0x2E99);
 3232   char_set_add_range(cs, 0x2E9B, 0x2EF3);
 3233   char_set_add_range(cs, 0x2F00, 0x2FD5);
 3234   char_set_add_range(cs, 0x2FF0, 0x2FFB);
 3235   char_set_add_range(cs, 0x3000, 0x303E);
 3236   char_set_add_range(cs, 0x3000, 0x303E);
 3237   char_set_add_range(cs, 0x3041, 0x3096);
 3238   char_set_add_range(cs, 0x3099, 0x30FF);
 3239   char_set_add_range(cs, 0x3105, 0x312D);
 3240   char_set_add_range(cs, 0x3131, 0x318E);
 3241   char_set_add_range(cs, 0x3190, 0x31BA);
 3242   char_set_add_range(cs, 0x31C0, 0x31E3);
 3243   char_set_add_range(cs, 0x31F0, 0x321E);
 3244   char_set_add_range(cs, 0x3220, 0x3247);
 3245   char_set_add_range(cs, 0x3250, 0x32FE);
 3246   char_set_add_range(cs, 0x3300, 0x4DB5);
 3247   char_set_add_range(cs, 0x4E00, 0x9FFF);
 3248   char_set_add_range(cs, 0xA000, 0xA48C);
 3249   char_set_add_range(cs, 0xA490, 0xA4C6);
 3250   char_set_add_range(cs, 0xA960, 0xA97C);
 3251   char_set_add_range(cs, 0xAC00, 0xD7A3);
 3252   char_set_add_range(cs, 0xF900, 0xFAFF);
 3253   char_set_add_range(cs, 0xFE10, 0xFE19);
 3254   char_set_add_range(cs, 0xFE30, 0xFE52);
 3255   char_set_add_range(cs, 0xFE54, 0xFE6B);
 3256   char_set_add_range(cs, 0xFF01, 0xFF60);
 3257   char_set_add_range(cs, 0xFFE0, 0xFFE6);
 3258 
 3259 #ifdef FULL_UNICODE
 3260   char_set_add_range(cs, 0x1B000, 0x1B001);
 3261   char_set_add_range(cs, 0x1F200, 0x1F202);
 3262   char_set_add_range(cs, 0x1F210, 0x1F23A);
 3263   char_set_add_range(cs, 0x1F240, 0x1F248);
 3264   char_set_add_range(cs, 0x1F250, 0x1F251);
 3265   char_set_add_range(cs, 0x20000, 0x2FFFD);
 3266   char_set_add_range(cs, 0x30000, 0x3FFFD);
 3267 #endif
 3268 
 3269   return cs;
 3270 }
 3271 
 3272 static char_set_t *wide_cs;
 3273 
 3274 int wide_display_char_p(wchar_t ch)
 3275 {
 3276   if (ch < 0x1100)
 3277     return 0;
 3278 
 3279   if (!wide_cs)
 3280     wide_cs = create_wide_cs();
 3281 
 3282   return char_set_contains(wide_cs, ch);
 3283 }
 3284 
 3285 val space_k, digit_k, word_char_k;
 3286 val cspace_k, cdigit_k, cword_char_k;
 3287 
 3288 void regex_init(void)
 3289 {
 3290   space_k = intern(lit("space"), keyword_package);
 3291   digit_k = intern(lit("digit"), keyword_package);
 3292   word_char_k = intern(lit("word-char"), keyword_package);
 3293   cspace_k = intern(lit("cspace"), keyword_package);
 3294   cdigit_k = intern(lit("cdigit"), keyword_package);
 3295   cword_char_k = intern(lit("cword-char"), keyword_package);
 3296 
 3297   reg_fun(intern(lit("regex-compile"), user_package), func_n2o(regex_compile, 1));
 3298   reg_fun(intern(lit("regexp"), user_package), func_n1(regexp));
 3299   reg_fun(intern(lit("regex-source"), user_package), func_n1(regex_source));
 3300   reg_fun(intern(lit("search-regex"), user_package), func_n4o(search_regex, 2));
 3301   reg_fun(intern(lit("range-regex"), user_package), func_n4o(range_regex, 2));
 3302   reg_fun(intern(lit("search-regst"), user_package), func_n4o(search_regst, 2));
 3303   reg_fun(intern(lit("match-regex"), user_package),
 3304           func_n3o((opt_compat && opt_compat <= 150) ?
 3305                    match_regex : match_regex_len, 2));
 3306   reg_fun(intern(lit("match-regst"), user_package), func_n3o(match_regst, 2));
 3307   reg_fun(intern(lit("match-regex-right"), user_package),
 3308           func_n3o((opt_compat && opt_compat <= 150) ?
 3309                    match_regex_right_old : match_regex_right, 2));
 3310   reg_fun(intern(lit("match-regst-right"), user_package),
 3311           func_n3o((opt_compat && opt_compat <= 150) ?
 3312                    match_regst_right_old : match_regst_right, 2));
 3313   reg_fun(intern(lit("regex-prefix-match"), user_package),
 3314           func_n3o(regex_prefix_match, 2));
 3315   reg_fun(intern(lit("regsub"), user_package), func_n3(regsub));
 3316   reg_fun(intern(lit("regex-parse"), user_package), func_n2o(regex_parse, 1));
 3317 
 3318   reg_fun(intern(lit("reg-expand-nongreedy"), system_package),
 3319           func_n1(reg_expand_nongreedy));
 3320   reg_fun(intern(lit("reg-optimize"), system_package), func_n1(reg_optimize));
 3321   reg_fun(intern(lit("read-until-match"), user_package), func_n3o(read_until_match, 1));
 3322   reg_fun(intern(lit("scan-until-match"), user_package), func_n2(scan_until_match));
 3323   reg_fun(intern(lit("count-until-match"), user_package), func_n2(count_until_match));
 3324   reg_fun(intern(lit("f^$"), user_package), func_n2o(regex_match_full_fun, 1));
 3325   reg_fun(intern(lit("f^"), user_package), func_n2o(regex_match_left_fun, 1));
 3326   reg_fun(intern(lit("f$"), user_package), func_n2o(regex_match_right_fun, 1));
 3327   reg_fun(intern(lit("m^$"), user_package), func_n3o(regex_match_full, 2));
 3328   reg_fun(intern(lit("m^"), user_package), func_n3o(regex_match_left, 2));
 3329   reg_fun(intern(lit("m$"), user_package), func_n3o(regex_match_right, 2));
 3330   reg_fun(intern(lit("r^$"), user_package), func_n3o(regex_range_full, 2));
 3331   reg_fun(intern(lit("r^"), user_package), func_n3o(regex_range_left, 2));
 3332   reg_fun(intern(lit("r$"), user_package), func_n3o(regex_range_right, 2));
 3333   reg_fun(intern(lit("rr"), user_package), func_n4o(regex_range_search, 2));
 3334   reg_fun(intern(lit("rra"), user_package), func_n4o(regex_range_all, 2));
 3335   reg_fun(intern(lit("fr^$"), user_package), func_n2o(regex_range_full_fun, 1));
 3336   reg_fun(intern(lit("fr^"), user_package), func_n2o(regex_range_left_fun, 1));
 3337   reg_fun(intern(lit("fr$"), user_package), func_n2o(regex_range_right_fun, 1));
 3338   reg_fun(intern(lit("frr"), user_package), func_n3o(regex_range_search_fun, 1));
 3339   init_special_char_sets();
 3340 }
 3341 
 3342 void regex_free_all(void)
 3343 {
 3344   char_set_destroy(space_cs, 1);
 3345   char_set_destroy(digit_cs, 1);
 3346   char_set_destroy(word_cs, 1);
 3347   char_set_destroy(cspace_cs, 1);
 3348   char_set_destroy(cdigit_cs, 1);
 3349   char_set_destroy(cword_cs, 1);
 3350   char_set_destroy(wide_cs, 1);
 3351 }