"Fossies" - the Fresh Open Source Software Archive

Member "perl-5.30.3/regcomp.c" (17 May 2020, 942252 Bytes) of package /linux/misc/perl-5.30.3.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. See also the latest Fossies "Diffs" side-by-side code changes report for "regcomp.c": 5.30.2_vs_5.30.3.

    1 /*    regcomp.c
    2  */
    3 
    4 /*
    5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
    6  *
    7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
    8  */
    9 
   10 /* This file contains functions for compiling a regular expression.  See
   11  * also regexec.c which funnily enough, contains functions for executing
   12  * a regular expression.
   13  *
   14  * This file is also copied at build time to ext/re/re_comp.c, where
   15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
   16  * This causes the main functions to be compiled under new names and with
   17  * debugging support added, which makes "use re 'debug'" work.
   18  */
   19 
   20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
   21  * confused with the original package (see point 3 below).  Thanks, Henry!
   22  */
   23 
   24 /* Additional note: this code is very heavily munged from Henry's version
   25  * in places.  In some spots I've traded clarity for efficiency, so don't
   26  * blame Henry for some of the lack of readability.
   27  */
   28 
   29 /* The names of the functions have been changed from regcomp and
   30  * regexec to pregcomp and pregexec in order to avoid conflicts
   31  * with the POSIX routines of the same names.
   32 */
   33 
   34 #ifdef PERL_EXT_RE_BUILD
   35 #include "re_top.h"
   36 #endif
   37 
   38 /*
   39  * pregcomp and pregexec -- regsub and regerror are not used in perl
   40  *
   41  *  Copyright (c) 1986 by University of Toronto.
   42  *  Written by Henry Spencer.  Not derived from licensed software.
   43  *
   44  *  Permission is granted to anyone to use this software for any
   45  *  purpose on any computer system, and to redistribute it freely,
   46  *  subject to the following restrictions:
   47  *
   48  *  1. The author is not responsible for the consequences of use of
   49  *      this software, no matter how awful, even if they arise
   50  *      from defects in it.
   51  *
   52  *  2. The origin of this software must not be misrepresented, either
   53  *      by explicit claim or by omission.
   54  *
   55  *  3. Altered versions must be plainly marked as such, and must not
   56  *      be misrepresented as being the original software.
   57  *
   58  *
   59  ****    Alterations to Henry's code are...
   60  ****
   61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
   62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   63  ****    by Larry Wall and others
   64  ****
   65  ****    You may distribute under the terms of either the GNU General Public
   66  ****    License or the Artistic License, as specified in the README file.
   67 
   68  *
   69  * Beware that some of this code is subtly aware of the way operator
   70  * precedence is structured in regular expressions.  Serious changes in
   71  * regular-expression syntax might require a total rethink.
   72  */
   73 #include "EXTERN.h"
   74 #define PERL_IN_REGCOMP_C
   75 #include "perl.h"
   76 
   77 #define REG_COMP_C
   78 #ifdef PERL_IN_XSUB_RE
   79 #  include "re_comp.h"
   80 EXTERN_C const struct regexp_engine my_reg_engine;
   81 #else
   82 #  include "regcomp.h"
   83 #endif
   84 
   85 #include "dquote_inline.h"
   86 #include "invlist_inline.h"
   87 #include "unicode_constants.h"
   88 
   89 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
   90  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
   91 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
   92  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
   93 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
   94 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
   95 
   96 #ifndef STATIC
   97 #define STATIC  static
   98 #endif
   99 
  100 /* this is a chain of data about sub patterns we are processing that
  101    need to be handled separately/specially in study_chunk. Its so
  102    we can simulate recursion without losing state.  */
  103 struct scan_frame;
  104 typedef struct scan_frame {
  105     regnode *last_regnode;      /* last node to process in this frame */
  106     regnode *next_regnode;      /* next node to process when last is reached */
  107     U32 prev_recursed_depth;
  108     I32 stopparen;              /* what stopparen do we use */
  109     bool in_gosub;              /* this or an outer frame is for GOSUB */
  110 
  111     struct scan_frame *this_prev_frame; /* this previous frame */
  112     struct scan_frame *prev_frame;      /* previous frame */
  113     struct scan_frame *next_frame;      /* next frame */
  114 } scan_frame;
  115 
  116 /* Certain characters are output as a sequence with the first being a
  117  * backslash. */
  118 #define isBACKSLASHED_PUNCT(c)  strchr("-[]\\^", c)
  119 
  120 
  121 struct RExC_state_t {
  122     U32     flags;          /* RXf_* are we folding, multilining? */
  123     U32     pm_flags;       /* PMf_* stuff from the calling PMOP */
  124     char    *precomp;       /* uncompiled string. */
  125     char    *precomp_end;       /* pointer to end of uncompiled string. */
  126     REGEXP  *rx_sv;         /* The SV that is the regexp. */
  127     regexp  *rx;                    /* perl core regexp structure */
  128     regexp_internal *rxi;           /* internal data for regexp object
  129                                            pprivate field */
  130     char    *start;         /* Start of input for compile */
  131     char    *end;           /* End of input for compile */
  132     char    *parse;         /* Input-scan pointer. */
  133     char        *copy_start;            /* start of copy of input within
  134                                            constructed parse string */
  135     char        *save_copy_start;       /* Provides one level of saving
  136                                            and restoring 'copy_start' */
  137     char        *copy_start_in_input;   /* Position in input string
  138                                            corresponding to copy_start */
  139     SSize_t whilem_seen;        /* number of WHILEM in this expr */
  140     regnode *emit_start;        /* Start of emitted-code area */
  141     regnode_offset emit;        /* Code-emit pointer */
  142     I32     naughty;        /* How bad is this pattern? */
  143     I32     sawback;        /* Did we see \1, ...? */
  144     U32     seen;
  145     SSize_t size;           /* Number of regnode equivalents in
  146                                            pattern */
  147 
  148     /* position beyond 'precomp' of the warning message furthest away from
  149      * 'precomp'.  During the parse, no warnings are raised for any problems
  150      * earlier in the parse than this position.  This works if warnings are
  151      * raised the first time a given spot is parsed, and if only one
  152      * independent warning is raised for any given spot */
  153     Size_t  latest_warn_offset;
  154 
  155     I32         npar;                   /* Capture buffer count so far in the
  156                                            parse, (OPEN) plus one. ("par" 0 is
  157                                            the whole pattern)*/
  158     I32         total_par;              /* During initial parse, is either 0,
  159                                            or -1; the latter indicating a
  160                                            reparse is needed.  After that pass,
  161                                            it is what 'npar' became after the
  162                                            pass.  Hence, it being > 0 indicates
  163                                            we are in a reparse situation */
  164     I32     nestroot;       /* root parens we are in - used by
  165                                            accept */
  166     I32     seen_zerolen;
  167     regnode_offset *open_parens;    /* offsets to open parens */
  168     regnode_offset *close_parens;   /* offsets to close parens */
  169     I32      parens_buf_size;           /* #slots malloced open/close_parens */
  170     regnode     *end_op;                /* END node in program */
  171     I32     utf8;       /* whether the pattern is utf8 or not */
  172     I32     orig_utf8;  /* whether the pattern was originally in utf8 */
  173                 /* XXX use this for future optimisation of case
  174                  * where pattern must be upgraded to utf8. */
  175     I32     uni_semantics;  /* If a d charset modifier should use unicode
  176                    rules, even if the pattern is not in
  177                    utf8 */
  178     HV      *paren_names;       /* Paren names */
  179 
  180     regnode **recurse;      /* Recurse regops */
  181     I32         recurse_count;          /* Number of recurse regops we have generated */
  182     U8          *study_chunk_recursed;  /* bitmap of which subs we have moved
  183                                            through */
  184     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
  185     I32     in_lookbehind;
  186     I32     contains_locale;
  187     I32     override_recoding;
  188 #ifdef EBCDIC
  189     I32     recode_x_to_native;
  190 #endif
  191     I32     in_multi_char_class;
  192     struct reg_code_blocks *code_blocks;/* positions of literal (?{})
  193                         within pattern */
  194     int     code_index;     /* next code_blocks[] slot */
  195     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
  196     scan_frame *frame_head;
  197     scan_frame *frame_last;
  198     U32         frame_count;
  199     AV         *warn_text;
  200     HV         *unlexed_names;
  201 #ifdef ADD_TO_REGEXEC
  202     char    *starttry;      /* -Dr: where regtry was called. */
  203 #define RExC_starttry   (pRExC_state->starttry)
  204 #endif
  205     SV      *runtime_code_qr;   /* qr with the runtime code blocks */
  206 #ifdef DEBUGGING
  207     const char  *lastparse;
  208     I32         lastnum;
  209     AV          *paren_name_list;       /* idx -> name */
  210     U32         study_chunk_recursed_count;
  211     SV          *mysv1;
  212     SV          *mysv2;
  213 
  214 #define RExC_lastparse  (pRExC_state->lastparse)
  215 #define RExC_lastnum    (pRExC_state->lastnum)
  216 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
  217 #define RExC_study_chunk_recursed_count    (pRExC_state->study_chunk_recursed_count)
  218 #define RExC_mysv   (pRExC_state->mysv1)
  219 #define RExC_mysv1  (pRExC_state->mysv1)
  220 #define RExC_mysv2  (pRExC_state->mysv2)
  221 
  222 #endif
  223     bool        seen_d_op;
  224     bool        strict;
  225     bool        study_started;
  226     bool        in_script_run;
  227     bool        use_BRANCHJ;
  228 };
  229 
  230 #define RExC_flags  (pRExC_state->flags)
  231 #define RExC_pm_flags   (pRExC_state->pm_flags)
  232 #define RExC_precomp    (pRExC_state->precomp)
  233 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
  234 #define RExC_copy_start_in_constructed  (pRExC_state->copy_start)
  235 #define RExC_save_copy_start_in_constructed  (pRExC_state->save_copy_start)
  236 #define RExC_precomp_end (pRExC_state->precomp_end)
  237 #define RExC_rx_sv  (pRExC_state->rx_sv)
  238 #define RExC_rx     (pRExC_state->rx)
  239 #define RExC_rxi    (pRExC_state->rxi)
  240 #define RExC_start  (pRExC_state->start)
  241 #define RExC_end    (pRExC_state->end)
  242 #define RExC_parse  (pRExC_state->parse)
  243 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
  244 #define RExC_whilem_seen    (pRExC_state->whilem_seen)
  245 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
  246                                                    under /d from /u ? */
  247 
  248 
  249 #ifdef RE_TRACK_PATTERN_OFFSETS
  250 #  define RExC_offsets  (RExC_rxi->u.offsets) /* I am not like the
  251                                                          others */
  252 #endif
  253 #define RExC_emit   (pRExC_state->emit)
  254 #define RExC_emit_start (pRExC_state->emit_start)
  255 #define RExC_sawback    (pRExC_state->sawback)
  256 #define RExC_seen   (pRExC_state->seen)
  257 #define RExC_size   (pRExC_state->size)
  258 #define RExC_maxlen        (pRExC_state->maxlen)
  259 #define RExC_npar   (pRExC_state->npar)
  260 #define RExC_total_parens   (pRExC_state->total_par)
  261 #define RExC_parens_buf_size    (pRExC_state->parens_buf_size)
  262 #define RExC_nestroot   (pRExC_state->nestroot)
  263 #define RExC_seen_zerolen   (pRExC_state->seen_zerolen)
  264 #define RExC_utf8   (pRExC_state->utf8)
  265 #define RExC_uni_semantics  (pRExC_state->uni_semantics)
  266 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
  267 #define RExC_open_parens    (pRExC_state->open_parens)
  268 #define RExC_close_parens   (pRExC_state->close_parens)
  269 #define RExC_end_op (pRExC_state->end_op)
  270 #define RExC_paren_names    (pRExC_state->paren_names)
  271 #define RExC_recurse    (pRExC_state->recurse)
  272 #define RExC_recurse_count  (pRExC_state->recurse_count)
  273 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
  274 #define RExC_study_chunk_recursed_bytes  \
  275                                    (pRExC_state->study_chunk_recursed_bytes)
  276 #define RExC_in_lookbehind  (pRExC_state->in_lookbehind)
  277 #define RExC_contains_locale    (pRExC_state->contains_locale)
  278 #ifdef EBCDIC
  279 #   define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
  280 #endif
  281 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
  282 #define RExC_frame_head (pRExC_state->frame_head)
  283 #define RExC_frame_last (pRExC_state->frame_last)
  284 #define RExC_frame_count (pRExC_state->frame_count)
  285 #define RExC_strict (pRExC_state->strict)
  286 #define RExC_study_started      (pRExC_state->study_started)
  287 #define RExC_warn_text (pRExC_state->warn_text)
  288 #define RExC_in_script_run      (pRExC_state->in_script_run)
  289 #define RExC_use_BRANCHJ        (pRExC_state->use_BRANCHJ)
  290 #define RExC_unlexed_names (pRExC_state->unlexed_names)
  291 
  292 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
  293  * a flag to disable back-off on the fixed/floating substrings - if it's
  294  * a high complexity pattern we assume the benefit of avoiding a full match
  295  * is worth the cost of checking for the substrings even if they rarely help.
  296  */
  297 #define RExC_naughty    (pRExC_state->naughty)
  298 #define TOO_NAUGHTY (10)
  299 #define MARK_NAUGHTY(add) \
  300     if (RExC_naughty < TOO_NAUGHTY) \
  301         RExC_naughty += (add)
  302 #define MARK_NAUGHTY_EXP(exp, add) \
  303     if (RExC_naughty < TOO_NAUGHTY) \
  304         RExC_naughty += RExC_naughty / (exp) + (add)
  305 
  306 #define ISMULT1(c)  ((c) == '*' || (c) == '+' || (c) == '?')
  307 #define ISMULT2(s)  ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
  308     ((*s) == '{' && regcurly(s)))
  309 
  310 /*
  311  * Flags to be passed up and down.
  312  */
  313 #define WORST       0   /* Worst case. */
  314 #define HASWIDTH    0x01    /* Known to not match null strings, could match
  315                                    non-null ones. */
  316 
  317 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
  318  * character.  (There needs to be a case: in the switch statement in regexec.c
  319  * for any node marked SIMPLE.)  Note that this is not the same thing as
  320  * REGNODE_SIMPLE */
  321 #define SIMPLE      0x02
  322 #define SPSTART     0x04    /* Starts with * or + */
  323 #define POSTPONED   0x08    /* (?1),(?&name), (??{...}) or similar */
  324 #define TRYAGAIN    0x10    /* Weeded out a declaration. */
  325 #define RESTART_PARSE   0x20    /* Need to redo the parse */
  326 #define NEED_UTF8       0x40    /* In conjunction with RESTART_PARSE, need to
  327                                    calcuate sizes as UTF-8 */
  328 
  329 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
  330 
  331 /* whether trie related optimizations are enabled */
  332 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
  333 #define TRIE_STUDY_OPT
  334 #define FULL_TRIE_STUDY
  335 #define TRIE_STCLASS
  336 #endif
  337 
  338 
  339 
  340 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
  341 #define PBITVAL(paren) (1 << ((paren) & 7))
  342 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
  343 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
  344 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
  345 
  346 #define REQUIRE_UTF8(flagp) STMT_START {                                   \
  347                                      if (!UTF) {                           \
  348                                          *flagp = RESTART_PARSE|NEED_UTF8; \
  349                                          return 0;                         \
  350                                      }                                     \
  351                              } STMT_END
  352 
  353 /* Change from /d into /u rules, and restart the parse.  RExC_uni_semantics is
  354  * a flag that indicates we need to override /d with /u as a result of
  355  * something in the pattern.  It should only be used in regards to calling
  356  * set_regex_charset() or get_regex_charse() */
  357 #define REQUIRE_UNI_RULES(flagp, restart_retval)                            \
  358     STMT_START {                                                            \
  359             if (DEPENDS_SEMANTICS) {                                        \
  360                 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);      \
  361                 RExC_uni_semantics = 1;                                     \
  362                 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) {           \
  363                     /* No need to restart the parse if we haven't seen      \
  364                      * anything that differs between /u and /d, and no need \
  365                      * to restart immediately if we're going to reparse     \
  366                      * anyway to count parens */                            \
  367                     *flagp |= RESTART_PARSE;                                \
  368                     return restart_retval;                                  \
  369                 }                                                           \
  370             }                                                               \
  371     } STMT_END
  372 
  373 #define REQUIRE_BRANCHJ(flagp, restart_retval)                              \
  374     STMT_START {                                                            \
  375                 RExC_use_BRANCHJ = 1;                                       \
  376                 *flagp |= RESTART_PARSE;                                    \
  377                 return restart_retval;                                      \
  378     } STMT_END
  379 
  380 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
  381  * less.  After that, it must always be positive, because the whole re is
  382  * considered to be surrounded by virtual parens.  Setting it to negative
  383  * indicates there is some construct that needs to know the actual number of
  384  * parens to be properly handled.  And that means an extra pass will be
  385  * required after we've counted them all */
  386 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
  387 #define REQUIRE_PARENS_PASS                                                 \
  388     STMT_START {  /* No-op if have completed a pass */                      \
  389                     if (! ALL_PARENS_COUNTED) RExC_total_parens = -1;       \
  390     } STMT_END
  391 #define IN_PARENS_PASS (RExC_total_parens < 0)
  392 
  393 
  394 /* This is used to return failure (zero) early from the calling function if
  395  * various flags in 'flags' are set.  Two flags always cause a return:
  396  * 'RESTART_PARSE' and 'NEED_UTF8'.   'extra' can be used to specify any
  397  * additional flags that should cause a return; 0 if none.  If the return will
  398  * be done, '*flagp' is first set to be all of the flags that caused the
  399  * return. */
  400 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra)                  \
  401     STMT_START {                                                            \
  402             if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) {              \
  403                 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra));     \
  404                 return 0;                                                   \
  405             }                                                               \
  406     } STMT_END
  407 
  408 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
  409 
  410 #define RETURN_FAIL_ON_RESTART(flags,flagp)                                 \
  411                         RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
  412 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp)                                 \
  413                                     if (MUST_RESTART(*(flagp))) return 0
  414 
  415 /* This converts the named class defined in regcomp.h to its equivalent class
  416  * number defined in handy.h. */
  417 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
  418 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
  419 
  420 #define _invlist_union_complement_2nd(a, b, output) \
  421                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
  422 #define _invlist_intersection_complement_2nd(a, b, output) \
  423                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
  424 
  425 /* About scan_data_t.
  426 
  427   During optimisation we recurse through the regexp program performing
  428   various inplace (keyhole style) optimisations. In addition study_chunk
  429   and scan_commit populate this data structure with information about
  430   what strings MUST appear in the pattern. We look for the longest
  431   string that must appear at a fixed location, and we look for the
  432   longest string that may appear at a floating location. So for instance
  433   in the pattern:
  434 
  435     /FOO[xX]A.*B[xX]BAR/
  436 
  437   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
  438   strings (because they follow a .* construct). study_chunk will identify
  439   both FOO and BAR as being the longest fixed and floating strings respectively.
  440 
  441   The strings can be composites, for instance
  442 
  443      /(f)(o)(o)/
  444 
  445   will result in a composite fixed substring 'foo'.
  446 
  447   For each string some basic information is maintained:
  448 
  449   - min_offset
  450     This is the position the string must appear at, or not before.
  451     It also implicitly (when combined with minlenp) tells us how many
  452     characters must match before the string we are searching for.
  453     Likewise when combined with minlenp and the length of the string it
  454     tells us how many characters must appear after the string we have
  455     found.
  456 
  457   - max_offset
  458     Only used for floating strings. This is the rightmost point that
  459     the string can appear at. If set to SSize_t_MAX it indicates that the
  460     string can occur infinitely far to the right.
  461     For fixed strings, it is equal to min_offset.
  462 
  463   - minlenp
  464     A pointer to the minimum number of characters of the pattern that the
  465     string was found inside. This is important as in the case of positive
  466     lookahead or positive lookbehind we can have multiple patterns
  467     involved. Consider
  468 
  469     /(?=FOO).*F/
  470 
  471     The minimum length of the pattern overall is 3, the minimum length
  472     of the lookahead part is 3, but the minimum length of the part that
  473     will actually match is 1. So 'FOO's minimum length is 3, but the
  474     minimum length for the F is 1. This is important as the minimum length
  475     is used to determine offsets in front of and behind the string being
  476     looked for.  Since strings can be composites this is the length of the
  477     pattern at the time it was committed with a scan_commit. Note that
  478     the length is calculated by study_chunk, so that the minimum lengths
  479     are not known until the full pattern has been compiled, thus the
  480     pointer to the value.
  481 
  482   - lookbehind
  483 
  484     In the case of lookbehind the string being searched for can be
  485     offset past the start point of the final matching string.
  486     If this value was just blithely removed from the min_offset it would
  487     invalidate some of the calculations for how many chars must match
  488     before or after (as they are derived from min_offset and minlen and
  489     the length of the string being searched for).
  490     When the final pattern is compiled and the data is moved from the
  491     scan_data_t structure into the regexp structure the information
  492     about lookbehind is factored in, with the information that would
  493     have been lost precalculated in the end_shift field for the
  494     associated string.
  495 
  496   The fields pos_min and pos_delta are used to store the minimum offset
  497   and the delta to the maximum offset at the current point in the pattern.
  498 
  499 */
  500 
  501 struct scan_data_substrs {
  502     SV      *str;       /* longest substring found in pattern */
  503     SSize_t min_offset; /* earliest point in string it can appear */
  504     SSize_t max_offset; /* latest point in string it can appear */
  505     SSize_t *minlenp;   /* pointer to the minlen relevant to the string */
  506     SSize_t lookbehind; /* is the pos of the string modified by LB */
  507     I32 flags;          /* per substring SF_* and SCF_* flags */
  508 };
  509 
  510 typedef struct scan_data_t {
  511     /*I32 len_min;      unused */
  512     /*I32 len_delta;    unused */
  513     SSize_t pos_min;
  514     SSize_t pos_delta;
  515     SV *last_found;
  516     SSize_t last_end;       /* min value, <0 unless valid. */
  517     SSize_t last_start_min;
  518     SSize_t last_start_max;
  519     U8      cur_is_floating; /* whether the last_* values should be set as
  520                               * the next fixed (0) or floating (1)
  521                               * substring */
  522 
  523     /* [0] is longest fixed substring so far, [1] is longest float so far */
  524     struct scan_data_substrs  substrs[2];
  525 
  526     I32 flags;             /* common SF_* and SCF_* flags */
  527     I32 whilem_c;
  528     SSize_t *last_closep;
  529     regnode_ssc *start_class;
  530 } scan_data_t;
  531 
  532 /*
  533  * Forward declarations for pregcomp()'s friends.
  534  */
  535 
  536 static const scan_data_t zero_scan_data = {
  537     0, 0, NULL, 0, 0, 0, 0,
  538     {
  539         { NULL, 0, 0, 0, 0, 0 },
  540         { NULL, 0, 0, 0, 0, 0 },
  541     },
  542     0, 0, NULL, NULL
  543 };
  544 
  545 /* study flags */
  546 
  547 #define SF_BEFORE_SEOL      0x0001
  548 #define SF_BEFORE_MEOL      0x0002
  549 #define SF_BEFORE_EOL       (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
  550 
  551 #define SF_IS_INF       0x0040
  552 #define SF_HAS_PAR      0x0080
  553 #define SF_IN_PAR       0x0100
  554 #define SF_HAS_EVAL     0x0200
  555 
  556 
  557 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
  558  * longest substring in the pattern. When it is not set the optimiser keeps
  559  * track of position, but does not keep track of the actual strings seen,
  560  *
  561  * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
  562  * /foo/i will not.
  563  *
  564  * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
  565  * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
  566  * turned off because of the alternation (BRANCH). */
  567 #define SCF_DO_SUBSTR       0x0400
  568 
  569 #define SCF_DO_STCLASS_AND  0x0800
  570 #define SCF_DO_STCLASS_OR   0x1000
  571 #define SCF_DO_STCLASS      (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
  572 #define SCF_WHILEM_VISITED_POS  0x2000
  573 
  574 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
  575 #define SCF_SEEN_ACCEPT         0x8000
  576 #define SCF_TRIE_DOING_RESTUDY 0x10000
  577 #define SCF_IN_DEFINE          0x20000
  578 
  579 
  580 
  581 
  582 #define UTF cBOOL(RExC_utf8)
  583 
  584 /* The enums for all these are ordered so things work out correctly */
  585 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
  586 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
  587                                                      == REGEX_DEPENDS_CHARSET)
  588 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
  589 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
  590                                                      >= REGEX_UNICODE_CHARSET)
  591 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
  592                                             == REGEX_ASCII_RESTRICTED_CHARSET)
  593 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
  594                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
  595 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
  596                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
  597 
  598 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
  599 
  600 /* For programs that want to be strictly Unicode compatible by dying if any
  601  * attempt is made to match a non-Unicode code point against a Unicode
  602  * property.  */
  603 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
  604 
  605 #define OOB_NAMEDCLASS      -1
  606 
  607 /* There is no code point that is out-of-bounds, so this is problematic.  But
  608  * its only current use is to initialize a variable that is always set before
  609  * looked at. */
  610 #define OOB_UNICODE     0xDEADBEEF
  611 
  612 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
  613 
  614 
  615 /* length of regex to show in messages that don't mark a position within */
  616 #define RegexLengthToShowInErrorMessages 127
  617 
  618 /*
  619  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
  620  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
  621  * op/pragma/warn/regcomp.
  622  */
  623 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
  624 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
  625 
  626 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
  627                         " in m/%" UTF8f MARKER2 "%" UTF8f "/"
  628 
  629 /* The code in this file in places uses one level of recursion with parsing
  630  * rebased to an alternate string constructed by us in memory.  This can take
  631  * the form of something that is completely different from the input, or
  632  * something that uses the input as part of the alternate.  In the first case,
  633  * there should be no possibility of an error, as we are in complete control of
  634  * the alternate string.  But in the second case we don't completely control
  635  * the input portion, so there may be errors in that.  Here's an example:
  636  *      /[abc\x{DF}def]/ui
  637  * is handled specially because \x{df} folds to a sequence of more than one
  638  * character: 'ss'.  What is done is to create and parse an alternate string,
  639  * which looks like this:
  640  *      /(?:\x{DF}|[abc\x{DF}def])/ui
  641  * where it uses the input unchanged in the middle of something it constructs,
  642  * which is a branch for the DF outside the character class, and clustering
  643  * parens around the whole thing. (It knows enough to skip the DF inside the
  644  * class while in this substitute parse.) 'abc' and 'def' may have errors that
  645  * need to be reported.  The general situation looks like this:
  646  *
  647  *                                       |<------- identical ------>|
  648  *              sI                       tI               xI       eI
  649  * Input:       ---------------------------------------------------------------
  650  * Constructed:         ---------------------------------------------------
  651  *                      sC               tC               xC       eC     EC
  652  *                                       |<------- identical ------>|
  653  *
  654  * sI..eI   is the portion of the input pattern we are concerned with here.
  655  * sC..EC   is the constructed substitute parse string.
  656  *  sC..tC  is constructed by us
  657  *  tC..eC  is an exact duplicate of the portion of the input pattern tI..eI.
  658  *          In the diagram, these are vertically aligned.
  659  *  eC..EC  is also constructed by us.
  660  * xC       is the position in the substitute parse string where we found a
  661  *          problem.
  662  * xI       is the position in the original pattern corresponding to xC.
  663  *
  664  * We want to display a message showing the real input string.  Thus we need to
  665  * translate from xC to xI.  We know that xC >= tC, since the portion of the
  666  * string sC..tC has been constructed by us, and so shouldn't have errors.  We
  667  * get:
  668  *      xI = tI + (xC - tC)
  669  *
  670  * When the substitute parse is constructed, the code needs to set:
  671  *      RExC_start (sC)
  672  *      RExC_end (eC)
  673  *      RExC_copy_start_in_input  (tI)
  674  *      RExC_copy_start_in_constructed (tC)
  675  * and restore them when done.
  676  *
  677  * During normal processing of the input pattern, both
  678  * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
  679  * sI, so that xC equals xI.
  680  */
  681 
  682 #define sI              RExC_precomp
  683 #define eI              RExC_precomp_end
  684 #define sC              RExC_start
  685 #define eC              RExC_end
  686 #define tI              RExC_copy_start_in_input
  687 #define tC              RExC_copy_start_in_constructed
  688 #define xI(xC)          (tI + (xC - tC))
  689 #define xI_offset(xC)   (xI(xC) - sI)
  690 
  691 #define REPORT_LOCATION_ARGS(xC)                                            \
  692     UTF8fARG(UTF,                                                           \
  693              (xI(xC) > eI) /* Don't run off end */                          \
  694               ? eI - sI   /* Length before the <--HERE */                   \
  695               : ((xI_offset(xC) >= 0)                                       \
  696                  ? xI_offset(xC)                                            \
  697                  : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %"    \
  698                                     IVdf " trying to output message for "   \
  699                                     " pattern %.*s",                        \
  700                                     __FILE__, __LINE__, (IV) xI_offset(xC), \
  701                                     ((int) (eC - sC)), sC), 0)),            \
  702              sI),         /* The input pattern printed up to the <--HERE */ \
  703     UTF8fARG(UTF,                                                           \
  704              (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */    \
  705              (xI(xC) > eI) ? eI : xI(xC))     /* pattern after <--HERE */
  706 
  707 /* Used to point after bad bytes for an error message, but avoid skipping
  708  * past a nul byte. */
  709 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
  710 
  711 /* Set up to clean up after our imminent demise */
  712 #define PREPARE_TO_DIE                                                      \
  713     STMT_START {                                        \
  714         if (RExC_rx_sv)                                                     \
  715             SAVEFREESV(RExC_rx_sv);                                         \
  716         if (RExC_open_parens)                                               \
  717             SAVEFREEPV(RExC_open_parens);                                   \
  718         if (RExC_close_parens)                                              \
  719             SAVEFREEPV(RExC_close_parens);                                  \
  720     } STMT_END
  721 
  722 /*
  723  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
  724  * arg. Show regex, up to a maximum length. If it's too long, chop and add
  725  * "...".
  726  */
  727 #define _FAIL(code) STMT_START {                    \
  728     const char *ellipses = "";                      \
  729     IV len = RExC_precomp_end - RExC_precomp;               \
  730                                     \
  731     PREPARE_TO_DIE;                             \
  732     if (len > RegexLengthToShowInErrorMessages) {           \
  733     /* chop 10 shorter than the max, to ensure meaning of "..." */  \
  734     len = RegexLengthToShowInErrorMessages - 10;            \
  735     ellipses = "...";                       \
  736     }                                   \
  737     code;                                                               \
  738 } STMT_END
  739 
  740 #define FAIL(msg) _FAIL(                \
  741     Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/",     \
  742         msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
  743 
  744 #define FAIL2(msg,arg) _FAIL(               \
  745     Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/",       \
  746         arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
  747 
  748 /*
  749  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
  750  */
  751 #define Simple_vFAIL(m) STMT_START {                    \
  752     Perl_croak(aTHX_ "%s" REPORT_LOCATION,              \
  753         m, REPORT_LOCATION_ARGS(RExC_parse));                   \
  754 } STMT_END
  755 
  756 /*
  757  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
  758  */
  759 #define vFAIL(m) STMT_START {               \
  760     PREPARE_TO_DIE;                                     \
  761     Simple_vFAIL(m);                    \
  762 } STMT_END
  763 
  764 /*
  765  * Like Simple_vFAIL(), but accepts two arguments.
  766  */
  767 #define Simple_vFAIL2(m,a1) STMT_START {            \
  768     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,      \
  769                       REPORT_LOCATION_ARGS(RExC_parse));    \
  770 } STMT_END
  771 
  772 /*
  773  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
  774  */
  775 #define vFAIL2(m,a1) STMT_START {           \
  776     PREPARE_TO_DIE;                                     \
  777     Simple_vFAIL2(m, a1);               \
  778 } STMT_END
  779 
  780 
  781 /*
  782  * Like Simple_vFAIL(), but accepts three arguments.
  783  */
  784 #define Simple_vFAIL3(m, a1, a2) STMT_START {           \
  785     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,      \
  786         REPORT_LOCATION_ARGS(RExC_parse));                  \
  787 } STMT_END
  788 
  789 /*
  790  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
  791  */
  792 #define vFAIL3(m,a1,a2) STMT_START {            \
  793     PREPARE_TO_DIE;                                     \
  794     Simple_vFAIL3(m, a1, a2);               \
  795 } STMT_END
  796 
  797 /*
  798  * Like Simple_vFAIL(), but accepts four arguments.
  799  */
  800 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {       \
  801     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,  \
  802         REPORT_LOCATION_ARGS(RExC_parse));                  \
  803 } STMT_END
  804 
  805 #define vFAIL4(m,a1,a2,a3) STMT_START {         \
  806     PREPARE_TO_DIE;                                     \
  807     Simple_vFAIL4(m, a1, a2, a3);           \
  808 } STMT_END
  809 
  810 /* A specialized version of vFAIL2 that works with UTF8f */
  811 #define vFAIL2utf8f(m, a1) STMT_START {             \
  812     PREPARE_TO_DIE;                                 \
  813     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,  \
  814             REPORT_LOCATION_ARGS(RExC_parse));      \
  815 } STMT_END
  816 
  817 #define vFAIL3utf8f(m, a1, a2) STMT_START {             \
  818     PREPARE_TO_DIE;                                     \
  819     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
  820             REPORT_LOCATION_ARGS(RExC_parse));          \
  821 } STMT_END
  822 
  823 /* Setting this to NULL is a signal to not output warnings */
  824 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE                               \
  825     STMT_START {                                                            \
  826       RExC_save_copy_start_in_constructed  = RExC_copy_start_in_constructed;\
  827       RExC_copy_start_in_constructed = NULL;                                \
  828     } STMT_END
  829 #define RESTORE_WARNINGS                                                    \
  830     RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
  831 
  832 /* Since a warning can be generated multiple times as the input is reparsed, we
  833  * output it the first time we come to that point in the parse, but suppress it
  834  * otherwise.  'RExC_copy_start_in_constructed' being NULL is a flag to not
  835  * generate any warnings */
  836 #define TO_OUTPUT_WARNINGS(loc)                                         \
  837   (   RExC_copy_start_in_constructed                                    \
  838    && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
  839 
  840 /* After we've emitted a warning, we save the position in the input so we don't
  841  * output it again */
  842 #define UPDATE_WARNINGS_LOC(loc)                                        \
  843     STMT_START {                                                        \
  844         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
  845             RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc)))         \
  846                                                        - RExC_precomp;  \
  847         }                                                               \
  848     } STMT_END
  849 
  850 /* 'warns' is the output of the packWARNx macro used in 'code' */
  851 #define _WARN_HELPER(loc, warns, code)                                  \
  852     STMT_START {                                                        \
  853         if (! RExC_copy_start_in_constructed) {                         \
  854             Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none"  \
  855                               " expected at '%s'",                      \
  856                               __FILE__, __LINE__, loc);                 \
  857         }                                                               \
  858         if (TO_OUTPUT_WARNINGS(loc)) {                                  \
  859             if (ckDEAD(warns))                                          \
  860                 PREPARE_TO_DIE;                                         \
  861             code;                                                       \
  862             UPDATE_WARNINGS_LOC(loc);                                   \
  863         }                                                               \
  864     } STMT_END
  865 
  866 /* m is not necessarily a "literal string", in this macro */
  867 #define reg_warn_non_literal_string(loc, m)                             \
  868     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
  869                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
  870                                        "%s" REPORT_LOCATION,            \
  871                                   m, REPORT_LOCATION_ARGS(loc)))
  872 
  873 #define ckWARNreg(loc,m)                            \
  874     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
  875                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
  876                                           m REPORT_LOCATION,            \
  877                                       REPORT_LOCATION_ARGS(loc)))
  878 
  879 #define vWARN(loc, m)                                   \
  880     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
  881                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
  882                                        m REPORT_LOCATION,               \
  883                                        REPORT_LOCATION_ARGS(loc)))      \
  884 
  885 #define vWARN_dep(loc, m)                                   \
  886     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
  887                       Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),      \
  888                                        m REPORT_LOCATION,               \
  889                                    REPORT_LOCATION_ARGS(loc)))
  890 
  891 #define ckWARNdep(loc,m)                                    \
  892     _WARN_HELPER(loc, packWARN(WARN_DEPRECATED),                        \
  893                       Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
  894                                         m REPORT_LOCATION,          \
  895                                         REPORT_LOCATION_ARGS(loc)))
  896 
  897 #define ckWARNregdep(loc,m)                                 \
  898     _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP),              \
  899                       Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED,     \
  900                                                       WARN_REGEXP),         \
  901                                          m REPORT_LOCATION,             \
  902                                          REPORT_LOCATION_ARGS(loc)))
  903 
  904 #define ckWARN2reg_d(loc,m, a1)                                 \
  905     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
  906                       Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),         \
  907                                         m REPORT_LOCATION,              \
  908                                         a1, REPORT_LOCATION_ARGS(loc)))
  909 
  910 #define ckWARN2reg(loc, m, a1)                                              \
  911     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
  912                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
  913                                           m REPORT_LOCATION,                \
  914                                           a1, REPORT_LOCATION_ARGS(loc)))
  915 
  916 #define vWARN3(loc, m, a1, a2)                              \
  917     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
  918                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),              \
  919                                        m REPORT_LOCATION,                   \
  920                                    a1, a2, REPORT_LOCATION_ARGS(loc)))
  921 
  922 #define ckWARN3reg(loc, m, a1, a2)                              \
  923     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                                \
  924                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),           \
  925                                           m REPORT_LOCATION,                \
  926                                       a1, a2,                           \
  927                                           REPORT_LOCATION_ARGS(loc)))
  928 
  929 #define vWARN4(loc, m, a1, a2, a3)                          \
  930     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
  931                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
  932                                        m REPORT_LOCATION,               \
  933                                    a1, a2, a3,                      \
  934                                        REPORT_LOCATION_ARGS(loc)))
  935 
  936 #define ckWARN4reg(loc, m, a1, a2, a3)                      \
  937     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
  938                       Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),       \
  939                                           m REPORT_LOCATION,            \
  940                                       a1, a2, a3,                   \
  941                                           REPORT_LOCATION_ARGS(loc)))
  942 
  943 #define vWARN5(loc, m, a1, a2, a3, a4)                      \
  944     _WARN_HELPER(loc, packWARN(WARN_REGEXP),                            \
  945                       Perl_warner(aTHX_ packWARN(WARN_REGEXP),          \
  946                                        m REPORT_LOCATION,       \
  947                                    a1, a2, a3, a4,                  \
  948                                        REPORT_LOCATION_ARGS(loc)))
  949 
  950 #define ckWARNexperimental(loc, class, m)                               \
  951     _WARN_HELPER(loc, packWARN(class),                                  \
  952                       Perl_ck_warner_d(aTHX_ packWARN(class),           \
  953                                             m REPORT_LOCATION,          \
  954                                             REPORT_LOCATION_ARGS(loc)))
  955 
  956 /* Convert between a pointer to a node and its offset from the beginning of the
  957  * program */
  958 #define REGNODE_p(offset)    (RExC_emit_start + (offset))
  959 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
  960 
  961 /* Macros for recording node offsets.   20001227 mjd@plover.com
  962  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
  963  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
  964  * Element 0 holds the number n.
  965  * Position is 1 indexed.
  966  */
  967 #ifndef RE_TRACK_PATTERN_OFFSETS
  968 #define Set_Node_Offset_To_R(offset,byte)
  969 #define Set_Node_Offset(node,byte)
  970 #define Set_Cur_Node_Offset
  971 #define Set_Node_Length_To_R(node,len)
  972 #define Set_Node_Length(node,len)
  973 #define Set_Node_Cur_Length(node,start)
  974 #define Node_Offset(n)
  975 #define Node_Length(n)
  976 #define Set_Node_Offset_Length(node,offset,len)
  977 #define ProgLen(ri) ri->u.proglen
  978 #define SetProgLen(ri,x) ri->u.proglen = x
  979 #define Track_Code(code)
  980 #else
  981 #define ProgLen(ri) ri->u.offsets[0]
  982 #define SetProgLen(ri,x) ri->u.offsets[0] = x
  983 #define Set_Node_Offset_To_R(offset,byte) STMT_START {          \
  984     MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",     \
  985             __LINE__, (int)(offset), (int)(byte)));     \
  986     if((offset) < 0) {                      \
  987         Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
  988                                          (int)(offset));                \
  989     } else {                            \
  990             RExC_offsets[2*(offset)-1] = (byte);                    \
  991     }                               \
  992 } STMT_END
  993 
  994 #define Set_Node_Offset(node,byte)                                      \
  995     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
  996 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
  997 
  998 #define Set_Node_Length_To_R(node,len) STMT_START {         \
  999     MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",       \
 1000         __LINE__, (int)(node), (int)(len)));            \
 1001     if((node) < 0) {                        \
 1002         Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 1003                                          (int)(node));                  \
 1004     } else {                            \
 1005         RExC_offsets[2*(node)] = (len);             \
 1006     }                               \
 1007 } STMT_END
 1008 
 1009 #define Set_Node_Length(node,len) \
 1010     Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
 1011 #define Set_Node_Cur_Length(node, start)                \
 1012     Set_Node_Length(node, RExC_parse - start)
 1013 
 1014 /* Get offsets and lengths */
 1015 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
 1016 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
 1017 
 1018 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 1019     Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset));   \
 1020     Set_Node_Length_To_R(REGNODE_OFFSET(node), (len));  \
 1021 } STMT_END
 1022 
 1023 #define Track_Code(code) STMT_START { code } STMT_END
 1024 #endif
 1025 
 1026 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 1027 #define EXPERIMENTAL_INPLACESCAN
 1028 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 1029 
 1030 #ifdef DEBUGGING
 1031 int
 1032 Perl_re_printf(pTHX_ const char *fmt, ...)
 1033 {
 1034     va_list ap;
 1035     int result;
 1036     PerlIO *f= Perl_debug_log;
 1037     PERL_ARGS_ASSERT_RE_PRINTF;
 1038     va_start(ap, fmt);
 1039     result = PerlIO_vprintf(f, fmt, ap);
 1040     va_end(ap);
 1041     return result;
 1042 }
 1043 
 1044 int
 1045 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
 1046 {
 1047     va_list ap;
 1048     int result;
 1049     PerlIO *f= Perl_debug_log;
 1050     PERL_ARGS_ASSERT_RE_INDENTF;
 1051     va_start(ap, depth);
 1052     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
 1053     result = PerlIO_vprintf(f, fmt, ap);
 1054     va_end(ap);
 1055     return result;
 1056 }
 1057 #endif /* DEBUGGING */
 1058 
 1059 #define DEBUG_RExC_seen()                                                   \
 1060         DEBUG_OPTIMISE_MORE_r({                                             \
 1061             Perl_re_printf( aTHX_ "RExC_seen: ");                           \
 1062                                                                             \
 1063             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 1064                 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN ");                \
 1065                                                                             \
 1066             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 1067                 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN ");              \
 1068                                                                             \
 1069             if (RExC_seen & REG_GPOS_SEEN)                                  \
 1070                 Perl_re_printf( aTHX_ "REG_GPOS_SEEN ");                    \
 1071                                                                             \
 1072             if (RExC_seen & REG_RECURSE_SEEN)                               \
 1073                 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN ");                 \
 1074                                                                             \
 1075             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                    \
 1076                 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN ");      \
 1077                                                                             \
 1078             if (RExC_seen & REG_VERBARG_SEEN)                               \
 1079                 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN ");                 \
 1080                                                                             \
 1081             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 1082                 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN ");                \
 1083                                                                             \
 1084             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 1085                 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN ");          \
 1086                                                                             \
 1087             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 1088                 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN ");          \
 1089                                                                             \
 1090             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                  \
 1091                 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN ");    \
 1092                                                                             \
 1093             Perl_re_printf( aTHX_ "\n");                                    \
 1094         });
 1095 
 1096 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
 1097   if ((flags) & flag) Perl_re_printf( aTHX_  "%s ", #flag)
 1098 
 1099 
 1100 #ifdef DEBUGGING
 1101 static void
 1102 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
 1103                                     const char *close_str)
 1104 {
 1105     if (!flags)
 1106         return;
 1107 
 1108     Perl_re_printf( aTHX_  "%s", open_str);
 1109     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
 1110     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
 1111     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
 1112     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
 1113     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
 1114     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
 1115     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
 1116     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
 1117     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
 1118     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
 1119     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
 1120     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
 1121     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
 1122     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
 1123     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
 1124     Perl_re_printf( aTHX_  "%s", close_str);
 1125 }
 1126 
 1127 
 1128 static void
 1129 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
 1130                     U32 depth, int is_inf)
 1131 {
 1132     GET_RE_DEBUG_FLAGS_DECL;
 1133 
 1134     DEBUG_OPTIMISE_MORE_r({
 1135         if (!data)
 1136             return;
 1137         Perl_re_indentf(aTHX_  "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
 1138             depth,
 1139             where,
 1140             (IV)data->pos_min,
 1141             (IV)data->pos_delta,
 1142             (UV)data->flags
 1143         );
 1144 
 1145         S_debug_show_study_flags(aTHX_ data->flags," [","]");
 1146 
 1147         Perl_re_printf( aTHX_
 1148             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
 1149             (IV)data->whilem_c,
 1150             (IV)(data->last_closep ? *((data)->last_closep) : -1),
 1151             is_inf ? "INF " : ""
 1152         );
 1153 
 1154         if (data->last_found) {
 1155             int i;
 1156             Perl_re_printf(aTHX_
 1157                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
 1158                     SvPVX_const(data->last_found),
 1159                     (IV)data->last_end,
 1160                     (IV)data->last_start_min,
 1161                     (IV)data->last_start_max
 1162             );
 1163 
 1164             for (i = 0; i < 2; i++) {
 1165                 Perl_re_printf(aTHX_
 1166                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
 1167                     data->cur_is_floating == i ? "*" : "",
 1168                     i ? "Float" : "Fixed",
 1169                     SvPVX_const(data->substrs[i].str),
 1170                     (IV)data->substrs[i].min_offset,
 1171                     (IV)data->substrs[i].max_offset
 1172                 );
 1173                 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
 1174             }
 1175         }
 1176 
 1177         Perl_re_printf( aTHX_ "\n");
 1178     });
 1179 }
 1180 
 1181 
 1182 static void
 1183 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
 1184                 regnode *scan, U32 depth, U32 flags)
 1185 {
 1186     GET_RE_DEBUG_FLAGS_DECL;
 1187 
 1188     DEBUG_OPTIMISE_r({
 1189         regnode *Next;
 1190 
 1191         if (!scan)
 1192             return;
 1193         Next = regnext(scan);
 1194         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
 1195         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
 1196             depth,
 1197             str,
 1198             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
 1199             Next ? (REG_NODE_NUM(Next)) : 0 );
 1200         S_debug_show_study_flags(aTHX_ flags," [ ","]");
 1201         Perl_re_printf( aTHX_  "\n");
 1202    });
 1203 }
 1204 
 1205 
 1206 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) \
 1207                     S_debug_studydata(aTHX_ where, data, depth, is_inf)
 1208 
 1209 #  define DEBUG_PEEP(str, scan, depth, flags)   \
 1210                     S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
 1211 
 1212 #else
 1213 #  define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
 1214 #  define DEBUG_PEEP(str, scan, depth, flags)         NOOP
 1215 #endif
 1216 
 1217 
 1218 /* =========================================================
 1219  * BEGIN edit_distance stuff.
 1220  *
 1221  * This calculates how many single character changes of any type are needed to
 1222  * transform a string into another one.  It is taken from version 3.1 of
 1223  *
 1224  * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
 1225  */
 1226 
 1227 /* Our unsorted dictionary linked list.   */
 1228 /* Note we use UVs, not chars. */
 1229 
 1230 struct dictionary{
 1231   UV key;
 1232   UV value;
 1233   struct dictionary* next;
 1234 };
 1235 typedef struct dictionary item;
 1236 
 1237 
 1238 PERL_STATIC_INLINE item*
 1239 push(UV key, item* curr)
 1240 {
 1241     item* head;
 1242     Newx(head, 1, item);
 1243     head->key = key;
 1244     head->value = 0;
 1245     head->next = curr;
 1246     return head;
 1247 }
 1248 
 1249 
 1250 PERL_STATIC_INLINE item*
 1251 find(item* head, UV key)
 1252 {
 1253     item* iterator = head;
 1254     while (iterator){
 1255         if (iterator->key == key){
 1256             return iterator;
 1257         }
 1258         iterator = iterator->next;
 1259     }
 1260 
 1261     return NULL;
 1262 }
 1263 
 1264 PERL_STATIC_INLINE item*
 1265 uniquePush(item* head, UV key)
 1266 {
 1267     item* iterator = head;
 1268 
 1269     while (iterator){
 1270         if (iterator->key == key) {
 1271             return head;
 1272         }
 1273         iterator = iterator->next;
 1274     }
 1275 
 1276     return push(key, head);
 1277 }
 1278 
 1279 PERL_STATIC_INLINE void
 1280 dict_free(item* head)
 1281 {
 1282     item* iterator = head;
 1283 
 1284     while (iterator) {
 1285         item* temp = iterator;
 1286         iterator = iterator->next;
 1287         Safefree(temp);
 1288     }
 1289 
 1290     head = NULL;
 1291 }
 1292 
 1293 /* End of Dictionary Stuff */
 1294 
 1295 /* All calculations/work are done here */
 1296 STATIC int
 1297 S_edit_distance(const UV* src,
 1298                 const UV* tgt,
 1299                 const STRLEN x,             /* length of src[] */
 1300                 const STRLEN y,             /* length of tgt[] */
 1301                 const SSize_t maxDistance
 1302 )
 1303 {
 1304     item *head = NULL;
 1305     UV swapCount, swapScore, targetCharCount, i, j;
 1306     UV *scores;
 1307     UV score_ceil = x + y;
 1308 
 1309     PERL_ARGS_ASSERT_EDIT_DISTANCE;
 1310 
 1311     /* intialize matrix start values */
 1312     Newx(scores, ( (x + 2) * (y + 2)), UV);
 1313     scores[0] = score_ceil;
 1314     scores[1 * (y + 2) + 0] = score_ceil;
 1315     scores[0 * (y + 2) + 1] = score_ceil;
 1316     scores[1 * (y + 2) + 1] = 0;
 1317     head = uniquePush(uniquePush(head, src[0]), tgt[0]);
 1318 
 1319     /* work loops    */
 1320     /* i = src index */
 1321     /* j = tgt index */
 1322     for (i=1;i<=x;i++) {
 1323         if (i < x)
 1324             head = uniquePush(head, src[i]);
 1325         scores[(i+1) * (y + 2) + 1] = i;
 1326         scores[(i+1) * (y + 2) + 0] = score_ceil;
 1327         swapCount = 0;
 1328 
 1329         for (j=1;j<=y;j++) {
 1330             if (i == 1) {
 1331                 if(j < y)
 1332                 head = uniquePush(head, tgt[j]);
 1333                 scores[1 * (y + 2) + (j + 1)] = j;
 1334                 scores[0 * (y + 2) + (j + 1)] = score_ceil;
 1335             }
 1336 
 1337             targetCharCount = find(head, tgt[j-1])->value;
 1338             swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
 1339 
 1340             if (src[i-1] != tgt[j-1]){
 1341                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
 1342             }
 1343             else {
 1344                 swapCount = j;
 1345                 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
 1346             }
 1347         }
 1348 
 1349         find(head, src[i-1])->value = i;
 1350     }
 1351 
 1352     {
 1353         IV score = scores[(x+1) * (y + 2) + (y + 1)];
 1354         dict_free(head);
 1355         Safefree(scores);
 1356         return (maxDistance != 0 && maxDistance < score)?(-1):score;
 1357     }
 1358 }
 1359 
 1360 /* END of edit_distance() stuff
 1361  * ========================================================= */
 1362 
 1363 /* is c a control character for which we have a mnemonic? */
 1364 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 1365 
 1366 STATIC const char *
 1367 S_cntrl_to_mnemonic(const U8 c)
 1368 {
 1369     /* Returns the mnemonic string that represents character 'c', if one
 1370      * exists; NULL otherwise.  The only ones that exist for the purposes of
 1371      * this routine are a few control characters */
 1372 
 1373     switch (c) {
 1374         case '\a':       return "\\a";
 1375         case '\b':       return "\\b";
 1376         case ESC_NATIVE: return "\\e";
 1377         case '\f':       return "\\f";
 1378         case '\n':       return "\\n";
 1379         case '\r':       return "\\r";
 1380         case '\t':       return "\\t";
 1381     }
 1382 
 1383     return NULL;
 1384 }
 1385 
 1386 /* Mark that we cannot extend a found fixed substring at this point.
 1387    Update the longest found anchored substring or the longest found
 1388    floating substrings if needed. */
 1389 
 1390 STATIC void
 1391 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 1392                     SSize_t *minlenp, int is_inf)
 1393 {
 1394     const STRLEN l = CHR_SVLEN(data->last_found);
 1395     SV * const longest_sv = data->substrs[data->cur_is_floating].str;
 1396     const STRLEN old_l = CHR_SVLEN(longest_sv);
 1397     GET_RE_DEBUG_FLAGS_DECL;
 1398 
 1399     PERL_ARGS_ASSERT_SCAN_COMMIT;
 1400 
 1401     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 1402         const U8 i = data->cur_is_floating;
 1403     SvSetMagicSV(longest_sv, data->last_found);
 1404         data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
 1405 
 1406     if (!i) /* fixed */
 1407         data->substrs[0].max_offset = data->substrs[0].min_offset;
 1408     else { /* float */
 1409         data->substrs[1].max_offset = (l
 1410                           ? data->last_start_max
 1411                           : (data->pos_delta > SSize_t_MAX - data->pos_min
 1412                      ? SSize_t_MAX
 1413                      : data->pos_min + data->pos_delta));
 1414         if (is_inf
 1415          || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
 1416         data->substrs[1].max_offset = SSize_t_MAX;
 1417         }
 1418 
 1419         if (data->flags & SF_BEFORE_EOL)
 1420             data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
 1421         else
 1422             data->substrs[i].flags &= ~SF_BEFORE_EOL;
 1423         data->substrs[i].minlenp = minlenp;
 1424         data->substrs[i].lookbehind = 0;
 1425     }
 1426 
 1427     SvCUR_set(data->last_found, 0);
 1428     {
 1429     SV * const sv = data->last_found;
 1430     if (SvUTF8(sv) && SvMAGICAL(sv)) {
 1431         MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 1432         if (mg)
 1433         mg->mg_len = 0;
 1434     }
 1435     }
 1436     data->last_end = -1;
 1437     data->flags &= ~SF_BEFORE_EOL;
 1438     DEBUG_STUDYDATA("commit", data, 0, is_inf);
 1439 }
 1440 
 1441 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 1442  * list that describes which code points it matches */
 1443 
 1444 STATIC void
 1445 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 1446 {
 1447     /* Set the SSC 'ssc' to match an empty string or any code point */
 1448 
 1449     PERL_ARGS_ASSERT_SSC_ANYTHING;
 1450 
 1451     assert(is_ANYOF_SYNTHETIC(ssc));
 1452 
 1453     /* mortalize so won't leak */
 1454     ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
 1455     ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING;  /* Plus matches empty */
 1456 }
 1457 
 1458 STATIC int
 1459 S_ssc_is_anything(const regnode_ssc *ssc)
 1460 {
 1461     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 1462      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 1463      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 1464      * in any way, so there's no point in using it */
 1465 
 1466     UV start, end;
 1467     bool ret;
 1468 
 1469     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 1470 
 1471     assert(is_ANYOF_SYNTHETIC(ssc));
 1472 
 1473     if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
 1474         return FALSE;
 1475     }
 1476 
 1477     /* See if the list consists solely of the range 0 - Infinity */
 1478     invlist_iterinit(ssc->invlist);
 1479     ret = invlist_iternext(ssc->invlist, &start, &end)
 1480           && start == 0
 1481           && end == UV_MAX;
 1482 
 1483     invlist_iterfinish(ssc->invlist);
 1484 
 1485     if (ret) {
 1486         return TRUE;
 1487     }
 1488 
 1489     /* If e.g., both \w and \W are set, matches everything */
 1490     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 1491         int i;
 1492         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 1493             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 1494                 return TRUE;
 1495             }
 1496         }
 1497     }
 1498 
 1499     return FALSE;
 1500 }
 1501 
 1502 STATIC void
 1503 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 1504 {
 1505     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 1506      * string, any code point, or any posix class under locale */
 1507 
 1508     PERL_ARGS_ASSERT_SSC_INIT;
 1509 
 1510     Zero(ssc, 1, regnode_ssc);
 1511     set_ANYOF_SYNTHETIC(ssc);
 1512     ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
 1513     ssc_anything(ssc);
 1514 
 1515     /* If any portion of the regex is to operate under locale rules that aren't
 1516      * fully known at compile time, initialization includes it.  The reason
 1517      * this isn't done for all regexes is that the optimizer was written under
 1518      * the assumption that locale was all-or-nothing.  Given the complexity and
 1519      * lack of documentation in the optimizer, and that there are inadequate
 1520      * test cases for locale, many parts of it may not work properly, it is
 1521      * safest to avoid locale unless necessary. */
 1522     if (RExC_contains_locale) {
 1523     ANYOF_POSIXL_SETALL(ssc);
 1524     }
 1525     else {
 1526     ANYOF_POSIXL_ZERO(ssc);
 1527     }
 1528 }
 1529 
 1530 STATIC int
 1531 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
 1532                         const regnode_ssc *ssc)
 1533 {
 1534     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 1535      * to the list of code points matched, and locale posix classes; hence does
 1536      * not check its flags) */
 1537 
 1538     UV start, end;
 1539     bool ret;
 1540 
 1541     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 1542 
 1543     assert(is_ANYOF_SYNTHETIC(ssc));
 1544 
 1545     invlist_iterinit(ssc->invlist);
 1546     ret = invlist_iternext(ssc->invlist, &start, &end)
 1547           && start == 0
 1548           && end == UV_MAX;
 1549 
 1550     invlist_iterfinish(ssc->invlist);
 1551 
 1552     if (! ret) {
 1553         return FALSE;
 1554     }
 1555 
 1556     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 1557         return FALSE;
 1558     }
 1559 
 1560     return TRUE;
 1561 }
 1562 
 1563 #define INVLIST_INDEX 0
 1564 #define ONLY_LOCALE_MATCHES_INDEX 1
 1565 #define DEFERRED_USER_DEFINED_INDEX 2
 1566 
 1567 STATIC SV*
 1568 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
 1569                                const regnode_charclass* const node)
 1570 {
 1571     /* Returns a mortal inversion list defining which code points are matched
 1572      * by 'node', which is of type ANYOF.  Handles complementing the result if
 1573      * appropriate.  If some code points aren't knowable at this time, the
 1574      * returned list must, and will, contain every code point that is a
 1575      * possibility. */
 1576 
 1577     dVAR;
 1578     SV* invlist = NULL;
 1579     SV* only_utf8_locale_invlist = NULL;
 1580     unsigned int i;
 1581     const U32 n = ARG(node);
 1582     bool new_node_has_latin1 = FALSE;
 1583     const U8 flags = OP(node) == ANYOFH ? 0 : ANYOF_FLAGS(node);
 1584 
 1585     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
 1586 
 1587     /* Look at the data structure created by S_set_ANYOF_arg() */
 1588     if (n != ANYOF_ONLY_HAS_BITMAP) {
 1589         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
 1590         AV * const av = MUTABLE_AV(SvRV(rv));
 1591         SV **const ary = AvARRAY(av);
 1592         assert(RExC_rxi->data->what[n] == 's');
 1593 
 1594         if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
 1595 
 1596             /* Here there are things that won't be known until runtime -- we
 1597              * have to assume it could be anything */
 1598             invlist = sv_2mortal(_new_invlist(1));
 1599             return _add_range_to_invlist(invlist, 0, UV_MAX);
 1600         }
 1601         else if (ary[INVLIST_INDEX]) {
 1602 
 1603             /* Use the node's inversion list */
 1604             invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
 1605         }
 1606 
 1607         /* Get the code points valid only under UTF-8 locales */
 1608         if (   (flags & ANYOFL_FOLD)
 1609             &&  av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
 1610         {
 1611             only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
 1612         }
 1613     }
 1614 
 1615     if (! invlist) {
 1616         invlist = sv_2mortal(_new_invlist(0));
 1617     }
 1618 
 1619     /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
 1620      * code points, and an inversion list for the others, but if there are code
 1621      * points that should match only conditionally on the target string being
 1622      * UTF-8, those are placed in the inversion list, and not the bitmap.
 1623      * Since there are circumstances under which they could match, they are
 1624      * included in the SSC.  But if the ANYOF node is to be inverted, we have
 1625      * to exclude them here, so that when we invert below, the end result
 1626      * actually does include them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We
 1627      * have to do this here before we add the unconditionally matched code
 1628      * points */
 1629     if (flags & ANYOF_INVERT) {
 1630         _invlist_intersection_complement_2nd(invlist,
 1631                                              PL_UpperLatin1,
 1632                                              &invlist);
 1633     }
 1634 
 1635     /* Add in the points from the bit map */
 1636     if (OP(node) != ANYOFH) {
 1637         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
 1638             if (ANYOF_BITMAP_TEST(node, i)) {
 1639                 unsigned int start = i++;
 1640 
 1641                 for (;    i < NUM_ANYOF_CODE_POINTS
 1642                        && ANYOF_BITMAP_TEST(node, i); ++i)
 1643                 {
 1644                     /* empty */
 1645                 }
 1646                 invlist = _add_range_to_invlist(invlist, start, i-1);
 1647                 new_node_has_latin1 = TRUE;
 1648             }
 1649         }
 1650     }
 1651 
 1652     /* If this can match all upper Latin1 code points, have to add them
 1653      * as well.  But don't add them if inverting, as when that gets done below,
 1654      * it would exclude all these characters, including the ones it shouldn't
 1655      * that were added just above */
 1656     if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
 1657         && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
 1658     {
 1659         _invlist_union(invlist, PL_UpperLatin1, &invlist);
 1660     }
 1661 
 1662     /* Similarly for these */
 1663     if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
 1664         _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
 1665     }
 1666 
 1667     if (flags & ANYOF_INVERT) {
 1668         _invlist_invert(invlist);
 1669     }
 1670     else if (flags & ANYOFL_FOLD) {
 1671         if (new_node_has_latin1) {
 1672 
 1673             /* Under /li, any 0-255 could fold to any other 0-255, depending on
 1674              * the locale.  We can skip this if there are no 0-255 at all. */
 1675             _invlist_union(invlist, PL_Latin1, &invlist);
 1676 
 1677             invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
 1678             invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
 1679         }
 1680         else {
 1681             if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
 1682                 invlist = add_cp_to_invlist(invlist, 'I');
 1683             }
 1684             if (_invlist_contains_cp(invlist,
 1685                                         LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
 1686             {
 1687                 invlist = add_cp_to_invlist(invlist, 'i');
 1688             }
 1689         }
 1690     }
 1691 
 1692     /* Similarly add the UTF-8 locale possible matches.  These have to be
 1693      * deferred until after the non-UTF-8 locale ones are taken care of just
 1694      * above, or it leads to wrong results under ANYOF_INVERT */
 1695     if (only_utf8_locale_invlist) {
 1696         _invlist_union_maybe_complement_2nd(invlist,
 1697                                             only_utf8_locale_invlist,
 1698                                             flags & ANYOF_INVERT,
 1699                                             &invlist);
 1700     }
 1701 
 1702     return invlist;
 1703 }
 1704 
 1705 /* These two functions currently do the exact same thing */
 1706 #define ssc_init_zero       ssc_init
 1707 
 1708 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
 1709 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
 1710 
 1711 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
 1712  * should not be inverted.  'and_with->flags & ANYOF_MATCHES_POSIXL' should be
 1713  * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
 1714 
 1715 STATIC void
 1716 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
 1717                 const regnode_charclass *and_with)
 1718 {
 1719     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
 1720      * another SSC or a regular ANYOF class.  Can create false positives. */
 1721 
 1722     SV* anded_cp_list;
 1723     U8  and_with_flags = (OP(and_with) == ANYOFH) ? 0 : ANYOF_FLAGS(and_with);
 1724     U8  anded_flags;
 1725 
 1726     PERL_ARGS_ASSERT_SSC_AND;
 1727 
 1728     assert(is_ANYOF_SYNTHETIC(ssc));
 1729 
 1730     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
 1731      * the code point inversion list and just the relevant flags */
 1732     if (is_ANYOF_SYNTHETIC(and_with)) {
 1733         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
 1734         anded_flags = and_with_flags;
 1735 
 1736         /* XXX This is a kludge around what appears to be deficiencies in the
 1737          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
 1738          * there are paths through the optimizer where it doesn't get weeded
 1739          * out when it should.  And if we don't make some extra provision for
 1740          * it like the code just below, it doesn't get added when it should.
 1741          * This solution is to add it only when AND'ing, which is here, and
 1742          * only when what is being AND'ed is the pristine, original node
 1743          * matching anything.  Thus it is like adding it to ssc_anything() but
 1744          * only when the result is to be AND'ed.  Probably the same solution
 1745          * could be adopted for the same problem we have with /l matching,
 1746          * which is solved differently in S_ssc_init(), and that would lead to
 1747          * fewer false positives than that solution has.  But if this solution
 1748          * creates bugs, the consequences are only that a warning isn't raised
 1749          * that should be; while the consequences for having /l bugs is
 1750          * incorrect matches */
 1751         if (ssc_is_anything((regnode_ssc *)and_with)) {
 1752             anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
 1753         }
 1754     }
 1755     else {
 1756         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
 1757         if (OP(and_with) == ANYOFD) {
 1758             anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
 1759         }
 1760         else {
 1761             anded_flags = and_with_flags
 1762             &( ANYOF_COMMON_FLAGS
 1763               |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
 1764               |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
 1765             if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
 1766                 anded_flags &=
 1767                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
 1768             }
 1769         }
 1770     }
 1771 
 1772     ANYOF_FLAGS(ssc) &= anded_flags;
 1773 
 1774     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
 1775      * C2 is the list of code points in 'and-with'; P2, its posix classes.
 1776      * 'and_with' may be inverted.  When not inverted, we have the situation of
 1777      * computing:
 1778      *  (C1 | P1) & (C2 | P2)
 1779      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
 1780      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
 1781      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
 1782      *                    <=  ((C1 & C2) | P1 | P2)
 1783      * Alternatively, the last few steps could be:
 1784      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
 1785      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
 1786      *                    <=  (C1 | C2 | (P1 & P2))
 1787      * We favor the second approach if either P1 or P2 is non-empty.  This is
 1788      * because these components are a barrier to doing optimizations, as what
 1789      * they match cannot be known until the moment of matching as they are
 1790      * dependent on the current locale, 'AND"ing them likely will reduce or
 1791      * eliminate them.
 1792      * But we can do better if we know that C1,P1 are in their initial state (a
 1793      * frequent occurrence), each matching everything:
 1794      *  (<everything>) & (C2 | P2) =  C2 | P2
 1795      * Similarly, if C2,P2 are in their initial state (again a frequent
 1796      * occurrence), the result is a no-op
 1797      *  (C1 | P1) & (<everything>) =  C1 | P1
 1798      *
 1799      * Inverted, we have
 1800      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
 1801      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
 1802      *                         <=  (C1 & ~C2) | (P1 & ~P2)
 1803      * */
 1804 
 1805     if ((and_with_flags & ANYOF_INVERT)
 1806         && ! is_ANYOF_SYNTHETIC(and_with))
 1807     {
 1808         unsigned int i;
 1809 
 1810         ssc_intersection(ssc,
 1811                          anded_cp_list,
 1812                          FALSE /* Has already been inverted */
 1813                          );
 1814 
 1815         /* If either P1 or P2 is empty, the intersection will be also; can skip
 1816          * the loop */
 1817         if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
 1818             ANYOF_POSIXL_ZERO(ssc);
 1819         }
 1820         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 1821 
 1822             /* Note that the Posix class component P from 'and_with' actually
 1823              * looks like:
 1824              *      P = Pa | Pb | ... | Pn
 1825              * where each component is one posix class, such as in [\w\s].
 1826              * Thus
 1827              *      ~P = ~(Pa | Pb | ... | Pn)
 1828              *         = ~Pa & ~Pb & ... & ~Pn
 1829              *        <= ~Pa | ~Pb | ... | ~Pn
 1830              * The last is something we can easily calculate, but unfortunately
 1831              * is likely to have many false positives.  We could do better
 1832              * in some (but certainly not all) instances if two classes in
 1833              * P have known relationships.  For example
 1834              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
 1835              * So
 1836              *      :lower: & :print: = :lower:
 1837              * And similarly for classes that must be disjoint.  For example,
 1838              * since \s and \w can have no elements in common based on rules in
 1839              * the POSIX standard,
 1840              *      \w & ^\S = nothing
 1841              * Unfortunately, some vendor locales do not meet the Posix
 1842              * standard, in particular almost everything by Microsoft.
 1843              * The loop below just changes e.g., \w into \W and vice versa */
 1844 
 1845             regnode_charclass_posixl temp;
 1846             int add = 1;    /* To calculate the index of the complement */
 1847 
 1848             Zero(&temp, 1, regnode_charclass_posixl);
 1849             ANYOF_POSIXL_ZERO(&temp);
 1850             for (i = 0; i < ANYOF_MAX; i++) {
 1851                 assert(i % 2 != 0
 1852                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
 1853                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
 1854 
 1855                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
 1856                     ANYOF_POSIXL_SET(&temp, i + add);
 1857                 }
 1858                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
 1859             }
 1860             ANYOF_POSIXL_AND(&temp, ssc);
 1861 
 1862         } /* else ssc already has no posixes */
 1863     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
 1864          in its initial state */
 1865     else if (! is_ANYOF_SYNTHETIC(and_with)
 1866              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
 1867     {
 1868         /* But if 'ssc' is in its initial state, the result is just 'and_with';
 1869          * copy it over 'ssc' */
 1870         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
 1871             if (is_ANYOF_SYNTHETIC(and_with)) {
 1872                 StructCopy(and_with, ssc, regnode_ssc);
 1873             }
 1874             else {
 1875                 ssc->invlist = anded_cp_list;
 1876                 ANYOF_POSIXL_ZERO(ssc);
 1877                 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
 1878                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
 1879                 }
 1880             }
 1881         }
 1882         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
 1883                  || (and_with_flags & ANYOF_MATCHES_POSIXL))
 1884         {
 1885             /* One or the other of P1, P2 is non-empty. */
 1886             if (and_with_flags & ANYOF_MATCHES_POSIXL) {
 1887                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
 1888             }
 1889             ssc_union(ssc, anded_cp_list, FALSE);
 1890         }
 1891         else { /* P1 = P2 = empty */
 1892             ssc_intersection(ssc, anded_cp_list, FALSE);
 1893         }
 1894     }
 1895 }
 1896 
 1897 STATIC void
 1898 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
 1899                const regnode_charclass *or_with)
 1900 {
 1901     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
 1902      * another SSC or a regular ANYOF class.  Can create false positives if
 1903      * 'or_with' is to be inverted. */
 1904 
 1905     SV* ored_cp_list;
 1906     U8 ored_flags;
 1907     U8  or_with_flags = (OP(or_with) == ANYOFH) ? 0 : ANYOF_FLAGS(or_with);
 1908 
 1909     PERL_ARGS_ASSERT_SSC_OR;
 1910 
 1911     assert(is_ANYOF_SYNTHETIC(ssc));
 1912 
 1913     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
 1914      * the code point inversion list and just the relevant flags */
 1915     if (is_ANYOF_SYNTHETIC(or_with)) {
 1916         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
 1917         ored_flags = or_with_flags;
 1918     }
 1919     else {
 1920         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
 1921         ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
 1922         if (OP(or_with) != ANYOFD) {
 1923             ored_flags
 1924             |= or_with_flags
 1925              & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
 1926                 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
 1927             if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
 1928                 ored_flags |=
 1929                     ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
 1930             }
 1931         }
 1932     }
 1933 
 1934     ANYOF_FLAGS(ssc) |= ored_flags;
 1935 
 1936     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
 1937      * C2 is the list of code points in 'or-with'; P2, its posix classes.
 1938      * 'or_with' may be inverted.  When not inverted, we have the simple
 1939      * situation of computing:
 1940      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
 1941      * If P1|P2 yields a situation with both a class and its complement are
 1942      * set, like having both \w and \W, this matches all code points, and we
 1943      * can delete these from the P component of the ssc going forward.  XXX We
 1944      * might be able to delete all the P components, but I (khw) am not certain
 1945      * about this, and it is better to be safe.
 1946      *
 1947      * Inverted, we have
 1948      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
 1949      *                         <=  (C1 | P1) | ~C2
 1950      *                         <=  (C1 | ~C2) | P1
 1951      * (which results in actually simpler code than the non-inverted case)
 1952      * */
 1953 
 1954     if ((or_with_flags & ANYOF_INVERT)
 1955         && ! is_ANYOF_SYNTHETIC(or_with))
 1956     {
 1957         /* We ignore P2, leaving P1 going forward */
 1958     }   /* else  Not inverted */
 1959     else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
 1960         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
 1961         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 1962             unsigned int i;
 1963             for (i = 0; i < ANYOF_MAX; i += 2) {
 1964                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
 1965                 {
 1966                     ssc_match_all_cp(ssc);
 1967                     ANYOF_POSIXL_CLEAR(ssc, i);
 1968                     ANYOF_POSIXL_CLEAR(ssc, i+1);
 1969                 }
 1970             }
 1971         }
 1972     }
 1973 
 1974     ssc_union(ssc,
 1975               ored_cp_list,
 1976               FALSE /* Already has been inverted */
 1977               );
 1978 }
 1979 
 1980 PERL_STATIC_INLINE void
 1981 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
 1982 {
 1983     PERL_ARGS_ASSERT_SSC_UNION;
 1984 
 1985     assert(is_ANYOF_SYNTHETIC(ssc));
 1986 
 1987     _invlist_union_maybe_complement_2nd(ssc->invlist,
 1988                                         invlist,
 1989                                         invert2nd,
 1990                                         &ssc->invlist);
 1991 }
 1992 
 1993 PERL_STATIC_INLINE void
 1994 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
 1995                          SV* const invlist,
 1996                          const bool invert2nd)
 1997 {
 1998     PERL_ARGS_ASSERT_SSC_INTERSECTION;
 1999 
 2000     assert(is_ANYOF_SYNTHETIC(ssc));
 2001 
 2002     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
 2003                                                invlist,
 2004                                                invert2nd,
 2005                                                &ssc->invlist);
 2006 }
 2007 
 2008 PERL_STATIC_INLINE void
 2009 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
 2010 {
 2011     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
 2012 
 2013     assert(is_ANYOF_SYNTHETIC(ssc));
 2014 
 2015     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
 2016 }
 2017 
 2018 PERL_STATIC_INLINE void
 2019 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
 2020 {
 2021     /* AND just the single code point 'cp' into the SSC 'ssc' */
 2022 
 2023     SV* cp_list = _new_invlist(2);
 2024 
 2025     PERL_ARGS_ASSERT_SSC_CP_AND;
 2026 
 2027     assert(is_ANYOF_SYNTHETIC(ssc));
 2028 
 2029     cp_list = add_cp_to_invlist(cp_list, cp);
 2030     ssc_intersection(ssc, cp_list,
 2031                      FALSE /* Not inverted */
 2032                      );
 2033     SvREFCNT_dec_NN(cp_list);
 2034 }
 2035 
 2036 PERL_STATIC_INLINE void
 2037 S_ssc_clear_locale(regnode_ssc *ssc)
 2038 {
 2039     /* Set the SSC 'ssc' to not match any locale things */
 2040     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
 2041 
 2042     assert(is_ANYOF_SYNTHETIC(ssc));
 2043 
 2044     ANYOF_POSIXL_ZERO(ssc);
 2045     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
 2046 }
 2047 
 2048 #define NON_OTHER_COUNT   NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
 2049 
 2050 STATIC bool
 2051 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
 2052 {
 2053     /* The synthetic start class is used to hopefully quickly winnow down
 2054      * places where a pattern could start a match in the target string.  If it
 2055      * doesn't really narrow things down that much, there isn't much point to
 2056      * having the overhead of using it.  This function uses some very crude
 2057      * heuristics to decide if to use the ssc or not.
 2058      *
 2059      * It returns TRUE if 'ssc' rules out more than half what it considers to
 2060      * be the "likely" possible matches, but of course it doesn't know what the
 2061      * actual things being matched are going to be; these are only guesses
 2062      *
 2063      * For /l matches, it assumes that the only likely matches are going to be
 2064      *      in the 0-255 range, uniformly distributed, so half of that is 127
 2065      * For /a and /d matches, it assumes that the likely matches will be just
 2066      *      the ASCII range, so half of that is 63
 2067      * For /u and there isn't anything matching above the Latin1 range, it
 2068      *      assumes that that is the only range likely to be matched, and uses
 2069      *      half that as the cut-off: 127.  If anything matches above Latin1,
 2070      *      it assumes that all of Unicode could match (uniformly), except for
 2071      *      non-Unicode code points and things in the General Category "Other"
 2072      *      (unassigned, private use, surrogates, controls and formats).  This
 2073      *      is a much large number. */
 2074 
 2075     U32 count = 0;      /* Running total of number of code points matched by
 2076                            'ssc' */
 2077     UV start, end;      /* Start and end points of current range in inversion
 2078                            XXX outdated.  UTF-8 locales are common, what about invert? list */
 2079     const U32 max_code_points = (LOC)
 2080                                 ?  256
 2081                                 : ((  ! UNI_SEMANTICS
 2082                                     ||  invlist_highest(ssc->invlist) < 256)
 2083                                   ? 128
 2084                                   : NON_OTHER_COUNT);
 2085     const U32 max_match = max_code_points / 2;
 2086 
 2087     PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
 2088 
 2089     invlist_iterinit(ssc->invlist);
 2090     while (invlist_iternext(ssc->invlist, &start, &end)) {
 2091         if (start >= max_code_points) {
 2092             break;
 2093         }
 2094         end = MIN(end, max_code_points - 1);
 2095         count += end - start + 1;
 2096         if (count >= max_match) {
 2097             invlist_iterfinish(ssc->invlist);
 2098             return FALSE;
 2099         }
 2100     }
 2101 
 2102     return TRUE;
 2103 }
 2104 
 2105 
 2106 STATIC void
 2107 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
 2108 {
 2109     /* The inversion list in the SSC is marked mortal; now we need a more
 2110      * permanent copy, which is stored the same way that is done in a regular
 2111      * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
 2112      * map */
 2113 
 2114     SV* invlist = invlist_clone(ssc->invlist, NULL);
 2115 
 2116     PERL_ARGS_ASSERT_SSC_FINALIZE;
 2117 
 2118     assert(is_ANYOF_SYNTHETIC(ssc));
 2119 
 2120     /* The code in this file assumes that all but these flags aren't relevant
 2121      * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
 2122      * by the time we reach here */
 2123     assert(! (ANYOF_FLAGS(ssc)
 2124         & ~( ANYOF_COMMON_FLAGS
 2125             |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
 2126             |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
 2127 
 2128     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
 2129 
 2130     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
 2131 
 2132     /* Make sure is clone-safe */
 2133     ssc->invlist = NULL;
 2134 
 2135     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 2136         ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
 2137         OP(ssc) = ANYOFPOSIXL;
 2138     }
 2139     else if (RExC_contains_locale) {
 2140         OP(ssc) = ANYOFL;
 2141     }
 2142 
 2143     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
 2144 }
 2145 
 2146 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 2147 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 2148 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 2149 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
 2150                                ? (TRIE_LIST_CUR( idx ) - 1)           \
 2151                                : 0 )
 2152 
 2153 
 2154 #ifdef DEBUGGING
 2155 /*
 2156    dump_trie(trie,widecharmap,revcharmap)
 2157    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 2158    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 2159 
 2160    These routines dump out a trie in a somewhat readable format.
 2161    The _interim_ variants are used for debugging the interim
 2162    tables that are used to generate the final compressed
 2163    representation which is what dump_trie expects.
 2164 
 2165    Part of the reason for their existence is to provide a form
 2166    of documentation as to how the different representations function.
 2167 
 2168 */
 2169 
 2170 /*
 2171   Dumps the final compressed table form of the trie to Perl_debug_log.
 2172   Used for debugging make_trie().
 2173 */
 2174 
 2175 STATIC void
 2176 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
 2177         AV *revcharmap, U32 depth)
 2178 {
 2179     U32 state;
 2180     SV *sv=sv_newmortal();
 2181     int colwidth= widecharmap ? 6 : 4;
 2182     U16 word;
 2183     GET_RE_DEBUG_FLAGS_DECL;
 2184 
 2185     PERL_ARGS_ASSERT_DUMP_TRIE;
 2186 
 2187     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
 2188         depth+1, "Match","Base","Ofs" );
 2189 
 2190     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
 2191     SV ** const tmp = av_fetch( revcharmap, state, 0);
 2192         if ( tmp ) {
 2193             Perl_re_printf( aTHX_  "%*s",
 2194                 colwidth,
 2195                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 2196                         PL_colors[0], PL_colors[1],
 2197                         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 2198                         PERL_PV_ESCAPE_FIRSTCHAR
 2199                 )
 2200             );
 2201         }
 2202     }
 2203     Perl_re_printf( aTHX_  "\n");
 2204     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
 2205 
 2206     for( state = 0 ; state < trie->uniquecharcount ; state++ )
 2207         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
 2208     Perl_re_printf( aTHX_  "\n");
 2209 
 2210     for( state = 1 ; state < trie->statecount ; state++ ) {
 2211     const U32 base = trie->states[ state ].trans.base;
 2212 
 2213         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
 2214 
 2215         if ( trie->states[ state ].wordnum ) {
 2216             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
 2217         } else {
 2218             Perl_re_printf( aTHX_  "%6s", "" );
 2219         }
 2220 
 2221         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
 2222 
 2223         if ( base ) {
 2224             U32 ofs = 0;
 2225 
 2226             while( ( base + ofs  < trie->uniquecharcount ) ||
 2227                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
 2228                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
 2229                                                                     != state))
 2230                     ofs++;
 2231 
 2232             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
 2233 
 2234             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 2235                 if ( ( base + ofs >= trie->uniquecharcount )
 2236                         && ( base + ofs - trie->uniquecharcount
 2237                                                         < trie->lasttrans )
 2238                         && trie->trans[ base + ofs
 2239                                     - trie->uniquecharcount ].check == state )
 2240                 {
 2241                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
 2242                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
 2243                    );
 2244                 } else {
 2245                     Perl_re_printf( aTHX_  "%*s", colwidth,"   ." );
 2246                 }
 2247             }
 2248 
 2249             Perl_re_printf( aTHX_  "]");
 2250 
 2251         }
 2252         Perl_re_printf( aTHX_  "\n" );
 2253     }
 2254     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
 2255                                 depth);
 2256     for (word=1; word <= trie->wordcount; word++) {
 2257         Perl_re_printf( aTHX_  " %d:(%d,%d)",
 2258         (int)word, (int)(trie->wordinfo[word].prev),
 2259         (int)(trie->wordinfo[word].len));
 2260     }
 2261     Perl_re_printf( aTHX_  "\n" );
 2262 }
 2263 /*
 2264   Dumps a fully constructed but uncompressed trie in list form.
 2265   List tries normally only are used for construction when the number of
 2266   possible chars (trie->uniquecharcount) is very high.
 2267   Used for debugging make_trie().
 2268 */
 2269 STATIC void
 2270 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 2271              HV *widecharmap, AV *revcharmap, U32 next_alloc,
 2272              U32 depth)
 2273 {
 2274     U32 state;
 2275     SV *sv=sv_newmortal();
 2276     int colwidth= widecharmap ? 6 : 4;
 2277     GET_RE_DEBUG_FLAGS_DECL;
 2278 
 2279     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
 2280 
 2281     /* print out the table precompression.  */
 2282     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
 2283             depth+1 );
 2284     Perl_re_indentf( aTHX_  "%s",
 2285             depth+1, "------:-----+-----------------\n" );
 2286 
 2287     for( state=1 ; state < next_alloc ; state ++ ) {
 2288         U16 charid;
 2289 
 2290         Perl_re_indentf( aTHX_  " %4" UVXf " :",
 2291             depth+1, (UV)state  );
 2292         if ( ! trie->states[ state ].wordnum ) {
 2293             Perl_re_printf( aTHX_  "%5s| ","");
 2294         } else {
 2295             Perl_re_printf( aTHX_  "W%4x| ",
 2296                 trie->states[ state ].wordnum
 2297             );
 2298         }
 2299         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
 2300         SV ** const tmp = av_fetch( revcharmap,
 2301                                         TRIE_LIST_ITEM(state, charid).forid, 0);
 2302         if ( tmp ) {
 2303                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
 2304                     colwidth,
 2305                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
 2306                               colwidth,
 2307                               PL_colors[0], PL_colors[1],
 2308                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
 2309                               | PERL_PV_ESCAPE_FIRSTCHAR
 2310                     ) ,
 2311                     TRIE_LIST_ITEM(state, charid).forid,
 2312                     (UV)TRIE_LIST_ITEM(state, charid).newstate
 2313                 );
 2314                 if (!(charid % 10))
 2315                     Perl_re_printf( aTHX_  "\n%*s| ",
 2316                         (int)((depth * 2) + 14), "");
 2317             }
 2318         }
 2319         Perl_re_printf( aTHX_  "\n");
 2320     }
 2321 }
 2322 
 2323 /*
 2324   Dumps a fully constructed but uncompressed trie in table form.
 2325   This is the normal DFA style state transition table, with a few
 2326   twists to facilitate compression later.
 2327   Used for debugging make_trie().
 2328 */
 2329 STATIC void
 2330 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
 2331               HV *widecharmap, AV *revcharmap, U32 next_alloc,
 2332               U32 depth)
 2333 {
 2334     U32 state;
 2335     U16 charid;
 2336     SV *sv=sv_newmortal();
 2337     int colwidth= widecharmap ? 6 : 4;
 2338     GET_RE_DEBUG_FLAGS_DECL;
 2339 
 2340     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
 2341 
 2342     /*
 2343        print out the table precompression so that we can do a visual check
 2344        that they are identical.
 2345      */
 2346 
 2347     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
 2348 
 2349     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
 2350     SV ** const tmp = av_fetch( revcharmap, charid, 0);
 2351         if ( tmp ) {
 2352             Perl_re_printf( aTHX_  "%*s",
 2353                 colwidth,
 2354                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
 2355                         PL_colors[0], PL_colors[1],
 2356                         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 2357                         PERL_PV_ESCAPE_FIRSTCHAR
 2358                 )
 2359             );
 2360         }
 2361     }
 2362 
 2363     Perl_re_printf( aTHX_ "\n");
 2364     Perl_re_indentf( aTHX_  "State+-", depth+1 );
 2365 
 2366     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
 2367         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
 2368     }
 2369 
 2370     Perl_re_printf( aTHX_  "\n" );
 2371 
 2372     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
 2373 
 2374         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
 2375             depth+1,
 2376             (UV)TRIE_NODENUM( state ) );
 2377 
 2378         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
 2379             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
 2380             if (v)
 2381                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
 2382             else
 2383                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
 2384         }
 2385         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
 2386             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
 2387                                             (UV)trie->trans[ state ].check );
 2388         } else {
 2389             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
 2390                                             (UV)trie->trans[ state ].check,
 2391             trie->states[ TRIE_NODENUM( state ) ].wordnum );
 2392         }
 2393     }
 2394 }
 2395 
 2396 #endif
 2397 
 2398 
 2399 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
 2400   startbranch: the first branch in the whole branch sequence
 2401   first      : start branch of sequence of branch-exact nodes.
 2402            May be the same as startbranch
 2403   last       : Thing following the last branch.
 2404            May be the same as tail.
 2405   tail       : item following the branch sequence
 2406   count      : words in the sequence
 2407   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
 2408   depth      : indent depth
 2409 
 2410 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
 2411 
 2412 A trie is an N'ary tree where the branches are determined by digital
 2413 decomposition of the key. IE, at the root node you look up the 1st character and
 2414 follow that branch repeat until you find the end of the branches. Nodes can be
 2415 marked as "accepting" meaning they represent a complete word. Eg:
 2416 
 2417   /he|she|his|hers/
 2418 
 2419 would convert into the following structure. Numbers represent states, letters
 2420 following numbers represent valid transitions on the letter from that state, if
 2421 the number is in square brackets it represents an accepting state, otherwise it
 2422 will be in parenthesis.
 2423 
 2424       +-h->+-e->[3]-+-r->(8)-+-s->[9]
 2425       |    |
 2426       |   (2)
 2427       |    |
 2428      (1)   +-i->(6)-+-s->[7]
 2429       |
 2430       +-s->(3)-+-h->(4)-+-e->[5]
 2431 
 2432       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
 2433 
 2434 This shows that when matching against the string 'hers' we will begin at state 1
 2435 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
 2436 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
 2437 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
 2438 single traverse. We store a mapping from accepting to state to which word was
 2439 matched, and then when we have multiple possibilities we try to complete the
 2440 rest of the regex in the order in which they occurred in the alternation.
 2441 
 2442 The only prior NFA like behaviour that would be changed by the TRIE support is
 2443 the silent ignoring of duplicate alternations which are of the form:
 2444 
 2445  / (DUPE|DUPE) X? (?{ ... }) Y /x
 2446 
 2447 Thus EVAL blocks following a trie may be called a different number of times with
 2448 and without the optimisation. With the optimisations dupes will be silently
 2449 ignored. This inconsistent behaviour of EVAL type nodes is well established as
 2450 the following demonstrates:
 2451 
 2452  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
 2453 
 2454 which prints out 'word' three times, but
 2455 
 2456  'words'=~/(word|word|word)(?{ print $1 })S/
 2457 
 2458 which doesnt print it out at all. This is due to other optimisations kicking in.
 2459 
 2460 Example of what happens on a structural level:
 2461 
 2462 The regexp /(ac|ad|ab)+/ will produce the following debug output:
 2463 
 2464    1: CURLYM[1] {1,32767}(18)
 2465    5:   BRANCH(8)
 2466    6:     EXACT <ac>(16)
 2467    8:   BRANCH(11)
 2468    9:     EXACT <ad>(16)
 2469   11:   BRANCH(14)
 2470   12:     EXACT <ab>(16)
 2471   16:   SUCCEED(0)
 2472   17:   NOTHING(18)
 2473   18: END(0)
 2474 
 2475 This would be optimizable with startbranch=5, first=5, last=16, tail=16
 2476 and should turn into:
 2477 
 2478    1: CURLYM[1] {1,32767}(18)
 2479    5:   TRIE(16)
 2480     [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
 2481       <ac>
 2482       <ad>
 2483       <ab>
 2484   16:   SUCCEED(0)
 2485   17:   NOTHING(18)
 2486   18: END(0)
 2487 
 2488 Cases where tail != last would be like /(?foo|bar)baz/:
 2489 
 2490    1: BRANCH(4)
 2491    2:   EXACT <foo>(8)
 2492    4: BRANCH(7)
 2493    5:   EXACT <bar>(8)
 2494    7: TAIL(8)
 2495    8: EXACT <baz>(10)
 2496   10: END(0)
 2497 
 2498 which would be optimizable with startbranch=1, first=1, last=7, tail=8
 2499 and would end up looking like:
 2500 
 2501     1: TRIE(8)
 2502       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
 2503     <foo>
 2504     <bar>
 2505    7: TAIL(8)
 2506    8: EXACT <baz>(10)
 2507   10: END(0)
 2508 
 2509     d = uvchr_to_utf8_flags(d, uv, 0);
 2510 
 2511 is the recommended Unicode-aware way of saying
 2512 
 2513     *(d++) = uv;
 2514 */
 2515 
 2516 #define TRIE_STORE_REVCHAR(val)                                            \
 2517     STMT_START {                                                           \
 2518     if (UTF) {                             \
 2519             SV *zlopp = newSV(UTF8_MAXBYTES);                  \
 2520         unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
 2521             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
 2522         SvCUR_set(zlopp, kapow - flrbbbbb);                \
 2523         SvPOK_on(zlopp);                           \
 2524         SvUTF8_on(zlopp);                          \
 2525         av_push(revcharmap, zlopp);                    \
 2526     } else {                               \
 2527             char ooooff = (char)val;                                           \
 2528         av_push(revcharmap, newSVpvn(&ooooff, 1));             \
 2529     }                                  \
 2530         } STMT_END
 2531 
 2532 /* This gets the next character from the input, folding it if not already
 2533  * folded. */
 2534 #define TRIE_READ_CHAR STMT_START {                                           \
 2535     wordlen++;                                                                \
 2536     if ( UTF ) {                                                              \
 2537         /* if it is UTF then it is either already folded, or does not need    \
 2538          * folding */                                                         \
 2539         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
 2540     }                                                                         \
 2541     else if (folder == PL_fold_latin1) {                                      \
 2542         /* This folder implies Unicode rules, which in the range expressible  \
 2543          *  by not UTF is the lower case, with the two exceptions, one of     \
 2544          *  which should have been taken care of before calling this */       \
 2545         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
 2546         uvc = toLOWER_L1(*uc);                                                \
 2547         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
 2548         len = 1;                                                              \
 2549     } else {                                                                  \
 2550         /* raw data, will be folded later if needed */                        \
 2551         uvc = (U32)*uc;                                                       \
 2552         len = 1;                                                              \
 2553     }                                                                         \
 2554 } STMT_END
 2555 
 2556 
 2557 
 2558 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
 2559     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
 2560     U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
 2561     Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
 2562         TRIE_LIST_LEN( state ) = ging;                          \
 2563     }                                                           \
 2564     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
 2565     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
 2566     TRIE_LIST_CUR( state )++;                                   \
 2567 } STMT_END
 2568 
 2569 #define TRIE_LIST_NEW(state) STMT_START {                       \
 2570     Newx( trie->states[ state ].trans.list,                     \
 2571     4, reg_trie_trans_le );                                 \
 2572      TRIE_LIST_CUR( state ) = 1;                                \
 2573      TRIE_LIST_LEN( state ) = 4;                                \
 2574 } STMT_END
 2575 
 2576 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
 2577     U16 dupe= trie->states[ state ].wordnum;                    \
 2578     regnode * const noper_next = regnext( noper );              \
 2579                                                                 \
 2580     DEBUG_r({                                                   \
 2581         /* store the word for dumping */                        \
 2582         SV* tmp;                                                \
 2583         if (OP(noper) != NOTHING)                               \
 2584             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
 2585         else                                                    \
 2586             tmp = newSVpvn_utf8( "", 0, UTF );          \
 2587         av_push( trie_words, tmp );                             \
 2588     });                                                         \
 2589                                                                 \
 2590     curword++;                                                  \
 2591     trie->wordinfo[curword].prev   = 0;                         \
 2592     trie->wordinfo[curword].len    = wordlen;                   \
 2593     trie->wordinfo[curword].accept = state;                     \
 2594                                                                 \
 2595     if ( noper_next < tail ) {                                  \
 2596         if (!trie->jump)                                        \
 2597             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
 2598                                                  sizeof(U16) ); \
 2599         trie->jump[curword] = (U16)(noper_next - convert);      \
 2600         if (!jumper)                                            \
 2601             jumper = noper_next;                                \
 2602         if (!nextbranch)                                        \
 2603             nextbranch= regnext(cur);                           \
 2604     }                                                           \
 2605                                                                 \
 2606     if ( dupe ) {                                               \
 2607         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
 2608         /* chain, so that when the bits of chain are later    */\
 2609         /* linked together, the dups appear in the chain      */\
 2610     trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
 2611     trie->wordinfo[dupe].prev = curword;                    \
 2612     } else {                                                    \
 2613         /* we haven't inserted this word yet.                */ \
 2614         trie->states[ state ].wordnum = curword;                \
 2615     }                                                           \
 2616 } STMT_END
 2617 
 2618 
 2619 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)      \
 2620      ( ( base + charid >=  ucharcount                   \
 2621          && base + charid < ubound                  \
 2622          && state == trie->trans[ base - ucharcount + charid ].check    \
 2623          && trie->trans[ base - ucharcount + charid ].next )        \
 2624            ? trie->trans[ base - ucharcount + charid ].next     \
 2625            : ( state==1 ? special : 0 )                 \
 2626       )
 2627 
 2628 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder)           \
 2629 STMT_START {                                                \
 2630     TRIE_BITMAP_SET(trie, uvc);                             \
 2631     /* store the folded codepoint */                        \
 2632     if ( folder )                                           \
 2633         TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);           \
 2634                                                             \
 2635     if ( !UTF ) {                                           \
 2636         /* store first byte of utf8 representation of */    \
 2637         /* variant codepoints */                            \
 2638         if (! UVCHR_IS_INVARIANT(uvc)) {                    \
 2639             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));   \
 2640         }                                                   \
 2641     }                                                       \
 2642 } STMT_END
 2643 #define MADE_TRIE       1
 2644 #define MADE_JUMP_TRIE  2
 2645 #define MADE_EXACT_TRIE 4
 2646 
 2647 STATIC I32
 2648 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
 2649                   regnode *first, regnode *last, regnode *tail,
 2650                   U32 word_count, U32 flags, U32 depth)
 2651 {
 2652     /* first pass, loop through and scan words */
 2653     reg_trie_data *trie;
 2654     HV *widecharmap = NULL;
 2655     AV *revcharmap = newAV();
 2656     regnode *cur;
 2657     STRLEN len = 0;
 2658     UV uvc = 0;
 2659     U16 curword = 0;
 2660     U32 next_alloc = 0;
 2661     regnode *jumper = NULL;
 2662     regnode *nextbranch = NULL;
 2663     regnode *convert = NULL;
 2664     U32 *prev_states; /* temp array mapping each state to previous one */
 2665     /* we just use folder as a flag in utf8 */
 2666     const U8 * folder = NULL;
 2667 
 2668     /* in the below add_data call we are storing either 'tu' or 'tuaa'
 2669      * which stands for one trie structure, one hash, optionally followed
 2670      * by two arrays */
 2671 #ifdef DEBUGGING
 2672     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
 2673     AV *trie_words = NULL;
 2674     /* along with revcharmap, this only used during construction but both are
 2675      * useful during debugging so we store them in the struct when debugging.
 2676      */
 2677 #else
 2678     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
 2679     STRLEN trie_charcount=0;
 2680 #endif
 2681     SV *re_trie_maxbuff;
 2682     GET_RE_DEBUG_FLAGS_DECL;
 2683 
 2684     PERL_ARGS_ASSERT_MAKE_TRIE;
 2685 #ifndef DEBUGGING
 2686     PERL_UNUSED_ARG(depth);
 2687 #endif
 2688 
 2689     switch (flags) {
 2690         case EXACT: case EXACT_ONLY8: case EXACTL: break;
 2691     case EXACTFAA:
 2692         case EXACTFUP:
 2693     case EXACTFU:
 2694     case EXACTFLU8: folder = PL_fold_latin1; break;
 2695     case EXACTF:  folder = PL_fold; break;
 2696         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
 2697     }
 2698 
 2699     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
 2700     trie->refcount = 1;
 2701     trie->startstate = 1;
 2702     trie->wordcount = word_count;
 2703     RExC_rxi->data->data[ data_slot ] = (void*)trie;
 2704     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
 2705     if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
 2706     trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
 2707     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
 2708                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
 2709 
 2710     DEBUG_r({
 2711         trie_words = newAV();
 2712     });
 2713 
 2714     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
 2715     assert(re_trie_maxbuff);
 2716     if (!SvIOK(re_trie_maxbuff)) {
 2717         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
 2718     }
 2719     DEBUG_TRIE_COMPILE_r({
 2720         Perl_re_indentf( aTHX_
 2721           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
 2722           depth+1,
 2723           REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
 2724           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
 2725     });
 2726 
 2727    /* Find the node we are going to overwrite */
 2728     if ( first == startbranch && OP( last ) != BRANCH ) {
 2729         /* whole branch chain */
 2730         convert = first;
 2731     } else {
 2732         /* branch sub-chain */
 2733         convert = NEXTOPER( first );
 2734     }
 2735 
 2736     /*  -- First loop and Setup --
 2737 
 2738        We first traverse the branches and scan each word to determine if it
 2739        contains widechars, and how many unique chars there are, this is
 2740        important as we have to build a table with at least as many columns as we
 2741        have unique chars.
 2742 
 2743        We use an array of integers to represent the character codes 0..255
 2744        (trie->charmap) and we use a an HV* to store Unicode characters. We use
 2745        the native representation of the character value as the key and IV's for
 2746        the coded index.
 2747 
 2748        *TODO* If we keep track of how many times each character is used we can
 2749        remap the columns so that the table compression later on is more
 2750        efficient in terms of memory by ensuring the most common value is in the
 2751        middle and the least common are on the outside.  IMO this would be better
 2752        than a most to least common mapping as theres a decent chance the most
 2753        common letter will share a node with the least common, meaning the node
 2754        will not be compressible. With a middle is most common approach the worst
 2755        case is when we have the least common nodes twice.
 2756 
 2757      */
 2758 
 2759     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
 2760         regnode *noper = NEXTOPER( cur );
 2761         const U8 *uc;
 2762         const U8 *e;
 2763         int foldlen = 0;
 2764         U32 wordlen      = 0;         /* required init */
 2765         STRLEN minchars = 0;
 2766         STRLEN maxchars = 0;
 2767         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
 2768                                                bitmap?*/
 2769 
 2770         if (OP(noper) == NOTHING) {
 2771             /* skip past a NOTHING at the start of an alternation
 2772              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
 2773              *
 2774              * If the next node is not something we are supposed to process
 2775              * we will just ignore it due to the condition guarding the
 2776              * next block.
 2777              */
 2778 
 2779             regnode *noper_next= regnext(noper);
 2780             if (noper_next < tail)
 2781                 noper= noper_next;
 2782         }
 2783 
 2784         if (    noper < tail
 2785             && (    OP(noper) == flags
 2786                 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
 2787                 || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
 2788                                          || OP(noper) == EXACTFUP))))
 2789         {
 2790             uc= (U8*)STRING(noper);
 2791             e= uc + STR_LEN(noper);
 2792         } else {
 2793             trie->minlen= 0;
 2794             continue;
 2795         }
 2796 
 2797 
 2798         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
 2799             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
 2800                                           regardless of encoding */
 2801             if (OP( noper ) == EXACTFUP) {
 2802                 /* false positives are ok, so just set this */
 2803                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
 2804             }
 2805         }
 2806 
 2807         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
 2808                                            branch */
 2809             TRIE_CHARCOUNT(trie)++;
 2810             TRIE_READ_CHAR;
 2811 
 2812             /* TRIE_READ_CHAR returns the current character, or its fold if /i
 2813              * is in effect.  Under /i, this character can match itself, or
 2814              * anything that folds to it.  If not under /i, it can match just
 2815              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
 2816              * all fold to k, and all are single characters.   But some folds
 2817              * expand to more than one character, so for example LATIN SMALL
 2818              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
 2819              * the string beginning at 'uc' is 'ffi', it could be matched by
 2820              * three characters, or just by the one ligature character. (It
 2821              * could also be matched by two characters: LATIN SMALL LIGATURE FF
 2822              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
 2823              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
 2824              * match.)  The trie needs to know the minimum and maximum number
 2825              * of characters that could match so that it can use size alone to
 2826              * quickly reject many match attempts.  The max is simple: it is
 2827              * the number of folded characters in this branch (since a fold is
 2828              * never shorter than what folds to it. */
 2829 
 2830             maxchars++;
 2831 
 2832             /* And the min is equal to the max if not under /i (indicated by
 2833              * 'folder' being NULL), or there are no multi-character folds.  If
 2834              * there is a multi-character fold, the min is incremented just
 2835              * once, for the character that folds to the sequence.  Each
 2836              * character in the sequence needs to be added to the list below of
 2837              * characters in the trie, but we count only the first towards the
 2838              * min number of characters needed.  This is done through the
 2839              * variable 'foldlen', which is returned by the macros that look
 2840              * for these sequences as the number of bytes the sequence
 2841              * occupies.  Each time through the loop, we decrement 'foldlen' by
 2842              * how many bytes the current char occupies.  Only when it reaches
 2843              * 0 do we increment 'minchars' or look for another multi-character
 2844              * sequence. */
 2845             if (folder == NULL) {
 2846                 minchars++;
 2847             }
 2848             else if (foldlen > 0) {
 2849                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
 2850             }
 2851             else {
 2852                 minchars++;
 2853 
 2854                 /* See if *uc is the beginning of a multi-character fold.  If
 2855                  * so, we decrement the length remaining to look at, to account
 2856                  * for the current character this iteration.  (We can use 'uc'
 2857                  * instead of the fold returned by TRIE_READ_CHAR because for
 2858                  * non-UTF, the latin1_safe macro is smart enough to account
 2859                  * for all the unfolded characters, and because for UTF, the
 2860                  * string will already have been folded earlier in the
 2861                  * compilation process */
 2862                 if (UTF) {
 2863                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
 2864                         foldlen -= UTF8SKIP(uc);
 2865                     }
 2866                 }
 2867                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
 2868                     foldlen--;
 2869                 }
 2870             }
 2871 
 2872             /* The current character (and any potential folds) should be added
 2873              * to the possible matching characters for this position in this
 2874              * branch */
 2875             if ( uvc < 256 ) {
 2876                 if ( folder ) {
 2877                     U8 folded= folder[ (U8) uvc ];
 2878                     if ( !trie->charmap[ folded ] ) {
 2879                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
 2880                         TRIE_STORE_REVCHAR( folded );
 2881                     }
 2882                 }
 2883                 if ( !trie->charmap[ uvc ] ) {
 2884                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
 2885                     TRIE_STORE_REVCHAR( uvc );
 2886                 }
 2887                 if ( set_bit ) {
 2888             /* store the codepoint in the bitmap, and its folded
 2889              * equivalent. */
 2890                     TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
 2891                     set_bit = 0; /* We've done our bit :-) */
 2892                 }
 2893             } else {
 2894 
 2895                 /* XXX We could come up with the list of code points that fold
 2896                  * to this using PL_utf8_foldclosures, except not for
 2897                  * multi-char folds, as there may be multiple combinations
 2898                  * there that could work, which needs to wait until runtime to
 2899                  * resolve (The comment about LIGATURE FFI above is such an
 2900                  * example */
 2901 
 2902                 SV** svpp;
 2903                 if ( !widecharmap )
 2904                     widecharmap = newHV();
 2905 
 2906                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
 2907 
 2908                 if ( !svpp )
 2909                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
 2910 
 2911                 if ( !SvTRUE( *svpp ) ) {
 2912                     sv_setiv( *svpp, ++trie->uniquecharcount );
 2913                     TRIE_STORE_REVCHAR(uvc);
 2914                 }
 2915             }
 2916         } /* end loop through characters in this branch of the trie */
 2917 
 2918         /* We take the min and max for this branch and combine to find the min
 2919          * and max for all branches processed so far */
 2920         if( cur == first ) {
 2921             trie->minlen = minchars;
 2922             trie->maxlen = maxchars;
 2923         } else if (minchars < trie->minlen) {
 2924             trie->minlen = minchars;
 2925         } else if (maxchars > trie->maxlen) {
 2926             trie->maxlen = maxchars;
 2927         }
 2928     } /* end first pass */
 2929     DEBUG_TRIE_COMPILE_r(
 2930         Perl_re_indentf( aTHX_
 2931                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
 2932                 depth+1,
 2933                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
 2934         (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
 2935         (int)trie->minlen, (int)trie->maxlen )
 2936     );
 2937 
 2938     /*
 2939         We now know what we are dealing with in terms of unique chars and
 2940         string sizes so we can calculate how much memory a naive
 2941         representation using a flat table  will take. If it's over a reasonable
 2942         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
 2943         conservative but potentially much slower representation using an array
 2944         of lists.
 2945 
 2946         At the end we convert both representations into the same compressed
 2947         form that will be used in regexec.c for matching with. The latter
 2948         is a form that cannot be used to construct with but has memory
 2949         properties similar to the list form and access properties similar
 2950         to the table form making it both suitable for fast searches and
 2951         small enough that its feasable to store for the duration of a program.
 2952 
 2953         See the comment in the code where the compressed table is produced
 2954         inplace from the flat tabe representation for an explanation of how
 2955         the compression works.
 2956 
 2957     */
 2958 
 2959 
 2960     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
 2961     prev_states[1] = 0;
 2962 
 2963     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
 2964                                                     > SvIV(re_trie_maxbuff) )
 2965     {
 2966         /*
 2967             Second Pass -- Array Of Lists Representation
 2968 
 2969             Each state will be represented by a list of charid:state records
 2970             (reg_trie_trans_le) the first such element holds the CUR and LEN
 2971             points of the allocated array. (See defines above).
 2972 
 2973             We build the initial structure using the lists, and then convert
 2974             it into the compressed table form which allows faster lookups
 2975             (but cant be modified once converted).
 2976         */
 2977 
 2978         STRLEN transcount = 1;
 2979 
 2980         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
 2981             depth+1));
 2982 
 2983     trie->states = (reg_trie_state *)
 2984         PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
 2985                   sizeof(reg_trie_state) );
 2986         TRIE_LIST_NEW(1);
 2987         next_alloc = 2;
 2988 
 2989         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
 2990 
 2991             regnode *noper   = NEXTOPER( cur );
 2992         U32 state        = 1;         /* required init */
 2993         U16 charid       = 0;         /* sanity init */
 2994             U32 wordlen      = 0;         /* required init */
 2995 
 2996             if (OP(noper) == NOTHING) {
 2997                 regnode *noper_next= regnext(noper);
 2998                 if (noper_next < tail)
 2999                     noper= noper_next;
 3000                 /* we will undo this assignment if noper does not
 3001                  * point at a trieable type in the else clause of
 3002                  * the following statement. */
 3003             }
 3004 
 3005             if (    noper < tail
 3006                 && (    OP(noper) == flags
 3007                     || (flags == EXACT && OP(noper) == EXACT_ONLY8)
 3008                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
 3009                                              || OP(noper) == EXACTFUP))))
 3010             {
 3011                 const U8 *uc= (U8*)STRING(noper);
 3012                 const U8 *e= uc + STR_LEN(noper);
 3013 
 3014                 for ( ; uc < e ; uc += len ) {
 3015 
 3016                     TRIE_READ_CHAR;
 3017 
 3018                     if ( uvc < 256 ) {
 3019                         charid = trie->charmap[ uvc ];
 3020             } else {
 3021                         SV** const svpp = hv_fetch( widecharmap,
 3022                                                     (char*)&uvc,
 3023                                                     sizeof( UV ),
 3024                                                     0);
 3025                         if ( !svpp ) {
 3026                             charid = 0;
 3027                         } else {
 3028                             charid=(U16)SvIV( *svpp );
 3029                         }
 3030             }
 3031                     /* charid is now 0 if we dont know the char read, or
 3032                      * nonzero if we do */
 3033                     if ( charid ) {
 3034 
 3035                         U16 check;
 3036                         U32 newstate = 0;
 3037 
 3038                         charid--;
 3039                         if ( !trie->states[ state ].trans.list ) {
 3040                             TRIE_LIST_NEW( state );
 3041             }
 3042                         for ( check = 1;
 3043                               check <= TRIE_LIST_USED( state );
 3044                               check++ )
 3045                         {
 3046                             if ( TRIE_LIST_ITEM( state, check ).forid
 3047                                                                     == charid )
 3048                             {
 3049                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
 3050                                 break;
 3051                             }
 3052                         }
 3053                         if ( ! newstate ) {
 3054                             newstate = next_alloc++;
 3055                 prev_states[newstate] = state;
 3056                             TRIE_LIST_PUSH( state, charid, newstate );
 3057                             transcount++;
 3058                         }
 3059                         state = newstate;
 3060                     } else {
 3061                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
 3062             }
 3063         }
 3064             } else {
 3065                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
 3066                  * on a trieable type. So we need to reset noper back to point at the first regop
 3067                  * in the branch before we call TRIE_HANDLE_WORD()
 3068                 */
 3069                 noper= NEXTOPER(cur);
 3070             }
 3071             TRIE_HANDLE_WORD(state);
 3072 
 3073         } /* end second pass */
 3074 
 3075         /* next alloc is the NEXT state to be allocated */
 3076         trie->statecount = next_alloc;
 3077         trie->states = (reg_trie_state *)
 3078         PerlMemShared_realloc( trie->states,
 3079                    next_alloc
 3080                    * sizeof(reg_trie_state) );
 3081 
 3082         /* and now dump it out before we compress it */
 3083         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
 3084                              revcharmap, next_alloc,
 3085                              depth+1)
 3086         );
 3087 
 3088         trie->trans = (reg_trie_trans *)
 3089         PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
 3090         {
 3091             U32 state;
 3092             U32 tp = 0;
 3093             U32 zp = 0;
 3094 
 3095 
 3096             for( state=1 ; state < next_alloc ; state ++ ) {
 3097                 U32 base=0;
 3098 
 3099                 /*
 3100                 DEBUG_TRIE_COMPILE_MORE_r(
 3101                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
 3102                 );
 3103                 */
 3104 
 3105                 if (trie->states[state].trans.list) {
 3106                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
 3107                     U16 maxid=minid;
 3108             U16 idx;
 3109 
 3110                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
 3111             const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
 3112             if ( forid < minid ) {
 3113                 minid=forid;
 3114             } else if ( forid > maxid ) {
 3115                 maxid=forid;
 3116             }
 3117                     }
 3118                     if ( transcount < tp + maxid - minid + 1) {
 3119                         transcount *= 2;
 3120             trie->trans = (reg_trie_trans *)
 3121                 PerlMemShared_realloc( trie->trans,
 3122                              transcount
 3123                              * sizeof(reg_trie_trans) );
 3124                         Zero( trie->trans + (transcount / 2),
 3125                               transcount / 2,
 3126                               reg_trie_trans );
 3127                     }
 3128                     base = trie->uniquecharcount + tp - minid;
 3129                     if ( maxid == minid ) {
 3130                         U32 set = 0;
 3131                         for ( ; zp < tp ; zp++ ) {
 3132                             if ( ! trie->trans[ zp ].next ) {
 3133                                 base = trie->uniquecharcount + zp - minid;
 3134                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
 3135                                                                    1).newstate;
 3136                                 trie->trans[ zp ].check = state;
 3137                                 set = 1;
 3138                                 break;
 3139                             }
 3140                         }
 3141                         if ( !set ) {
 3142                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
 3143                                                                    1).newstate;
 3144                             trie->trans[ tp ].check = state;
 3145                             tp++;
 3146                             zp = tp;
 3147                         }
 3148                     } else {
 3149                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
 3150                             const U32 tid = base
 3151                                            - trie->uniquecharcount
 3152                                            + TRIE_LIST_ITEM( state, idx ).forid;
 3153                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
 3154                                                                 idx ).newstate;
 3155                             trie->trans[ tid ].check = state;
 3156                         }
 3157                         tp += ( maxid - minid + 1 );
 3158                     }
 3159                     Safefree(trie->states[ state ].trans.list);
 3160                 }
 3161                 /*
 3162                 DEBUG_TRIE_COMPILE_MORE_r(
 3163                     Perl_re_printf( aTHX_  " base: %d\n",base);
 3164                 );
 3165                 */
 3166                 trie->states[ state ].trans.base=base;
 3167             }
 3168             trie->lasttrans = tp + 1;
 3169         }
 3170     } else {
 3171         /*
 3172            Second Pass -- Flat Table Representation.
 3173 
 3174            we dont use the 0 slot of either trans[] or states[] so we add 1 to
 3175            each.  We know that we will need Charcount+1 trans at most to store
 3176            the data (one row per char at worst case) So we preallocate both
 3177            structures assuming worst case.
 3178 
 3179            We then construct the trie using only the .next slots of the entry
 3180            structs.
 3181 
 3182            We use the .check field of the first entry of the node temporarily
 3183            to make compression both faster and easier by keeping track of how
 3184            many non zero fields are in the node.
 3185 
 3186            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
 3187            transition.
 3188 
 3189            There are two terms at use here: state as a TRIE_NODEIDX() which is
 3190            a number representing the first entry of the node, and state as a
 3191            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
 3192            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
 3193            if there are 2 entrys per node. eg:
 3194 
 3195              A B       A B
 3196           1. 2 4    1. 3 7
 3197           2. 0 3    3. 0 5
 3198           3. 0 0    5. 0 0
 3199           4. 0 0    7. 0 0
 3200 
 3201            The table is internally in the right hand, idx form. However as we
 3202            also have to deal with the states array which is indexed by nodenum
 3203            we have to use TRIE_NODENUM() to convert.
 3204 
 3205         */
 3206         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
 3207             depth+1));
 3208 
 3209     trie->trans = (reg_trie_trans *)
 3210         PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
 3211                   * trie->uniquecharcount + 1,
 3212                   sizeof(reg_trie_trans) );
 3213         trie->states = (reg_trie_state *)
 3214         PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
 3215                   sizeof(reg_trie_state) );
 3216         next_alloc = trie->uniquecharcount + 1;
 3217 
 3218 
 3219         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
 3220 
 3221             regnode *noper   = NEXTOPER( cur );
 3222 
 3223             U32 state        = 1;         /* required init */
 3224 
 3225             U16 charid       = 0;         /* sanity init */
 3226             U32 accept_state = 0;         /* sanity init */
 3227 
 3228             U32 wordlen      = 0;         /* required init */
 3229 
 3230             if (OP(noper) == NOTHING) {
 3231                 regnode *noper_next= regnext(noper);
 3232                 if (noper_next < tail)
 3233                     noper= noper_next;
 3234                 /* we will undo this assignment if noper does not
 3235                  * point at a trieable type in the else clause of
 3236                  * the following statement. */
 3237             }
 3238 
 3239             if (    noper < tail
 3240                 && (    OP(noper) == flags
 3241                     || (flags == EXACT && OP(noper) == EXACT_ONLY8)
 3242                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_ONLY8
 3243                                              || OP(noper) == EXACTFUP))))
 3244             {
 3245                 const U8 *uc= (U8*)STRING(noper);
 3246                 const U8 *e= uc + STR_LEN(noper);
 3247 
 3248                 for ( ; uc < e ; uc += len ) {
 3249 
 3250                     TRIE_READ_CHAR;
 3251 
 3252                     if ( uvc < 256 ) {
 3253                         charid = trie->charmap[ uvc ];
 3254                     } else {
 3255                         SV* const * const svpp = hv_fetch( widecharmap,
 3256                                                            (char*)&uvc,
 3257                                                            sizeof( UV ),
 3258                                                            0);
 3259                         charid = svpp ? (U16)SvIV(*svpp) : 0;
 3260                     }
 3261                     if ( charid ) {
 3262                         charid--;
 3263                         if ( !trie->trans[ state + charid ].next ) {
 3264                             trie->trans[ state + charid ].next = next_alloc;
 3265                             trie->trans[ state ].check++;
 3266                 prev_states[TRIE_NODENUM(next_alloc)]
 3267                     = TRIE_NODENUM(state);
 3268                             next_alloc += trie->uniquecharcount;
 3269                         }
 3270                         state = trie->trans[ state + charid ].next;
 3271                     } else {
 3272                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
 3273                     }
 3274                     /* charid is now 0 if we dont know the char read, or
 3275                      * nonzero if we do */
 3276                 }
 3277             } else {
 3278                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
 3279                  * on a trieable type. So we need to reset noper back to point at the first regop
 3280                  * in the branch before we call TRIE_HANDLE_WORD().
 3281                 */
 3282                 noper= NEXTOPER(cur);
 3283             }
 3284             accept_state = TRIE_NODENUM( state );
 3285             TRIE_HANDLE_WORD(accept_state);
 3286 
 3287         } /* end second pass */
 3288 
 3289         /* and now dump it out before we compress it */
 3290         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
 3291                               revcharmap,
 3292                               next_alloc, depth+1));
 3293 
 3294         {
 3295         /*
 3296            * Inplace compress the table.*
 3297 
 3298            For sparse data sets the table constructed by the trie algorithm will
 3299            be mostly 0/FAIL transitions or to put it another way mostly empty.
 3300            (Note that leaf nodes will not contain any transitions.)
 3301 
 3302            This algorithm compresses the tables by eliminating most such
 3303            transitions, at the cost of a modest bit of extra work during lookup:
 3304 
 3305            - Each states[] entry contains a .base field which indicates the
 3306            index in the state[] array wheres its transition data is stored.
 3307 
 3308            - If .base is 0 there are no valid transitions from that node.
 3309 
 3310            - If .base is nonzero then charid is added to it to find an entry in
 3311            the trans array.
 3312 
 3313            -If trans[states[state].base+charid].check!=state then the
 3314            transition is taken to be a 0/Fail transition. Thus if there are fail
 3315            transitions at the front of the node then the .base offset will point
 3316            somewhere inside the previous nodes data (or maybe even into a node
 3317            even earlier), but the .check field determines if the transition is
 3318            valid.
 3319 
 3320            XXX - wrong maybe?
 3321            The following process inplace converts the table to the compressed
 3322            table: We first do not compress the root node 1,and mark all its
 3323            .check pointers as 1 and set its .base pointer as 1 as well. This
 3324            allows us to do a DFA construction from the compressed table later,
 3325            and ensures that any .base pointers we calculate later are greater
 3326            than 0.
 3327 
 3328            - We set 'pos' to indicate the first entry of the second node.
 3329 
 3330            - We then iterate over the columns of the node, finding the first and
 3331            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
 3332            and set the .check pointers accordingly, and advance pos
 3333            appropriately and repreat for the next node. Note that when we copy
 3334            the next pointers we have to convert them from the original
 3335            NODEIDX form to NODENUM form as the former is not valid post
 3336            compression.
 3337 
 3338            - If a node has no transitions used we mark its base as 0 and do not
 3339            advance the pos pointer.
 3340 
 3341            - If a node only has one transition we use a second pointer into the
 3342            structure to fill in allocated fail transitions from other states.
 3343            This pointer is independent of the main pointer and scans forward
 3344            looking for null transitions that are allocated to a state. When it
 3345            finds one it writes the single transition into the "hole".  If the
 3346            pointer doesnt find one the single transition is appended as normal.
 3347 
 3348            - Once compressed we can Renew/realloc the structures to release the
 3349            excess space.
 3350 
 3351            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
 3352            specifically Fig 3.47 and the associated pseudocode.
 3353 
 3354            demq
 3355         */
 3356         const U32 laststate = TRIE_NODENUM( next_alloc );
 3357     U32 state, charid;
 3358         U32 pos = 0, zp=0;
 3359         trie->statecount = laststate;
 3360 
 3361         for ( state = 1 ; state < laststate ; state++ ) {
 3362             U8 flag = 0;
 3363         const U32 stateidx = TRIE_NODEIDX( state );
 3364         const U32 o_used = trie->trans[ stateidx ].check;
 3365         U32 used = trie->trans[ stateidx ].check;
 3366             trie->trans[ stateidx ].check = 0;
 3367 
 3368             for ( charid = 0;
 3369                   used && charid < trie->uniquecharcount;
 3370                   charid++ )
 3371             {
 3372                 if ( flag || trie->trans[ stateidx + charid ].next ) {
 3373                     if ( trie->trans[ stateidx + charid ].next ) {
 3374                         if (o_used == 1) {
 3375                             for ( ; zp < pos ; zp++ ) {
 3376                                 if ( ! trie->trans[ zp ].next ) {
 3377                                     break;
 3378                                 }
 3379                             }
 3380                             trie->states[ state ].trans.base
 3381                                                     = zp
 3382                                                       + trie->uniquecharcount
 3383                                                       - charid ;
 3384                             trie->trans[ zp ].next
 3385                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
 3386                                                              + charid ].next );
 3387                             trie->trans[ zp ].check = state;
 3388                             if ( ++zp > pos ) pos = zp;
 3389                             break;
 3390                         }
 3391                         used--;
 3392                     }
 3393                     if ( !flag ) {
 3394                         flag = 1;
 3395                         trie->states[ state ].trans.base
 3396                                        = pos + trie->uniquecharcount - charid ;
 3397                     }
 3398                     trie->trans[ pos ].next
 3399                         = SAFE_TRIE_NODENUM(
 3400                                        trie->trans[ stateidx + charid ].next );
 3401                     trie->trans[ pos ].check = state;
 3402                     pos++;
 3403                 }
 3404             }
 3405         }
 3406         trie->lasttrans = pos + 1;
 3407         trie->states = (reg_trie_state *)
 3408         PerlMemShared_realloc( trie->states, laststate
 3409                    * sizeof(reg_trie_state) );
 3410         DEBUG_TRIE_COMPILE_MORE_r(
 3411             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
 3412                 depth+1,
 3413                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
 3414                        + 1 ),
 3415                 (IV)next_alloc,
 3416                 (IV)pos,
 3417                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
 3418             );
 3419 
 3420         } /* end table compress */
 3421     }
 3422     DEBUG_TRIE_COMPILE_MORE_r(
 3423             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
 3424                 depth+1,
 3425                 (UV)trie->statecount,
 3426                 (UV)trie->lasttrans)
 3427     );
 3428     /* resize the trans array to remove unused space */
 3429     trie->trans = (reg_trie_trans *)
 3430     PerlMemShared_realloc( trie->trans, trie->lasttrans
 3431                    * sizeof(reg_trie_trans) );
 3432 
 3433     {   /* Modify the program and insert the new TRIE node */
 3434         U8 nodetype =(U8)(flags & 0xFF);
 3435         char *str=NULL;
 3436 
 3437 #ifdef DEBUGGING
 3438         regnode *optimize = NULL;
 3439 #ifdef RE_TRACK_PATTERN_OFFSETS
 3440 
 3441         U32 mjd_offset = 0;
 3442         U32 mjd_nodelen = 0;
 3443 #endif /* RE_TRACK_PATTERN_OFFSETS */
 3444 #endif /* DEBUGGING */
 3445         /*
 3446            This means we convert either the first branch or the first Exact,
 3447            depending on whether the thing following (in 'last') is a branch
 3448            or not and whther first is the startbranch (ie is it a sub part of
 3449            the alternation or is it the whole thing.)
 3450            Assuming its a sub part we convert the EXACT otherwise we convert
 3451            the whole branch sequence, including the first.
 3452          */
 3453         /* Find the node we are going to overwrite */
 3454         if ( first != startbranch || OP( last ) == BRANCH ) {
 3455             /* branch sub-chain */
 3456             NEXT_OFF( first ) = (U16)(last - first);
 3457 #ifdef RE_TRACK_PATTERN_OFFSETS
 3458             DEBUG_r({
 3459                 mjd_offset= Node_Offset((convert));
 3460                 mjd_nodelen= Node_Length((convert));
 3461             });
 3462 #endif
 3463             /* whole branch chain */
 3464         }
 3465 #ifdef RE_TRACK_PATTERN_OFFSETS
 3466         else {
 3467             DEBUG_r({
 3468                 const  regnode *nop = NEXTOPER( convert );
 3469                 mjd_offset= Node_Offset((nop));
 3470                 mjd_nodelen= Node_Length((nop));
 3471             });
 3472         }
 3473         DEBUG_OPTIMISE_r(
 3474             Perl_re_indentf( aTHX_  "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
 3475                 depth+1,
 3476                 (UV)mjd_offset, (UV)mjd_nodelen)
 3477         );
 3478 #endif
 3479         /* But first we check to see if there is a common prefix we can
 3480            split out as an EXACT and put in front of the TRIE node.  */
 3481         trie->startstate= 1;
 3482         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
 3483             /* we want to find the first state that has more than
 3484              * one transition, if that state is not the first state
 3485              * then we have a common prefix which we can remove.
 3486              */
 3487             U32 state;
 3488             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
 3489                 U32 ofs = 0;
 3490                 I32 first_ofs = -1; /* keeps track of the ofs of the first
 3491                                        transition, -1 means none */
 3492                 U32 count = 0;
 3493                 const U32 base = trie->states[ state ].trans.base;
 3494 
 3495                 /* does this state terminate an alternation? */
 3496                 if ( trie->states[state].wordnum )
 3497                         count = 1;
 3498 
 3499                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
 3500                     if ( ( base + ofs >= trie->uniquecharcount ) &&
 3501                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
 3502                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
 3503                     {
 3504                         if ( ++count > 1 ) {
 3505                             /* we have more than one transition */
 3506                             SV **tmp;
 3507                             U8 *ch;
 3508                             /* if this is the first state there is no common prefix
 3509                              * to extract, so we can exit */
 3510                             if ( state == 1 ) break;
 3511                             tmp = av_fetch( revcharmap, ofs, 0);
 3512                             ch = (U8*)SvPV_nolen_const( *tmp );
 3513 
 3514                             /* if we are on count 2 then we need to initialize the
 3515                              * bitmap, and store the previous char if there was one
 3516                              * in it*/
 3517                             if ( count == 2 ) {
 3518                                 /* clear the bitmap */
 3519                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
 3520                                 DEBUG_OPTIMISE_r(
 3521                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
 3522                                         depth+1,
 3523                                         (UV)state));
 3524                                 if (first_ofs >= 0) {
 3525                                     SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
 3526                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
 3527 
 3528                                     TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
 3529                                     DEBUG_OPTIMISE_r(
 3530                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
 3531                                     );
 3532                 }
 3533                 }
 3534                             /* store the current firstchar in the bitmap */
 3535                             TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
 3536                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
 3537             }
 3538                         first_ofs = ofs;
 3539             }
 3540                 }
 3541                 if ( count == 1 ) {
 3542                     /* This state has only one transition, its transition is part
 3543                      * of a common prefix - we need to concatenate the char it
 3544                      * represents to what we have so far. */
 3545                     SV **tmp = av_fetch( revcharmap, first_ofs, 0);
 3546                     STRLEN len;
 3547                     char *ch = SvPV( *tmp, len );
 3548                     DEBUG_OPTIMISE_r({
 3549                         SV *sv=sv_newmortal();
 3550                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
 3551                             depth+1,
 3552                             (UV)state, (UV)first_ofs,
 3553                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
 3554                             PL_colors[0], PL_colors[1],
 3555                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
 3556                             PERL_PV_ESCAPE_FIRSTCHAR
 3557                             )
 3558                         );
 3559                     });
 3560                     if ( state==1 ) {
 3561                         OP( convert ) = nodetype;
 3562                         str=STRING(convert);
 3563                         STR_LEN(convert)=0;
 3564                     }
 3565                     STR_LEN(convert) += len;
 3566                     while (len--)
 3567                         *str++ = *ch++;
 3568         } else {
 3569 #ifdef DEBUGGING
 3570             if (state>1)
 3571                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
 3572 #endif
 3573             break;
 3574         }
 3575         }
 3576         trie->prefixlen = (state-1);
 3577             if (str) {
 3578                 regnode *n = convert+NODE_SZ_STR(convert);
 3579                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
 3580                 trie->startstate = state;
 3581                 trie->minlen -= (state - 1);
 3582                 trie->maxlen -= (state - 1);
 3583 #ifdef DEBUGGING
 3584                /* At least the UNICOS C compiler choked on this
 3585                 * being argument to DEBUG_r(), so let's just have
 3586                 * it right here. */
 3587                if (
 3588 #ifdef PERL_EXT_RE_BUILD
 3589                    1
 3590 #else
 3591                    DEBUG_r_TEST
 3592 #endif
 3593                    ) {
 3594                    regnode *fix = convert;
 3595                    U32 word = trie->wordcount;
 3596 #ifdef RE_TRACK_PATTERN_OFFSETS
 3597                    mjd_nodelen++;
 3598 #endif
 3599                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
 3600                    while( ++fix < n ) {
 3601                        Set_Node_Offset_Length(fix, 0, 0);
 3602                    }
 3603                    while (word--) {
 3604                        SV ** const tmp = av_fetch( trie_words, word, 0 );
 3605                        if (tmp) {
 3606                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
 3607                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
 3608                            else
 3609                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
 3610                        }
 3611                    }
 3612                }
 3613 #endif
 3614                 if (trie->maxlen) {
 3615                     convert = n;
 3616         } else {
 3617                     NEXT_OFF(convert) = (U16)(tail - convert);
 3618                     DEBUG_r(optimize= n);
 3619                 }
 3620             }
 3621         }
 3622         if (!jumper)
 3623             jumper = last;
 3624         if ( trie->maxlen ) {
 3625         NEXT_OFF( convert ) = (U16)(tail - convert);
 3626         ARG_SET( convert, data_slot );
 3627         /* Store the offset to the first unabsorbed branch in
 3628            jump[0], which is otherwise unused by the jump logic.
 3629            We use this when dumping a trie and during optimisation. */
 3630         if (trie->jump)
 3631             trie->jump[0] = (U16)(nextbranch - convert);
 3632 
 3633             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
 3634          *   and there is a bitmap
 3635          *   and the first "jump target" node we found leaves enough room
 3636          * then convert the TRIE node into a TRIEC node, with the bitmap
 3637          * embedded inline in the opcode - this is hypothetically faster.
 3638          */
 3639             if ( !trie->states[trie->startstate].wordnum
 3640          && trie->bitmap
 3641          && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
 3642             {
 3643                 OP( convert ) = TRIEC;
 3644                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
 3645                 PerlMemShared_free(trie->bitmap);
 3646                 trie->bitmap= NULL;
 3647             } else
 3648                 OP( convert ) = TRIE;
 3649 
 3650             /* store the type in the flags */
 3651             convert->flags = nodetype;
 3652             DEBUG_r({
 3653             optimize = convert
 3654                       + NODE_STEP_REGNODE
 3655                       + regarglen[ OP( convert ) ];
 3656             });
 3657             /* XXX We really should free up the resource in trie now,
 3658                    as we won't use them - (which resources?) dmq */
 3659         }
 3660         /* needed for dumping*/
 3661         DEBUG_r(if (optimize) {
 3662             regnode *opt = convert;
 3663 
 3664             while ( ++opt < optimize) {
 3665                 Set_Node_Offset_Length(opt, 0, 0);
 3666             }
 3667             /*
 3668                 Try to clean up some of the debris left after the
 3669                 optimisation.
 3670              */
 3671             while( optimize < jumper ) {
 3672                 Track_Code( mjd_nodelen += Node_Length((optimize)); );
 3673                 OP( optimize ) = OPTIMIZED;
 3674                 Set_Node_Offset_Length(optimize, 0, 0);
 3675                 optimize++;
 3676             }
 3677             Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
 3678         });
 3679     } /* end node insert */
 3680 
 3681     /*  Finish populating the prev field of the wordinfo array.  Walk back
 3682      *  from each accept state until we find another accept state, and if
 3683      *  so, point the first word's .prev field at the second word. If the
 3684      *  second already has a .prev field set, stop now. This will be the
 3685      *  case either if we've already processed that word's accept state,
 3686      *  or that state had multiple words, and the overspill words were
 3687      *  already linked up earlier.
 3688      */
 3689     {
 3690     U16 word;
 3691     U32 state;
 3692     U16 prev;
 3693 
 3694     for (word=1; word <= trie->wordcount; word++) {
 3695         prev = 0;
 3696         if (trie->wordinfo[word].prev)
 3697         continue;
 3698         state = trie->wordinfo[word].accept;
 3699         while (state) {
 3700         state = prev_states[state];
 3701         if (!state)
 3702             break;
 3703         prev = trie->states[state].wordnum;
 3704         if (prev)
 3705             break;
 3706         }
 3707         trie->wordinfo[word].prev = prev;
 3708     }
 3709     Safefree(prev_states);
 3710     }
 3711 
 3712 
 3713     /* and now dump out the compressed format */
 3714     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
 3715 
 3716     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
 3717 #ifdef DEBUGGING
 3718     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
 3719     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
 3720 #else
 3721     SvREFCNT_dec_NN(revcharmap);
 3722 #endif
 3723     return trie->jump
 3724            ? MADE_JUMP_TRIE
 3725            : trie->startstate>1
 3726              ? MADE_EXACT_TRIE
 3727              : MADE_TRIE;
 3728 }
 3729 
 3730 STATIC regnode *
 3731 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
 3732 {
 3733 /* The Trie is constructed and compressed now so we can build a fail array if
 3734  * it's needed
 3735 
 3736    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
 3737    3.32 in the
 3738    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
 3739    Ullman 1985/88
 3740    ISBN 0-201-10088-6
 3741 
 3742    We find the fail state for each state in the trie, this state is the longest
 3743    proper suffix of the current state's 'word' that is also a proper prefix of
 3744    another word in our trie. State 1 represents the word '' and is thus the
 3745    default fail state. This allows the DFA not to have to restart after its
 3746    tried and failed a word at a given point, it simply continues as though it
 3747    had been matching the other word in the first place.
 3748    Consider
 3749       'abcdgu'=~/abcdefg|cdgu/
 3750    When we get to 'd' we are still matching the first word, we would encounter
 3751    'g' which would fail, which would bring us to the state representing 'd' in
 3752    the second word where we would try 'g' and succeed, proceeding to match
 3753    'cdgu'.
 3754  */
 3755  /* add a fail transition */
 3756     const U32 trie_offset = ARG(source);
 3757     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
 3758     U32 *q;
 3759     const U32 ucharcount = trie->uniquecharcount;
 3760     const U32 numstates = trie->statecount;
 3761     const U32 ubound = trie->lasttrans + ucharcount;
 3762     U32 q_read = 0;
 3763     U32 q_write = 0;
 3764     U32 charid;
 3765     U32 base = trie->states[ 1 ].trans.base;
 3766     U32 *fail;
 3767     reg_ac_data *aho;
 3768     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
 3769     regnode *stclass;
 3770     GET_RE_DEBUG_FLAGS_DECL;
 3771 
 3772     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
 3773     PERL_UNUSED_CONTEXT;
 3774 #ifndef DEBUGGING
 3775     PERL_UNUSED_ARG(depth);
 3776 #endif
 3777 
 3778     if ( OP(source) == TRIE ) {
 3779         struct regnode_1 *op = (struct regnode_1 *)
 3780             PerlMemShared_calloc(1, sizeof(struct regnode_1));
 3781         StructCopy(source, op, struct regnode_1);
 3782         stclass = (regnode *)op;
 3783     } else {
 3784         struct regnode_charclass *op = (struct regnode_charclass *)
 3785             PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
 3786         StructCopy(source, op, struct regnode_charclass);
 3787         stclass = (regnode *)op;
 3788     }
 3789     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
 3790 
 3791     ARG_SET( stclass, data_slot );
 3792     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
 3793     RExC_rxi->data->data[ data_slot ] = (void*)aho;
 3794     aho->trie=trie_offset;
 3795     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
 3796     Copy( trie->states, aho->states, numstates, reg_trie_state );
 3797     Newx( q, numstates, U32);
 3798     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
 3799     aho->refcount = 1;
 3800     fail = aho->fail;
 3801     /* initialize fail[0..1] to be 1 so that we always have
 3802        a valid final fail state */
 3803     fail[ 0 ] = fail[ 1 ] = 1;
 3804 
 3805     for ( charid = 0; charid < ucharcount ; charid++ ) {
 3806     const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
 3807     if ( newstate ) {
 3808             q[ q_write ] = newstate;
 3809             /* set to point at the root */
 3810             fail[ q[ q_write++ ] ]=1;
 3811         }
 3812     }
 3813     while ( q_read < q_write) {
 3814     const U32 cur = q[ q_read++ % numstates ];
 3815         base = trie->states[ cur ].trans.base;
 3816 
 3817         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
 3818         const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
 3819         if (ch_state) {
 3820                 U32 fail_state = cur;
 3821                 U32 fail_base;
 3822                 do {
 3823                     fail_state = fail[ fail_state ];
 3824                     fail_base = aho->states[ fail_state ].trans.base;
 3825                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
 3826 
 3827                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
 3828                 fail[ ch_state ] = fail_state;
 3829                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
 3830                 {
 3831                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
 3832                 }
 3833                 q[ q_write++ % numstates] = ch_state;
 3834             }
 3835         }
 3836     }
 3837     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
 3838        when we fail in state 1, this allows us to use the
 3839        charclass scan to find a valid start char. This is based on the principle
 3840        that theres a good chance the string being searched contains lots of stuff
 3841        that cant be a start char.
 3842      */
 3843     fail[ 0 ] = fail[ 1 ] = 0;
 3844     DEBUG_TRIE_COMPILE_r({
 3845         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
 3846                       depth, (UV)numstates
 3847         );
 3848         for( q_read=1; q_read<numstates; q_read++ ) {
 3849             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
 3850         }
 3851         Perl_re_printf( aTHX_  "\n");
 3852     });
 3853     Safefree(q);
 3854     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
 3855     return stclass;
 3856 }
 3857 
 3858 
 3859 /* The below joins as many adjacent EXACTish nodes as possible into a single
 3860  * one.  The regop may be changed if the node(s) contain certain sequences that
 3861  * require special handling.  The joining is only done if:
 3862  * 1) there is room in the current conglomerated node to entirely contain the
 3863  *    next one.
 3864  * 2) they are compatible node types
 3865  *
 3866  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
 3867  * these get optimized out
 3868  *
 3869  * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
 3870  * as possible, even if that means splitting an existing node so that its first
 3871  * part is moved to the preceeding node.  This would maximise the efficiency of
 3872  * memEQ during matching.
 3873  *
 3874  * If a node is to match under /i (folded), the number of characters it matches
 3875  * can be different than its character length if it contains a multi-character
 3876  * fold.  *min_subtract is set to the total delta number of characters of the
 3877  * input nodes.
 3878  *
 3879  * And *unfolded_multi_char is set to indicate whether or not the node contains
 3880  * an unfolded multi-char fold.  This happens when it won't be known until
 3881  * runtime whether the fold is valid or not; namely
 3882  *  1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
 3883  *      target string being matched against turns out to be UTF-8 is that fold
 3884  *      valid; or
 3885  *  2) for EXACTFL nodes whose folding rules depend on the locale in force at
 3886  *      runtime.
 3887  * (Multi-char folds whose components are all above the Latin1 range are not
 3888  * run-time locale dependent, and have already been folded by the time this
 3889  * function is called.)
 3890  *
 3891  * This is as good a place as any to discuss the design of handling these
 3892  * multi-character fold sequences.  It's been wrong in Perl for a very long
 3893  * time.  There are three code points in Unicode whose multi-character folds
 3894  * were long ago discovered to mess things up.  The previous designs for
 3895  * dealing with these involved assigning a special node for them.  This
 3896  * approach doesn't always work, as evidenced by this example:
 3897  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
 3898  * Both sides fold to "sss", but if the pattern is parsed to create a node that
 3899  * would match just the \xDF, it won't be able to handle the case where a
 3900  * successful match would have to cross the node's boundary.  The new approach
 3901  * that hopefully generally solves the problem generates an EXACTFUP node
 3902  * that is "sss" in this case.
 3903  *
 3904  * It turns out that there are problems with all multi-character folds, and not
 3905  * just these three.  Now the code is general, for all such cases.  The
 3906  * approach taken is:
 3907  * 1)   This routine examines each EXACTFish node that could contain multi-
 3908  *      character folded sequences.  Since a single character can fold into
 3909  *      such a sequence, the minimum match length for this node is less than
 3910  *      the number of characters in the node.  This routine returns in
 3911  *      *min_subtract how many characters to subtract from the the actual
 3912  *      length of the string to get a real minimum match length; it is 0 if
 3913  *      there are no multi-char foldeds.  This delta is used by the caller to
 3914  *      adjust the min length of the match, and the delta between min and max,
 3915  *      so that the optimizer doesn't reject these possibilities based on size
 3916  *      constraints.
 3917  *
 3918  * 2)   For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
 3919  *      under /u, we fold it to 'ss' in regatom(), and in this routine, after
 3920  *      joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
 3921  *      EXACTFU nodes.  The node type of such nodes is then changed to
 3922  *      EXACTFUP, indicating it is problematic, and needs careful handling.
 3923  *      (The procedures in step 1) above are sufficient to handle this case in
 3924  *      UTF-8 encoded nodes.)  The reason this is problematic is that this is
 3925  *      the only case where there is a possible fold length change in non-UTF-8
 3926  *      patterns.  By reserving a special node type for problematic cases, the
 3927  *      far more common regular EXACTFU nodes can be processed faster.
 3928  *      regexec.c takes advantage of this.
 3929  *
 3930  *      EXACTFUP has been created as a grab-bag for (hopefully uncommon)
 3931  *      problematic cases.   These all only occur when the pattern is not
 3932  *      UTF-8.  In addition to the 'ss' sequence where there is a possible fold
 3933  *      length change, it handles the situation where the string cannot be
 3934  *      entirely folded.  The strings in an EXACTFish node are folded as much
 3935  *      as possible during compilation in regcomp.c.  This saves effort in
 3936  *      regex matching.  By using an EXACTFUP node when it is not possible to
 3937  *      fully fold at compile time, regexec.c can know that everything in an
 3938  *      EXACTFU node is folded, so folding can be skipped at runtime.  The only
 3939  *      case where folding in EXACTFU nodes can't be done at compile time is
 3940  *      the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8.  This
 3941  *      is because its fold requires UTF-8 to represent.  Thus EXACTFUP nodes
 3942  *      handle two very different cases.  Alternatively, there could have been
 3943  *      a node type where there are length changes, one for unfolded, and one
 3944  *      for both.  If yet another special case needed to be created, the number
 3945  *      of required node types would have to go to 7.  khw figures that even
 3946  *      though there are plenty of node types to spare, that the maintenance
 3947  *      cost wasn't worth the small speedup of doing it that way, especially
 3948  *      since he thinks the MICRO SIGN is rarely encountered in practice.
 3949  *
 3950  *      There are other cases where folding isn't done at compile time, but
 3951  *      none of them are under /u, and hence not for EXACTFU nodes.  The folds
 3952  *      in EXACTFL nodes aren't known until runtime, and vary as the locale
 3953  *      changes.  Some folds in EXACTF depend on if the runtime target string
 3954  *      is UTF-8 or not.  (regatom() will create an EXACTFU node even under /di
 3955  *      when no fold in it depends on the UTF-8ness of the target string.)
 3956  *
 3957  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
 3958  *      validity of the fold won't be known until runtime, and so must remain
 3959  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFAA
 3960  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
 3961  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
 3962  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
 3963  *      The reason this is a problem is that the optimizer part of regexec.c
 3964  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
 3965  *      that a character in the pattern corresponds to at most a single
 3966  *      character in the target string.  (And I do mean character, and not byte
 3967  *      here, unlike other parts of the documentation that have never been
 3968  *      updated to account for multibyte Unicode.)  Sharp s in EXACTF and
 3969  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
 3970  *      nodes it can match "\x{17F}\x{17F}".  These, along with other ones in
 3971  *      EXACTFL nodes, violate the assumption, and they are the only instances
 3972  *      where it is violated.  I'm reluctant to try to change the assumption,
 3973  *      as the code involved is impenetrable to me (khw), so instead the code
 3974  *      here punts.  This routine examines EXACTFL nodes, and (when the pattern
 3975  *      isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
 3976  *      boolean indicating whether or not the node contains such a fold.  When
 3977  *      it is true, the caller sets a flag that later causes the optimizer in
 3978  *      this file to not set values for the floating and fixed string lengths,
 3979  *      and thus avoids the optimizer code in regexec.c that makes the invalid
 3980  *      assumption.  Thus, there is no optimization based on string lengths for
 3981  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
 3982  *      EXACTF and EXACTFAA nodes that contain the sharp s.  (The reason the
 3983  *      assumption is wrong only in these cases is that all other non-UTF-8
 3984  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
 3985  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
 3986  *      EXACTF nodes because we don't know at compile time if it actually
 3987  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
 3988  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
 3989  *      always matches; and EXACTFAA where it never does.  In an EXACTFAA node
 3990  *      in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
 3991  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
 3992  *      string would require the pattern to be forced into UTF-8, the overhead
 3993  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
 3994  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
 3995  *      locale.)
 3996  *
 3997  *      Similarly, the code that generates tries doesn't currently handle
 3998  *      not-already-folded multi-char folds, and it looks like a pain to change
 3999  *      that.  Therefore, trie generation of EXACTFAA nodes with the sharp s
 4000  *      doesn't work.  Instead, such an EXACTFAA is turned into a new regnode,
 4001  *      EXACTFAA_NO_TRIE, which the trie code knows not to handle.  Most people
 4002  *      using /iaa matching will be doing so almost entirely with ASCII
 4003  *      strings, so this should rarely be encountered in practice */
 4004 
 4005 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
 4006     if (PL_regkind[OP(scan)] == EXACT) \
 4007         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
 4008 
 4009 STATIC U32
 4010 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
 4011                    UV *min_subtract, bool *unfolded_multi_char,
 4012                    U32 flags, regnode *val, U32 depth)
 4013 {
 4014     /* Merge several consecutive EXACTish nodes into one. */
 4015 
 4016     regnode *n = regnext(scan);
 4017     U32 stringok = 1;
 4018     regnode *next = scan + NODE_SZ_STR(scan);
 4019     U32 merged = 0;
 4020     U32 stopnow = 0;
 4021 #ifdef DEBUGGING
 4022     regnode *stop = scan;
 4023     GET_RE_DEBUG_FLAGS_DECL;
 4024 #else
 4025     PERL_UNUSED_ARG(depth);
 4026 #endif
 4027 
 4028     PERL_ARGS_ASSERT_JOIN_EXACT;
 4029 #ifndef EXPERIMENTAL_INPLACESCAN
 4030     PERL_UNUSED_ARG(flags);
 4031     PERL_UNUSED_ARG(val);
 4032 #endif
 4033     DEBUG_PEEP("join", scan, depth, 0);
 4034 
 4035     assert(PL_regkind[OP(scan)] == EXACT);
 4036 
 4037     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
 4038      * EXACT ones that are mergeable to the current one. */
 4039     while (    n
 4040            && (    PL_regkind[OP(n)] == NOTHING
 4041                || (stringok && PL_regkind[OP(n)] == EXACT))
 4042            && NEXT_OFF(n)
 4043            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
 4044     {
 4045 
 4046         if (OP(n) == TAIL || n > next)
 4047             stringok = 0;
 4048         if (PL_regkind[OP(n)] == NOTHING) {
 4049             DEBUG_PEEP("skip:", n, depth, 0);
 4050             NEXT_OFF(scan) += NEXT_OFF(n);
 4051             next = n + NODE_STEP_REGNODE;
 4052 #ifdef DEBUGGING
 4053             if (stringok)
 4054                 stop = n;
 4055 #endif
 4056             n = regnext(n);
 4057         }
 4058         else if (stringok) {
 4059             const unsigned int oldl = STR_LEN(scan);
 4060             regnode * const nnext = regnext(n);
 4061 
 4062             /* XXX I (khw) kind of doubt that this works on platforms (should
 4063              * Perl ever run on one) where U8_MAX is above 255 because of lots
 4064              * of other assumptions */
 4065             /* Don't join if the sum can't fit into a single node */
 4066             if (oldl + STR_LEN(n) > U8_MAX)
 4067                 break;
 4068 
 4069             /* Joining something that requires UTF-8 with something that
 4070              * doesn't, means the result requires UTF-8. */
 4071             if (OP(scan) == EXACT && (OP(n) == EXACT_ONLY8)) {
 4072                 OP(scan) = EXACT_ONLY8;
 4073             }
 4074             else if (OP(scan) == EXACT_ONLY8 && (OP(n) == EXACT)) {
 4075                 ;   /* join is compatible, no need to change OP */
 4076             }
 4077             else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_ONLY8)) {
 4078                 OP(scan) = EXACTFU_ONLY8;
 4079             }
 4080             else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) {
 4081                 ;   /* join is compatible, no need to change OP */
 4082             }
 4083             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
 4084                 ;   /* join is compatible, no need to change OP */
 4085             }
 4086             else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
 4087 
 4088                  /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
 4089                   * which can join with EXACTFU ones.  We check for this case
 4090                   * here.  These need to be resolved to either EXACTFU or
 4091                   * EXACTF at joining time.  They have nothing in them that
 4092                   * would forbid them from being the more desirable EXACTFU
 4093                   * nodes except that they begin and/or end with a single [Ss].
 4094                   * The reason this is problematic is because they could be
 4095                   * joined in this loop with an adjacent node that ends and/or
 4096                   * begins with [Ss] which would then form the sequence 'ss',
 4097                   * which matches differently under /di than /ui, in which case
 4098                   * EXACTFU can't be used.  If the 'ss' sequence doesn't get
 4099                   * formed, the nodes get absorbed into any adjacent EXACTFU
 4100                   * node.  And if the only adjacent node is EXACTF, they get
 4101                   * absorbed into that, under the theory that a longer node is
 4102                   * better than two shorter ones, even if one is EXACTFU.  Note
 4103                   * that EXACTFU_ONLY8 is generated only for UTF-8 patterns,
 4104                   * and the EXACTFU_S_EDGE ones only for non-UTF-8.  */
 4105 
 4106                 if (STRING(n)[STR_LEN(n)-1] == 's') {
 4107 
 4108                     /* Here the joined node would end with 's'.  If the node
 4109                      * following the combination is an EXACTF one, it's better to
 4110                      * join this trailing edge 's' node with that one, leaving the
 4111                      * current one in 'scan' be the more desirable EXACTFU */
 4112                     if (OP(nnext) == EXACTF) {
 4113                         break;
 4114                     }
 4115 
 4116                     OP(scan) = EXACTFU_S_EDGE;
 4117 
 4118                 }   /* Otherwise, the beginning 's' of the 2nd node just
 4119                        becomes an interior 's' in 'scan' */
 4120             }
 4121             else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
 4122                 ;   /* join is compatible, no need to change OP */
 4123             }
 4124             else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
 4125 
 4126                 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
 4127                  * nodes.  But the latter nodes can be also joined with EXACTFU
 4128                  * ones, and that is a better outcome, so if the node following
 4129                  * 'n' is EXACTFU, quit now so that those two can be joined
 4130                  * later */
 4131                 if (OP(nnext) == EXACTFU) {
 4132                     break;
 4133                 }
 4134 
 4135                 /* The join is compatible, and the combined node will be
 4136                  * EXACTF.  (These don't care if they begin or end with 's' */
 4137             }
 4138             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
 4139                 if (   STRING(scan)[STR_LEN(scan)-1] == 's'
 4140                     && STRING(n)[0] == 's')
 4141                 {
 4142                     /* When combined, we have the sequence 'ss', which means we
 4143                      * have to remain /di */
 4144                     OP(scan) = EXACTF;
 4145                 }
 4146             }
 4147             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
 4148                 if (STRING(n)[0] == 's') {
 4149                     ;   /* Here the join is compatible and the combined node
 4150                            starts with 's', no need to change OP */
 4151                 }
 4152                 else {  /* Now the trailing 's' is in the interior */
 4153                     OP(scan) = EXACTFU;
 4154                 }
 4155             }
 4156             else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
 4157 
 4158                 /* The join is compatible, and the combined node will be
 4159                  * EXACTF.  (These don't care if they begin or end with 's' */
 4160                 OP(scan) = EXACTF;
 4161             }
 4162             else if (OP(scan) != OP(n)) {
 4163 
 4164                 /* The only other compatible joinings are the same node type */
 4165                 break;
 4166             }
 4167 
 4168             DEBUG_PEEP("merg", n, depth, 0);
 4169             merged++;
 4170 
 4171             NEXT_OFF(scan) += NEXT_OFF(n);
 4172             STR_LEN(scan) += STR_LEN(n);
 4173             next = n + NODE_SZ_STR(n);
 4174             /* Now we can overwrite *n : */
 4175             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
 4176 #ifdef DEBUGGING
 4177             stop = next - 1;
 4178 #endif
 4179             n = nnext;
 4180             if (stopnow) break;
 4181         }
 4182 
 4183 #ifdef EXPERIMENTAL_INPLACESCAN
 4184     if (flags && !NEXT_OFF(n)) {
 4185         DEBUG_PEEP("atch", val, depth, 0);
 4186         if (reg_off_by_arg[OP(n)]) {
 4187         ARG_SET(n, val - n);
 4188         }
 4189         else {
 4190         NEXT_OFF(n) = val - n;
 4191         }
 4192         stopnow = 1;
 4193     }
 4194 #endif
 4195     }
 4196 
 4197     /* This temporary node can now be turned into EXACTFU, and must, as
 4198      * regexec.c doesn't handle it */
 4199     if (OP(scan) == EXACTFU_S_EDGE) {
 4200         OP(scan) = EXACTFU;
 4201     }
 4202 
 4203     *min_subtract = 0;
 4204     *unfolded_multi_char = FALSE;
 4205 
 4206     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
 4207      * can now analyze for sequences of problematic code points.  (Prior to
 4208      * this final joining, sequences could have been split over boundaries, and
 4209      * hence missed).  The sequences only happen in folding, hence for any
 4210      * non-EXACT EXACTish node */
 4211     if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
 4212         U8* s0 = (U8*) STRING(scan);
 4213         U8* s = s0;
 4214         U8* s_end = s0 + STR_LEN(scan);
 4215 
 4216         int total_count_delta = 0;  /* Total delta number of characters that
 4217                                        multi-char folds expand to */
 4218 
 4219     /* One pass is made over the node's string looking for all the
 4220      * possibilities.  To avoid some tests in the loop, there are two main
 4221      * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
 4222      * non-UTF-8 */
 4223     if (UTF) {
 4224             U8* folded = NULL;
 4225 
 4226             if (OP(scan) == EXACTFL) {
 4227                 U8 *d;
 4228 
 4229                 /* An EXACTFL node would already have been changed to another
 4230                  * node type unless there is at least one character in it that
 4231                  * is problematic; likely a character whose fold definition
 4232                  * won't be known until runtime, and so has yet to be folded.
 4233                  * For all but the UTF-8 locale, folds are 1-1 in length, but
 4234                  * to handle the UTF-8 case, we need to create a temporary
 4235                  * folded copy using UTF-8 locale rules in order to analyze it.
 4236                  * This is because our macros that look to see if a sequence is
 4237                  * a multi-char fold assume everything is folded (otherwise the
 4238                  * tests in those macros would be too complicated and slow).
 4239                  * Note that here, the non-problematic folds will have already
 4240                  * been done, so we can just copy such characters.  We actually
 4241                  * don't completely fold the EXACTFL string.  We skip the
 4242                  * unfolded multi-char folds, as that would just create work
 4243                  * below to figure out the size they already are */
 4244 
 4245                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
 4246                 d = folded;
 4247                 while (s < s_end) {
 4248                     STRLEN s_len = UTF8SKIP(s);
 4249                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
 4250                         Copy(s, d, s_len, U8);
 4251                         d += s_len;
 4252                     }
 4253                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
 4254                         *unfolded_multi_char = TRUE;
 4255                         Copy(s, d, s_len, U8);
 4256                         d += s_len;
 4257                     }
 4258                     else if (isASCII(*s)) {
 4259                         *(d++) = toFOLD(*s);
 4260                     }
 4261                     else {
 4262                         STRLEN len;
 4263                         _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
 4264                         d += len;
 4265                     }
 4266                     s += s_len;
 4267                 }
 4268 
 4269                 /* Point the remainder of the routine to look at our temporary
 4270                  * folded copy */
 4271                 s = folded;
 4272                 s_end = d;
 4273             } /* End of creating folded copy of EXACTFL string */
 4274 
 4275             /* Examine the string for a multi-character fold sequence.  UTF-8
 4276              * patterns have all characters pre-folded by the time this code is
 4277              * executed */
 4278             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
 4279                                      length sequence we are looking for is 2 */
 4280         {
 4281                 int count = 0;  /* How many characters in a multi-char fold */
 4282                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
 4283                 if (! len) {    /* Not a multi-char fold: get next char */
 4284                     s += UTF8SKIP(s);
 4285                     continue;
 4286                 }
 4287 
 4288                 { /* Here is a generic multi-char fold. */
 4289                     U8* multi_end  = s + len;
 4290 
 4291                     /* Count how many characters are in it.  In the case of
 4292                      * /aa, no folds which contain ASCII code points are
 4293                      * allowed, so check for those, and skip if found. */
 4294                     if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
 4295                         count = utf8_length(s, multi_end);
 4296                         s = multi_end;
 4297                     }
 4298                     else {
 4299                         while (s < multi_end) {
 4300                             if (isASCII(*s)) {
 4301                                 s++;
 4302                                 goto next_iteration;
 4303                             }
 4304                             else {
 4305                                 s += UTF8SKIP(s);
 4306                             }
 4307                             count++;
 4308                         }
 4309                     }
 4310                 }
 4311 
 4312                 /* The delta is how long the sequence is minus 1 (1 is how long
 4313                  * the character that folds to the sequence is) */
 4314                 total_count_delta += count - 1;
 4315               next_iteration: ;
 4316         }
 4317 
 4318             /* We created a temporary folded copy of the string in EXACTFL
 4319              * nodes.  Therefore we need to be sure it doesn't go below zero,
 4320              * as the real string could be shorter */
 4321             if (OP(scan) == EXACTFL) {
 4322                 int total_chars = utf8_length((U8*) STRING(scan),
 4323                                            (U8*) STRING(scan) + STR_LEN(scan));
 4324                 if (total_count_delta > total_chars) {
 4325                     total_count_delta = total_chars;
 4326                 }
 4327             }
 4328 
 4329             *min_subtract += total_count_delta;
 4330             Safefree(folded);
 4331     }
 4332     else if (OP(scan) == EXACTFAA) {
 4333 
 4334             /* Non-UTF-8 pattern, EXACTFAA node.  There can't be a multi-char
 4335              * fold to the ASCII range (and there are no existing ones in the
 4336              * upper latin1 range).  But, as outlined in the comments preceding
 4337              * this function, we need to flag any occurrences of the sharp s.
 4338              * This character forbids trie formation (because of added
 4339              * complexity) */
 4340 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
 4341    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
 4342                                       || UNICODE_DOT_DOT_VERSION > 0)
 4343         while (s < s_end) {
 4344                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
 4345                     OP(scan) = EXACTFAA_NO_TRIE;
 4346                     *unfolded_multi_char = TRUE;
 4347                     break;
 4348                 }
 4349                 s++;
 4350             }
 4351         }
 4352     else {
 4353 
 4354             /* Non-UTF-8 pattern, not EXACTFAA node.  Look for the multi-char
 4355              * folds that are all Latin1.  As explained in the comments
 4356              * preceding this function, we look also for the sharp s in EXACTF
 4357              * and EXACTFL nodes; it can be in the final position.  Otherwise
 4358              * we can stop looking 1 byte earlier because have to find at least
 4359              * two characters for a multi-fold */
 4360         const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
 4361                               ? s_end
 4362                               : s_end -1;
 4363 
 4364         while (s < upper) {
 4365                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
 4366                 if (! len) {    /* Not a multi-char fold. */
 4367                     if (*s == LATIN_SMALL_LETTER_SHARP_S
 4368                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
 4369                     {
 4370                         *unfolded_multi_char = TRUE;
 4371                     }
 4372                     s++;
 4373                     continue;
 4374                 }
 4375 
 4376                 if (len == 2
 4377                     && isALPHA_FOLD_EQ(*s, 's')
 4378                     && isALPHA_FOLD_EQ(*(s+1), 's'))
 4379                 {
 4380 
 4381                     /* EXACTF nodes need to know that the minimum length
 4382                      * changed so that a sharp s in the string can match this
 4383                      * ss in the pattern, but they remain EXACTF nodes, as they
 4384                      * won't match this unless the target string is is UTF-8,
 4385                      * which we don't know until runtime.  EXACTFL nodes can't
 4386                      * transform into EXACTFU nodes */
 4387                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
 4388                         OP(scan) = EXACTFUP;
 4389                     }
 4390         }
 4391 
 4392                 *min_subtract += len - 1;
 4393                 s += len;
 4394         }
 4395 #endif
 4396     }
 4397 
 4398         if (     STR_LEN(scan) == 1
 4399             &&   isALPHA_A(* STRING(scan))
 4400             &&  (         OP(scan) == EXACTFAA
 4401                  || (     OP(scan) == EXACTFU
 4402                      && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
 4403         {
 4404             U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
 4405 
 4406             /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
 4407              * with the mask set to the complement of the bit that differs
 4408              * between upper and lower case, and the lowest code point of the
 4409              * pair (which the '&' forces) */
 4410             OP(scan) = ANYOFM;
 4411             ARG_SET(scan, *STRING(scan) & mask);
 4412             FLAGS(scan) = mask;
 4413         }
 4414     }
 4415 
 4416 #ifdef DEBUGGING
 4417     /* Allow dumping but overwriting the collection of skipped
 4418      * ops and/or strings with fake optimized ops */
 4419     n = scan + NODE_SZ_STR(scan);
 4420     while (n <= stop) {
 4421     OP(n) = OPTIMIZED;
 4422     FLAGS(n) = 0;
 4423     NEXT_OFF(n) = 0;
 4424         n++;
 4425     }
 4426 #endif
 4427     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
 4428     return stopnow;
 4429 }
 4430 
 4431 /* REx optimizer.  Converts nodes into quicker variants "in place".
 4432    Finds fixed substrings.  */
 4433 
 4434 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
 4435    to the position after last scanned or to NULL. */
 4436 
 4437 #define INIT_AND_WITHP \
 4438     assert(!and_withp); \
 4439     Newx(and_withp, 1, regnode_ssc); \
 4440     SAVEFREEPV(and_withp)
 4441 
 4442 
 4443 static void
 4444 S_unwind_scan_frames(pTHX_ const void *p)
 4445 {
 4446     scan_frame *f= (scan_frame *)p;
 4447     do {
 4448         scan_frame *n= f->next_frame;
 4449         Safefree(f);
 4450         f= n;
 4451     } while (f);
 4452 }
 4453 
 4454 /* Follow the next-chain of the current node and optimize away
 4455    all the NOTHINGs from it.
 4456  */
 4457 STATIC void
 4458 S_rck_elide_nothing(pTHX_ regnode *node)
 4459 {
 4460     dVAR;
 4461 
 4462     PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
 4463 
 4464     if (OP(node) != CURLYX) {
 4465         const int max = (reg_off_by_arg[OP(node)]
 4466                         ? I32_MAX
 4467                           /* I32 may be smaller than U16 on CRAYs! */
 4468                         : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
 4469         int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node));
 4470         int noff;
 4471         regnode *n = node;
 4472 
 4473         /* Skip NOTHING and LONGJMP. */
 4474         while (
 4475             (n = regnext(n))
 4476             && (
 4477                 (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
 4478                 || ((OP(n) == LONGJMP) && (noff = ARG(n)))
 4479             )
 4480             && off + noff < max
 4481         ) {
 4482             off += noff;
 4483         }
 4484         if (reg_off_by_arg[OP(node)])
 4485             ARG(node) = off;
 4486         else
 4487             NEXT_OFF(node) = off;
 4488     }
 4489     return;
 4490 }
 4491 
 4492 /* the return from this sub is the minimum length that could possibly match */
 4493 STATIC SSize_t
 4494 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 4495                         SSize_t *minlenp, SSize_t *deltap,
 4496             regnode *last,
 4497             scan_data_t *data,
 4498             I32 stopparen,
 4499                         U32 recursed_depth,
 4500             regnode_ssc *and_withp,
 4501             U32 flags, U32 depth, bool was_mutate_ok)
 4502             /* scanp: Start here (read-write). */
 4503             /* deltap: Write maxlen-minlen here. */
 4504             /* last: Stop before this one. */
 4505             /* data: string data about the pattern */
 4506             /* stopparen: treat close N as END */
 4507             /* recursed: which subroutines have we recursed into */
 4508             /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
 4509 {
 4510     dVAR;
 4511     /* There must be at least this number of characters to match */
 4512     SSize_t min = 0;
 4513     I32 pars = 0, code;
 4514     regnode *scan = *scanp, *next;
 4515     SSize_t delta = 0;
 4516     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
 4517     int is_inf_internal = 0;        /* The studied chunk is infinite */
 4518     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
 4519     scan_data_t data_fake;
 4520     SV *re_trie_maxbuff = NULL;
 4521     regnode *first_non_open = scan;
 4522     SSize_t stopmin = SSize_t_MAX;
 4523     scan_frame *frame = NULL;
 4524     GET_RE_DEBUG_FLAGS_DECL;
 4525 
 4526     PERL_ARGS_ASSERT_STUDY_CHUNK;
 4527     RExC_study_started= 1;
 4528 
 4529     Zero(&data_fake, 1, scan_data_t);
 4530 
 4531     if ( depth == 0 ) {
 4532         while (first_non_open && OP(first_non_open) == OPEN)
 4533             first_non_open=regnext(first_non_open);
 4534     }
 4535 
 4536 
 4537   fake_study_recurse:
 4538     DEBUG_r(
 4539         RExC_study_chunk_recursed_count++;
 4540     );
 4541     DEBUG_OPTIMISE_MORE_r(
 4542     {
 4543         Perl_re_indentf( aTHX_  "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
 4544             depth, (long)stopparen,
 4545             (unsigned long)RExC_study_chunk_recursed_count,
 4546             (unsigned long)depth, (unsigned long)recursed_depth,
 4547             scan,
 4548             last);
 4549         if (recursed_depth) {
 4550             U32 i;
 4551             U32 j;
 4552             for ( j = 0 ; j < recursed_depth ; j++ ) {
 4553                 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
 4554                     if (
 4555                         PAREN_TEST(RExC_study_chunk_recursed +
 4556                                    ( j * RExC_study_chunk_recursed_bytes), i )
 4557                         && (
 4558                             !j ||
 4559                             !PAREN_TEST(RExC_study_chunk_recursed +
 4560                                    (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
 4561                         )
 4562                     ) {
 4563                         Perl_re_printf( aTHX_ " %d",(int)i);
 4564                         break;
 4565                     }
 4566                 }
 4567                 if ( j + 1 < recursed_depth ) {
 4568                     Perl_re_printf( aTHX_  ",");
 4569                 }
 4570             }
 4571         }
 4572         Perl_re_printf( aTHX_ "\n");
 4573     }
 4574     );
 4575     while ( scan && OP(scan) != END && scan < last ){
 4576         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
 4577                                    node length to get a real minimum (because
 4578                                    the folded version may be shorter) */
 4579     bool unfolded_multi_char = FALSE;
 4580         /* avoid mutating ops if we are anywhere within the recursed or
 4581          * enframed handling for a GOSUB: the outermost level will handle it.
 4582          */
 4583         bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
 4584     /* Peephole optimizer: */
 4585         DEBUG_STUDYDATA("Peep", data, depth, is_inf);
 4586         DEBUG_PEEP("Peep", scan, depth, flags);
 4587 
 4588 
 4589         /* The reason we do this here is that we need to deal with things like
 4590          * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
 4591          * parsing code, as each (?:..) is handled by a different invocation of
 4592          * reg() -- Yves
 4593          */
 4594         if (mutate_ok)
 4595             JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
 4596 
 4597         /* Follow the next-chain of the current node and optimize
 4598            away all the NOTHINGs from it.
 4599          */
 4600         rck_elide_nothing(scan);
 4601 
 4602     /* The principal pseudo-switch.  Cannot be a switch, since we
 4603        look into several different things.  */
 4604         if ( OP(scan) == DEFINEP ) {
 4605             SSize_t minlen = 0;
 4606             SSize_t deltanext = 0;
 4607             SSize_t fake_last_close = 0;
 4608             I32 f = SCF_IN_DEFINE;
 4609 
 4610             StructCopy(&zero_scan_data, &data_fake, scan_data_t);
 4611             scan = regnext(scan);
 4612             assert( OP(scan) == IFTHEN );
 4613             DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
 4614 
 4615             data_fake.last_closep= &fake_last_close;
 4616             minlen = *minlenp;
 4617             next = regnext(scan);
 4618             scan = NEXTOPER(NEXTOPER(scan));
 4619             DEBUG_PEEP("scan", scan, depth, flags);
 4620             DEBUG_PEEP("next", next, depth, flags);
 4621 
 4622             /* we suppose the run is continuous, last=next...
 4623              * NOTE we dont use the return here! */
 4624             /* DEFINEP study_chunk() recursion */
 4625             (void)study_chunk(pRExC_state, &scan, &minlen,
 4626                               &deltanext, next, &data_fake, stopparen,
 4627                               recursed_depth, NULL, f, depth+1, mutate_ok);
 4628 
 4629             scan = next;
 4630         } else
 4631         if (
 4632             OP(scan) == BRANCH  ||
 4633             OP(scan) == BRANCHJ ||
 4634             OP(scan) == IFTHEN
 4635         ) {
 4636         next = regnext(scan);
 4637         code = OP(scan);
 4638 
 4639             /* The op(next)==code check below is to see if we
 4640              * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
 4641              * IFTHEN is special as it might not appear in pairs.
 4642              * Not sure whether BRANCH-BRANCHJ is possible, regardless
 4643              * we dont handle it cleanly. */
 4644         if (OP(next) == code || code == IFTHEN) {
 4645                 /* NOTE - There is similar code to this block below for
 4646                  * handling TRIE nodes on a re-study.  If you change stuff here
 4647                  * check there too. */
 4648         SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
 4649         regnode_ssc accum;
 4650         regnode * const startbranch=scan;
 4651 
 4652                 if (flags & SCF_DO_SUBSTR) {
 4653                     /* Cannot merge strings after this. */
 4654                     scan_commit(pRExC_state, data, minlenp, is_inf);
 4655                 }
 4656 
 4657                 if (flags & SCF_DO_STCLASS)
 4658             ssc_init_zero(pRExC_state, &accum);
 4659 
 4660         while (OP(scan) == code) {
 4661             SSize_t deltanext, minnext, fake;
 4662             I32 f = 0;
 4663             regnode_ssc this_class;
 4664 
 4665                     DEBUG_PEEP("Branch", scan, depth, flags);
 4666 
 4667             num++;
 4668                     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
 4669             if (data) {
 4670             data_fake.whilem_c = data->whilem_c;
 4671             data_fake.last_closep = data->last_closep;
 4672             }
 4673             else
 4674             data_fake.last_closep = &fake;
 4675 
 4676             data_fake.pos_delta = delta;
 4677             next = regnext(scan);
 4678 
 4679                     scan = NEXTOPER(scan); /* everything */
 4680                     if (code != BRANCH)    /* everything but BRANCH */
 4681             scan = NEXTOPER(scan);
 4682 
 4683             if (flags & SCF_DO_STCLASS) {
 4684             ssc_init(pRExC_state, &this_class);
 4685             data_fake.start_class = &this_class;
 4686             f = SCF_DO_STCLASS_AND;
 4687             }
 4688             if (flags & SCF_WHILEM_VISITED_POS)
 4689             f |= SCF_WHILEM_VISITED_POS;
 4690 
 4691             /* we suppose the run is continuous, last=next...*/
 4692                     /* recurse study_chunk() for each BRANCH in an alternation */
 4693             minnext = study_chunk(pRExC_state, &scan, minlenp,
 4694                                       &deltanext, next, &data_fake, stopparen,
 4695                                       recursed_depth, NULL, f, depth+1,
 4696                                       mutate_ok);
 4697 
 4698             if (min1 > minnext)
 4699             min1 = minnext;
 4700             if (deltanext == SSize_t_MAX) {
 4701             is_inf = is_inf_internal = 1;
 4702             max1 = SSize_t_MAX;
 4703             } else if (max1 < minnext + deltanext)
 4704             max1 = minnext + deltanext;
 4705             scan = next;
 4706             if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
 4707             pars++;
 4708                 if (data_fake.flags & SCF_SEEN_ACCEPT) {
 4709                     if ( stopmin > minnext)
 4710                         stopmin = min + min1;
 4711                     flags &= ~SCF_DO_SUBSTR;
 4712                     if (data)
 4713                         data->flags |= SCF_SEEN_ACCEPT;
 4714                 }
 4715             if (data) {
 4716             if (data_fake.flags & SF_HAS_EVAL)
 4717                 data->flags |= SF_HAS_EVAL;
 4718             data->whilem_c = data_fake.whilem_c;
 4719             }
 4720             if (flags & SCF_DO_STCLASS)
 4721             ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
 4722         }
 4723         if (code == IFTHEN && num < 2) /* Empty ELSE branch */
 4724             min1 = 0;
 4725         if (flags & SCF_DO_SUBSTR) {
 4726             data->pos_min += min1;
 4727             if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
 4728                 data->pos_delta = SSize_t_MAX;
 4729             else
 4730                 data->pos_delta += max1 - min1;
 4731             if (max1 != min1 || is_inf)
 4732             data->cur_is_floating = 1;
 4733         }
 4734         min += min1;
 4735         if (delta == SSize_t_MAX
 4736          || SSize_t_MAX - delta - (max1 - min1) < 0)
 4737             delta = SSize_t_MAX;
 4738         else
 4739             delta += max1 - min1;
 4740         if (flags & SCF_DO_STCLASS_OR) {
 4741             ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
 4742             if (min1) {
 4743             ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
 4744             flags &= ~SCF_DO_STCLASS;
 4745             }
 4746         }
 4747         else if (flags & SCF_DO_STCLASS_AND) {
 4748             if (min1) {
 4749             ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
 4750             flags &= ~SCF_DO_STCLASS;
 4751             }
 4752             else {
 4753             /* Switch to OR mode: cache the old value of
 4754              * data->start_class */
 4755             INIT_AND_WITHP;
 4756             StructCopy(data->start_class, and_withp, regnode_ssc);
 4757             flags &= ~SCF_DO_STCLASS_AND;
 4758             StructCopy(&accum, data->start_class, regnode_ssc);
 4759             flags |= SCF_DO_STCLASS_OR;
 4760             }
 4761         }
 4762 
 4763                 if (PERL_ENABLE_TRIE_OPTIMISATION
 4764                     && OP(startbranch) == BRANCH
 4765                     && mutate_ok
 4766                 ) {
 4767         /* demq.
 4768 
 4769                    Assuming this was/is a branch we are dealing with: 'scan'
 4770                    now points at the item that follows the branch sequence,
 4771                    whatever it is. We now start at the beginning of the
 4772                    sequence and look for subsequences of
 4773 
 4774            BRANCH->EXACT=>x1
 4775            BRANCH->EXACT=>x2
 4776            tail
 4777 
 4778                    which would be constructed from a pattern like
 4779                    /A|LIST|OF|WORDS/
 4780 
 4781            If we can find such a subsequence we need to turn the first
 4782            element into a trie and then add the subsequent branch exact
 4783            strings to the trie.
 4784 
 4785            We have two cases
 4786 
 4787                      1. patterns where the whole set of branches can be
 4788                         converted.
 4789 
 4790              2. patterns where only a subset can be converted.
 4791 
 4792            In case 1 we can replace the whole set with a single regop
 4793            for the trie. In case 2 we need to keep the start and end
 4794            branches so
 4795 
 4796              'BRANCH EXACT; BRANCH EXACT; BRANCH X'
 4797              becomes BRANCH TRIE; BRANCH X;
 4798 
 4799           There is an additional case, that being where there is a
 4800           common prefix, which gets split out into an EXACT like node
 4801           preceding the TRIE node.
 4802 
 4803           If x(1..n)==tail then we can do a simple trie, if not we make
 4804           a "jump" trie, such that when we match the appropriate word
 4805           we "jump" to the appropriate tail node. Essentially we turn
 4806           a nested if into a case structure of sorts.
 4807 
 4808         */
 4809 
 4810             int made=0;
 4811             if (!re_trie_maxbuff) {
 4812             re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
 4813             if (!SvIOK(re_trie_maxbuff))
 4814                 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
 4815             }
 4816                     if ( SvIV(re_trie_maxbuff)>=0  ) {
 4817                         regnode *cur;
 4818                         regnode *first = (regnode *)NULL;
 4819                         regnode *last = (regnode *)NULL;
 4820                         regnode *tail = scan;
 4821                         U8 trietype = 0;
 4822                         U32 count=0;
 4823 
 4824                         /* var tail is used because there may be a TAIL
 4825                            regop in the way. Ie, the exacts will point to the
 4826                            thing following the TAIL, but the last branch will
 4827                            point at the TAIL. So we advance tail. If we
 4828                            have nested (?:) we may have to move through several
 4829                            tails.
 4830                          */
 4831 
 4832                         while ( OP( tail ) == TAIL ) {
 4833                             /* this is the TAIL generated by (?:) */
 4834                             tail = regnext( tail );
 4835                         }
 4836 
 4837 
 4838                         DEBUG_TRIE_COMPILE_r({
 4839                             regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
 4840                             Perl_re_indentf( aTHX_  "%s %" UVuf ":%s\n",
 4841                               depth+1,
 4842                               "Looking for TRIE'able sequences. Tail node is ",
 4843                               (UV) REGNODE_OFFSET(tail),
 4844                               SvPV_nolen_const( RExC_mysv )
 4845                             );
 4846                         });
 4847 
 4848                         /*
 4849 
 4850                             Step through the branches
 4851                                 cur represents each branch,
 4852                                 noper is the first thing to be matched as part
 4853                                       of that branch
 4854                                 noper_next is the regnext() of that node.
 4855 
 4856                             We normally handle a case like this
 4857                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
 4858                             support building with NOJUMPTRIE, which restricts
 4859                             the trie logic to structures like /FOO|BAR/.
 4860 
 4861                             If noper is a trieable nodetype then the branch is
 4862                             a possible optimization target. If we are building
 4863                             under NOJUMPTRIE then we require that noper_next is
 4864                             the same as scan (our current position in the regex
 4865                             program).
 4866 
 4867                             Once we have two or more consecutive such branches
 4868                             we can create a trie of the EXACT's contents and
 4869                             stitch it in place into the program.
 4870 
 4871                             If the sequence represents all of the branches in
 4872                             the alternation we replace the entire thing with a
 4873                             single TRIE node.
 4874 
 4875                             Otherwise when it is a subsequence we need to
 4876                             stitch it in place and replace only the relevant
 4877                             branches. This means the first branch has to remain
 4878                             as it is used by the alternation logic, and its
 4879                             next pointer, and needs to be repointed at the item
 4880                             on the branch chain following the last branch we
 4881                             have optimized away.
 4882 
 4883                             This could be either a BRANCH, in which case the
 4884                             subsequence is internal, or it could be the item
 4885                             following the branch sequence in which case the
 4886                             subsequence is at the end (which does not
 4887                             necessarily mean the first node is the start of the
 4888                             alternation).
 4889 
 4890                             TRIE_TYPE(X) is a define which maps the optype to a
 4891                             trietype.
 4892 
 4893                                 optype          |  trietype
 4894                                 ----------------+-----------
 4895                                 NOTHING         | NOTHING
 4896                                 EXACT           | EXACT
 4897                                 EXACT_ONLY8     | EXACT
 4898                                 EXACTFU         | EXACTFU
 4899                                 EXACTFU_ONLY8   | EXACTFU
 4900                                 EXACTFUP        | EXACTFU
 4901                                 EXACTFAA        | EXACTFAA
 4902                                 EXACTL          | EXACTL
 4903                                 EXACTFLU8       | EXACTFLU8
 4904 
 4905 
 4906                         */
 4907 #define TRIE_TYPE(X) ( ( NOTHING == (X) )                                   \
 4908                        ? NOTHING                                            \
 4909                        : ( EXACT == (X) || EXACT_ONLY8 == (X) )             \
 4910                          ? EXACT                                            \
 4911                          : (     EXACTFU == (X)                             \
 4912                               || EXACTFU_ONLY8 == (X)                       \
 4913                               || EXACTFUP == (X) )                          \
 4914                            ? EXACTFU                                        \
 4915                            : ( EXACTFAA == (X) )                            \
 4916                              ? EXACTFAA                                     \
 4917                              : ( EXACTL == (X) )                            \
 4918                                ? EXACTL                                     \
 4919                                : ( EXACTFLU8 == (X) )                       \
 4920                                  ? EXACTFLU8                                \
 4921                                  : 0 )
 4922 
 4923                         /* dont use tail as the end marker for this traverse */
 4924                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
 4925                             regnode * const noper = NEXTOPER( cur );
 4926                             U8 noper_type = OP( noper );
 4927                             U8 noper_trietype = TRIE_TYPE( noper_type );
 4928 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
 4929                             regnode * const noper_next = regnext( noper );
 4930                             U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
 4931                             U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
 4932 #endif
 4933 
 4934                             DEBUG_TRIE_COMPILE_r({
 4935                                 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
 4936                                 Perl_re_indentf( aTHX_  "- %d:%s (%d)",
 4937                                    depth+1,
 4938                                    REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
 4939 
 4940                                 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
 4941                                 Perl_re_printf( aTHX_  " -> %d:%s",
 4942                                     REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
 4943 
 4944                                 if ( noper_next ) {
 4945                                   regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
 4946                                   Perl_re_printf( aTHX_ "\t=> %d:%s\t",
 4947                                     REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
 4948                                 }
 4949                                 Perl_re_printf( aTHX_  "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
 4950                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
 4951                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
 4952                 );
 4953                             });
 4954 
 4955                             /* Is noper a trieable nodetype that can be merged
 4956                              * with the current trie (if there is one)? */
 4957                             if ( noper_trietype
 4958                                   &&
 4959                                   (
 4960                                         ( noper_trietype == NOTHING )
 4961                                         || ( trietype == NOTHING )
 4962                                         || ( trietype == noper_trietype )
 4963                                   )
 4964 #ifdef NOJUMPTRIE
 4965                                   && noper_next >= tail
 4966 #endif
 4967                                   && count < U16_MAX)
 4968                             {
 4969                                 /* Handle mergable triable node Either we are
 4970                                  * the first node in a new trieable sequence,
 4971                                  * in which case we do some bookkeeping,
 4972                                  * otherwise we update the end pointer. */
 4973                                 if ( !first ) {
 4974                                     first = cur;
 4975                     if ( noper_trietype == NOTHING ) {
 4976 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
 4977                     regnode * const noper_next = regnext( noper );
 4978                                         U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
 4979                     U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
 4980 #endif
 4981 
 4982                                         if ( noper_next_trietype ) {
 4983                         trietype = noper_next_trietype;
 4984                                         } else if (noper_next_type)  {
 4985                                             /* a NOTHING regop is 1 regop wide.
 4986                                              * We need at least two for a trie
 4987                                              * so we can't merge this in */
 4988                                             first = NULL;
 4989                                         }
 4990                                     } else {
 4991                                         trietype = noper_trietype;
 4992                                     }
 4993                                 } else {
 4994                                     if ( trietype == NOTHING )
 4995                                         trietype = noper_trietype;
 4996                                     last = cur;
 4997                                 }
 4998                 if (first)
 4999                     count++;
 5000                             } /* end handle mergable triable node */
 5001                             else {
 5002                                 /* handle unmergable node -
 5003                                  * noper may either be a triable node which can
 5004                                  * not be tried together with the current trie,
 5005                                  * or a non triable node */
 5006                                 if ( last ) {
 5007                                     /* If last is set and trietype is not
 5008                                      * NOTHING then we have found at least two
 5009                                      * triable branch sequences in a row of a
 5010                                      * similar trietype so we can turn them
 5011                                      * into a trie. If/when we allow NOTHING to
 5012                                      * start a trie sequence this condition
 5013                                      * will be required, and it isn't expensive
 5014                                      * so we leave it in for now. */
 5015                                     if ( trietype && trietype != NOTHING )
 5016                                         make_trie( pRExC_state,
 5017                                                 startbranch, first, cur, tail,
 5018                                                 count, trietype, depth+1 );
 5019                                     last = NULL; /* note: we clear/update
 5020                                                     first, trietype etc below,
 5021                                                     so we dont do it here */
 5022                                 }
 5023                                 if ( noper_trietype
 5024 #ifdef NOJUMPTRIE
 5025                                      && noper_next >= tail
 5026 #endif
 5027                                 ){
 5028                                     /* noper is triable, so we can start a new
 5029                                      * trie sequence */
 5030                                     count = 1;
 5031                                     first = cur;
 5032                                     trietype = noper_trietype;
 5033                                 } else if (first) {
 5034                                     /* if we already saw a first but the
 5035                                      * current node is not triable then we have
 5036                                      * to reset the first information. */
 5037                                     count = 0;
 5038                                     first = NULL;
 5039                                     trietype = 0;
 5040                                 }
 5041                             } /* end handle unmergable node */
 5042                         } /* loop over branches */
 5043                         DEBUG_TRIE_COMPILE_r({
 5044                             regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
 5045                             Perl_re_indentf( aTHX_  "- %s (%d) <SCAN FINISHED> ",
 5046                               depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
 5047                             Perl_re_printf( aTHX_  "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
 5048                                REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
 5049                                PL_reg_name[trietype]
 5050                             );
 5051 
 5052                         });
 5053                         if ( last && trietype ) {
 5054                             if ( trietype != NOTHING ) {
 5055                                 /* the last branch of the sequence was part of
 5056                                  * a trie, so we have to construct it here
 5057                                  * outside of the loop */
 5058                                 made= make_trie( pRExC_state, startbranch,
 5059                                                  first, scan, tail, count,
 5060                                                  trietype, depth+1 );
 5061 #ifdef TRIE_STUDY_OPT
 5062                                 if ( ((made == MADE_EXACT_TRIE &&
 5063                                      startbranch == first)
 5064                                      || ( first_non_open == first )) &&
 5065                                      depth==0 ) {
 5066                                     flags |= SCF_TRIE_RESTUDY;
 5067                                     if ( startbranch == first
 5068                                          && scan >= tail )
 5069                                     {
 5070                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
 5071                                     }
 5072                                 }
 5073 #endif
 5074                             } else {
 5075                                 /* at this point we know whatever we have is a
 5076                                  * NOTHING sequence/branch AND if 'startbranch'
 5077                                  * is 'first' then we can turn the whole thing
 5078                                  * into a NOTHING
 5079                                  */
 5080                                 if ( startbranch == first ) {
 5081                                     regnode *opt;
 5082                                     /* the entire thing is a NOTHING sequence,
 5083                                      * something like this: (?:|) So we can
 5084                                      * turn it into a plain NOTHING op. */
 5085                                     DEBUG_TRIE_COMPILE_r({
 5086                                         regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
 5087                                         Perl_re_indentf( aTHX_  "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
 5088                                           depth+1,
 5089                                           SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
 5090 
 5091                                     });
 5092                                     OP(startbranch)= NOTHING;
 5093                                     NEXT_OFF(startbranch)= tail - startbranch;
 5094                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
 5095                                         OP(opt)= OPTIMIZED;
 5096                                 }
 5097                             }
 5098                         } /* end if ( last) */
 5099                     } /* TRIE_MAXBUF is non zero */
 5100 
 5101                 } /* do trie */
 5102 
 5103         }
 5104         else if ( code == BRANCHJ ) {  /* single branch is optimized. */
 5105         scan = NEXTOPER(NEXTOPER(scan));
 5106         } else          /* single branch is optimized. */
 5107         scan = NEXTOPER(scan);
 5108         continue;
 5109         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
 5110             I32 paren = 0;
 5111             regnode *start = NULL;
 5112             regnode *end = NULL;
 5113             U32 my_recursed_depth= recursed_depth;
 5114 
 5115             if (OP(scan) != SUSPEND) { /* GOSUB */
 5116                 /* Do setup, note this code has side effects beyond
 5117                  * the rest of this block. Specifically setting
 5118                  * RExC_recurse[] must happen at least once during
 5119                  * study_chunk(). */
 5120                 paren = ARG(scan);
 5121                 RExC_recurse[ARG2L(scan)] = scan;
 5122                 start = REGNODE_p(RExC_open_parens[paren]);
 5123                 end   = REGNODE_p(RExC_close_parens[paren]);
 5124 
 5125                 /* NOTE we MUST always execute the above code, even
 5126                  * if we do nothing with a GOSUB */
 5127                 if (
 5128                     ( flags & SCF_IN_DEFINE )
 5129                     ||
 5130                     (
 5131                         (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
 5132                         &&
 5133                         ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
 5134                     )
 5135                 ) {
 5136                     /* no need to do anything here if we are in a define. */
 5137                     /* or we are after some kind of infinite construct
 5138                      * so we can skip recursing into this item.
 5139                      * Since it is infinite we will not change the maxlen
 5140                      * or delta, and if we miss something that might raise
 5141                      * the minlen it will merely pessimise a little.
 5142                      *
 5143                      * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
 5144                      * might result in a minlen of 1 and not of 4,
 5145                      * but this doesn't make us mismatch, just try a bit
 5146                      * harder than we should.
 5147                      * */
 5148                     scan= regnext(scan);
 5149                     continue;
 5150                 }
 5151 
 5152                 if (
 5153                     !recursed_depth
 5154                     ||
 5155                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
 5156                 ) {
 5157                     /* it is quite possible that there are more efficient ways
 5158                      * to do this. We maintain a bitmap per level of recursion
 5159                      * of which patterns we have entered so we can detect if a
 5160                      * pattern creates a possible infinite loop. When we
 5161                      * recurse down a level we copy the previous levels bitmap
 5162                      * down. When we are at recursion level 0 we zero the top
 5163                      * level bitmap. It would be nice to implement a different
 5164                      * more efficient way of doing this. In particular the top
 5165                      * level bitmap may be unnecessary.
 5166                      */
 5167                     if (!recursed_depth) {
 5168                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
 5169                     } else {
 5170                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
 5171                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
 5172                              RExC_study_chunk_recursed_bytes, U8);
 5173                     }
 5174                     /* we havent recursed into this paren yet, so recurse into it */
 5175                     DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
 5176                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
 5177                     my_recursed_depth= recursed_depth + 1;
 5178                 } else {
 5179                     DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
 5180                     /* some form of infinite recursion, assume infinite length
 5181                      * */
 5182                     if (flags & SCF_DO_SUBSTR) {
 5183                         scan_commit(pRExC_state, data, minlenp, is_inf);
 5184                         data->cur_is_floating = 1;
 5185                     }
 5186                     is_inf = is_inf_internal = 1;
 5187                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
 5188                         ssc_anything(data->start_class);
 5189                     flags &= ~SCF_DO_STCLASS;
 5190 
 5191                     start= NULL; /* reset start so we dont recurse later on. */
 5192             }
 5193             } else {
 5194             paren = stopparen;
 5195                 start = scan + 2;
 5196             end = regnext(scan);
 5197         }
 5198             if (start) {
 5199                 scan_frame *newframe;
 5200                 assert(end);
 5201                 if (!RExC_frame_last) {
 5202                     Newxz(newframe, 1, scan_frame);
 5203                     SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
 5204                     RExC_frame_head= newframe;
 5205                     RExC_frame_count++;
 5206                 } else if (!RExC_frame_last->next_frame) {
 5207                     Newxz(newframe, 1, scan_frame);
 5208                     RExC_frame_last->next_frame= newframe;
 5209                     newframe->prev_frame= RExC_frame_last;
 5210                     RExC_frame_count++;
 5211                 } else {
 5212                     newframe= RExC_frame_last->next_frame;
 5213                 }
 5214                 RExC_frame_last= newframe;
 5215 
 5216                 newframe->next_regnode = regnext(scan);
 5217                 newframe->last_regnode = last;
 5218                 newframe->stopparen = stopparen;
 5219                 newframe->prev_recursed_depth = recursed_depth;
 5220                 newframe->this_prev_frame= frame;
 5221                 newframe->in_gosub = (
 5222                     (frame && frame->in_gosub) || OP(scan) == GOSUB
 5223                 );
 5224 
 5225                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
 5226                 DEBUG_PEEP("fnew", scan, depth, flags);
 5227 
 5228             frame = newframe;
 5229             scan =  start;
 5230             stopparen = paren;
 5231             last = end;
 5232                 depth = depth + 1;
 5233                 recursed_depth= my_recursed_depth;
 5234 
 5235             continue;
 5236         }
 5237     }
 5238     else if (   OP(scan) == EXACT
 5239                  || OP(scan) == EXACT_ONLY8
 5240                  || OP(scan) == EXACTL)
 5241         {
 5242         SSize_t l = STR_LEN(scan);
 5243         UV uc;
 5244             assert(l);
 5245         if (UTF) {
 5246         const U8 * const s = (U8*)STRING(scan);
 5247         uc = utf8_to_uvchr_buf(s, s + l, NULL);
 5248         l = utf8_length(s, s + l);
 5249         } else {
 5250         uc = *((U8*)STRING(scan));
 5251         }
 5252         min += l;
 5253         if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
 5254         /* The code below prefers earlier match for fixed
 5255            offset, later match for variable offset.  */
 5256         if (data->last_end == -1) { /* Update the start info. */
 5257             data->last_start_min = data->pos_min;
 5258             data->last_start_max = is_inf
 5259             ? SSize_t_MAX : data->pos_min + data->pos_delta;
 5260         }
 5261         sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
 5262         if (UTF)
 5263             SvUTF8_on(data->last_found);
 5264         {
 5265             SV * const sv = data->last_found;
 5266             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
 5267             mg_find(sv, PERL_MAGIC_utf8) : NULL;
 5268             if (mg && mg->mg_len >= 0)
 5269             mg->mg_len += utf8_length((U8*)STRING(scan),
 5270                                               (U8*)STRING(scan)+STR_LEN(scan));
 5271         }
 5272         data->last_end = data->pos_min + l;
 5273         data->pos_min += l; /* As in the first entry. */
 5274         data->flags &= ~SF_BEFORE_EOL;
 5275         }
 5276 
 5277             /* ANDing the code point leaves at most it, and not in locale, and
 5278              * can't match null string */
 5279         if (flags & SCF_DO_STCLASS_AND) {
 5280                 ssc_cp_and(data->start_class, uc);
 5281                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
 5282                 ssc_clear_locale(data->start_class);
 5283         }
 5284         else if (flags & SCF_DO_STCLASS_OR) {
 5285                 ssc_add_cp(data->start_class, uc);
 5286         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
 5287 
 5288                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
 5289                 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
 5290         }
 5291         flags &= ~SCF_DO_STCLASS;
 5292     }
 5293         else if (PL_regkind[OP(scan)] == EXACT) {
 5294             /* But OP != EXACT!, so is EXACTFish */
 5295         SSize_t l = STR_LEN(scan);
 5296             const U8 * s = (U8*)STRING(scan);
 5297 
 5298         /* Search for fixed substrings supports EXACT only. */
 5299         if (flags & SCF_DO_SUBSTR) {
 5300         assert(data);
 5301                 scan_commit(pRExC_state, data, minlenp, is_inf);
 5302         }
 5303         if (UTF) {
 5304         l = utf8_length(s, s + l);
 5305         }
 5306         if (unfolded_multi_char) {
 5307                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
 5308         }
 5309         min += l - min_subtract;
 5310             assert (min >= 0);
 5311             delta += min_subtract;
 5312         if (flags & SCF_DO_SUBSTR) {
 5313         data->pos_min += l - min_subtract;
 5314         if (data->pos_min < 0) {
 5315                     data->pos_min = 0;
 5316                 }
 5317                 data->pos_delta += min_subtract;
 5318         if (min_subtract) {
 5319             data->cur_is_floating = 1; /* float */
 5320         }
 5321         }
 5322 
 5323             if (flags & SCF_DO_STCLASS) {
 5324                 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
 5325 
 5326                 assert(EXACTF_invlist);
 5327                 if (flags & SCF_DO_STCLASS_AND) {
 5328                     if (OP(scan) != EXACTFL)
 5329                         ssc_clear_locale(data->start_class);
 5330                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
 5331                     ANYOF_POSIXL_ZERO(data->start_class);
 5332                     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
 5333                 }
 5334                 else {  /* SCF_DO_STCLASS_OR */
 5335                     ssc_union(data->start_class, EXACTF_invlist, FALSE);
 5336                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
 5337 
 5338                     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
 5339                     ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
 5340                 }
 5341                 flags &= ~SCF_DO_STCLASS;
 5342                 SvREFCNT_dec(EXACTF_invlist);
 5343             }
 5344     }
 5345     else if (REGNODE_VARIES(OP(scan))) {
 5346         SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
 5347         I32 fl = 0, f = flags;
 5348         regnode * const oscan = scan;
 5349         regnode_ssc this_class;
 5350         regnode_ssc *oclass = NULL;
 5351         I32 next_is_eval = 0;
 5352 
 5353         switch (PL_regkind[OP(scan)]) {
 5354         case WHILEM:        /* End of (?:...)* . */
 5355         scan = NEXTOPER(scan);
 5356         goto finish;
 5357         case PLUS:
 5358         if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
 5359             next = NEXTOPER(scan);
 5360             if (   OP(next) == EXACT
 5361                         || OP(next) == EXACT_ONLY8
 5362                         || OP(next) == EXACTL
 5363                         || (flags & SCF_DO_STCLASS))
 5364                     {
 5365             mincount = 1;
 5366             maxcount = REG_INFTY;
 5367             next = regnext(scan);
 5368             scan = NEXTOPER(scan);
 5369             goto do_curly;
 5370             }
 5371         }
 5372         if (flags & SCF_DO_SUBSTR)
 5373             data->pos_min++;
 5374         min++;
 5375         /* FALLTHROUGH */
 5376         case STAR:
 5377                 next = NEXTOPER(scan);
 5378 
 5379                 /* This temporary node can now be turned into EXACTFU, and
 5380                  * must, as regexec.c doesn't handle it */
 5381                 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
 5382                     OP(next) = EXACTFU;
 5383                 }
 5384 
 5385                 if (     STR_LEN(next) == 1
 5386                     &&   isALPHA_A(* STRING(next))
 5387                     && (         OP(next) == EXACTFAA
 5388                         || (     OP(next) == EXACTFU
 5389                             && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
 5390                     &&   mutate_ok
 5391                 ) {
 5392                     /* These differ in just one bit */
 5393                     U8 mask = ~ ('A' ^ 'a');
 5394 
 5395                     assert(isALPHA_A(* STRING(next)));
 5396 
 5397                     /* Then replace it by an ANYOFM node, with
 5398                     * the mask set to the complement of the
 5399                     * bit that differs between upper and lower
 5400                     * case, and the lowest code point of the
 5401                     * pair (which the '&' forces) */
 5402                     OP(next) = ANYOFM;
 5403                     ARG_SET(next, *STRING(next) & mask);
 5404                     FLAGS(next) = mask;
 5405                 }
 5406 
 5407         if (flags & SCF_DO_STCLASS) {
 5408             mincount = 0;
 5409             maxcount = REG_INFTY;
 5410             next = regnext(scan);
 5411             scan = NEXTOPER(scan);
 5412             goto do_curly;
 5413         }
 5414         if (flags & SCF_DO_SUBSTR) {
 5415                     scan_commit(pRExC_state, data, minlenp, is_inf);
 5416                     /* Cannot extend fixed substrings */
 5417             data->cur_is_floating = 1; /* float */
 5418         }
 5419                 is_inf = is_inf_internal = 1;
 5420                 scan = regnext(scan);
 5421         goto optimize_curly_tail;
 5422         case CURLY:
 5423             if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
 5424                 && (scan->flags == stopparen))
 5425         {
 5426             mincount = 1;
 5427             maxcount = 1;
 5428         } else {
 5429             mincount = ARG1(scan);
 5430             maxcount = ARG2(scan);
 5431         }
 5432         next = regnext(scan);
 5433         if (OP(scan) == CURLYX) {
 5434             I32 lp = (data ? *(data->last_closep) : 0);
 5435             scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
 5436         }
 5437         scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
 5438         next_is_eval = (OP(scan) == EVAL);
 5439           do_curly:
 5440         if (flags & SCF_DO_SUBSTR) {
 5441                     if (mincount == 0)
 5442                         scan_commit(pRExC_state, data, minlenp, is_inf);
 5443                     /* Cannot extend fixed substrings */
 5444             pos_before = data->pos_min;
 5445         }
 5446         if (data) {
 5447             fl = data->flags;
 5448             data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
 5449             if (is_inf)
 5450             data->flags |= SF_IS_INF;
 5451         }
 5452         if (flags & SCF_DO_STCLASS) {
 5453             ssc_init(pRExC_state, &this_class);
 5454             oclass = data->start_class;
 5455             data->start_class = &this_class;
 5456             f |= SCF_DO_STCLASS_AND;
 5457             f &= ~SCF_DO_STCLASS_OR;
 5458         }
 5459             /* Exclude from super-linear cache processing any {n,m}
 5460            regops for which the combination of input pos and regex
 5461            pos is not enough information to determine if a match
 5462            will be possible.
 5463 
 5464            For example, in the regex /foo(bar\s*){4,8}baz/ with the
 5465            regex pos at the \s*, the prospects for a match depend not
 5466            only on the input position but also on how many (bar\s*)
 5467            repeats into the {4,8} we are. */
 5468                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
 5469             f &= ~SCF_WHILEM_VISITED_POS;
 5470 
 5471         /* This will finish on WHILEM, setting scan, or on NULL: */
 5472                 /* recurse study_chunk() on loop bodies */
 5473         minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
 5474                                   last, data, stopparen, recursed_depth, NULL,
 5475                                   (mincount == 0
 5476                                    ? (f & ~SCF_DO_SUBSTR)
 5477                                    : f)
 5478                                   , depth+1, mutate_ok);
 5479 
 5480         if (flags & SCF_DO_STCLASS)
 5481             data->start_class = oclass;
 5482         if (mincount == 0 || minnext == 0) {
 5483             if (flags & SCF_DO_STCLASS_OR) {
 5484             ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
 5485             }
 5486             else if (flags & SCF_DO_STCLASS_AND) {
 5487             /* Switch to OR mode: cache the old value of
 5488              * data->start_class */
 5489             INIT_AND_WITHP;
 5490             StructCopy(data->start_class, and_withp, regnode_ssc);
 5491             flags &= ~SCF_DO_STCLASS_AND;
 5492             StructCopy(&this_class, data->start_class, regnode_ssc);
 5493             flags |= SCF_DO_STCLASS_OR;
 5494                         ANYOF_FLAGS(data->start_class)
 5495                                                 |= SSC_MATCHES_EMPTY_STRING;
 5496             }
 5497         } else {        /* Non-zero len */
 5498             if (flags & SCF_DO_STCLASS_OR) {
 5499             ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
 5500             ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
 5501             }
 5502             else if (flags & SCF_DO_STCLASS_AND)
 5503             ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
 5504             flags &= ~SCF_DO_STCLASS;
 5505         }
 5506         if (!scan)      /* It was not CURLYX, but CURLY. */
 5507             scan = next;
 5508         if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
 5509             /* ? quantifier ok, except for (?{ ... }) */
 5510             && (next_is_eval || !(mincount == 0 && maxcount == 1))
 5511             && (minnext == 0) && (deltanext == 0)
 5512             && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
 5513                     && maxcount <= REG_INFTY/3) /* Complement check for big
 5514                                                    count */
 5515         {
 5516             _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
 5517                         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
 5518                             "Quantifier unexpected on zero-length expression "
 5519                             "in regex m/%" UTF8f "/",
 5520                  UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
 5521                   RExC_precomp)));
 5522                 }
 5523 
 5524                 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
 5525                     || min >= SSize_t_MAX - minnext * mincount )
 5526                 {
 5527                     FAIL("Regexp out of space");
 5528                 }
 5529 
 5530         min += minnext * mincount;
 5531         is_inf_internal |= deltanext == SSize_t_MAX
 5532                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
 5533         is_inf |= is_inf_internal;
 5534                 if (is_inf) {
 5535             delta = SSize_t_MAX;
 5536                 } else {
 5537             delta += (minnext + deltanext) * maxcount
 5538                              - minnext * mincount;
 5539                 }
 5540         /* Try powerful optimization CURLYX => CURLYN. */
 5541         if (  OP(oscan) == CURLYX && data
 5542               && data->flags & SF_IN_PAR
 5543               && !(data->flags & SF_HAS_EVAL)
 5544               && !deltanext && minnext == 1
 5545                       && mutate_ok
 5546                 ) {
 5547             /* Try to optimize to CURLYN.  */
 5548             regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
 5549             regnode * const nxt1 = nxt;
 5550 #ifdef DEBUGGING
 5551             regnode *nxt2;
 5552 #endif
 5553 
 5554             /* Skip open. */
 5555             nxt = regnext(nxt);
 5556             if (!REGNODE_SIMPLE(OP(nxt))
 5557             && !(PL_regkind[OP(nxt)] == EXACT
 5558                  && STR_LEN(nxt) == 1))
 5559             goto nogo;
 5560 #ifdef DEBUGGING
 5561             nxt2 = nxt;
 5562 #endif
 5563             nxt = regnext(nxt);
 5564             if (OP(nxt) != CLOSE)
 5565             goto nogo;
 5566             if (RExC_open_parens) {
 5567 
 5568                         /*open->CURLYM*/
 5569                         RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
 5570 
 5571                         /*close->while*/
 5572                         RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
 5573             }
 5574             /* Now we know that nxt2 is the only contents: */
 5575             oscan->flags = (U8)ARG(nxt);
 5576             OP(oscan) = CURLYN;
 5577             OP(nxt1) = NOTHING; /* was OPEN. */
 5578 
 5579 #ifdef DEBUGGING
 5580             OP(nxt1 + 1) = OPTIMIZED; /* was count. */
 5581             NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
 5582             NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
 5583             OP(nxt) = OPTIMIZED;    /* was CLOSE. */
 5584             OP(nxt + 1) = OPTIMIZED; /* was count. */
 5585             NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
 5586 #endif
 5587         }
 5588           nogo:
 5589 
 5590         /* Try optimization CURLYX => CURLYM. */
 5591         if (  OP(oscan) == CURLYX && data
 5592               && !(data->flags & SF_HAS_PAR)
 5593               && !(data->flags & SF_HAS_EVAL)
 5594               && !deltanext /* atom is fixed width */
 5595               && minnext != 0   /* CURLYM can't handle zero width */
 5596                          /* Nor characters whose fold at run-time may be
 5597                           * multi-character */
 5598                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
 5599                       && mutate_ok
 5600         ) {
 5601             /* XXXX How to optimize if data == 0? */
 5602             /* Optimize to a simpler form.  */
 5603             regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
 5604             regnode *nxt2;
 5605 
 5606             OP(oscan) = CURLYM;
 5607             while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
 5608                 && (OP