"Fossies" - the Fresh Open Source Software Archive

Member "tin-2.4.1/pcre/pcre_dfa_exec.c" (28 Aug 2013, 80853 Bytes) of archive /linux/misc/tin-2.4.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "pcre_dfa_exec.c", see the Fossies "Dox" file reference documentation.

    1 /*************************************************
    2 *      Perl-Compatible Regular Expressions       *
    3 *************************************************/
    4 
    5 /* PCRE is a library of functions to support regular expressions whose syntax
    6 and semantics are as close as possible to those of the Perl 5 language.
    7 
    8                        Written by Philip Hazel
    9            Copyright (c) 1997-2006 University of Cambridge
   10 
   11 -----------------------------------------------------------------------------
   12 Redistribution and use in source and binary forms, with or without
   13 modification, are permitted provided that the following conditions are met:
   14 
   15     * Redistributions of source code must retain the above copyright notice,
   16       this list of conditions and the following disclaimer.
   17 
   18     * Redistributions in binary form must reproduce the above copyright
   19       notice, this list of conditions and the following disclaimer in the
   20       documentation and/or other materials provided with the distribution.
   21 
   22     * Neither the name of the University of Cambridge nor the names of its
   23       contributors may be used to endorse or promote products derived from
   24       this software without specific prior written permission.
   25 
   26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   36 POSSIBILITY OF SUCH DAMAGE.
   37 -----------------------------------------------------------------------------
   38 */
   39 
   40 
   41 /* This module contains the external function pcre_dfa_exec(), which is an
   42 alternative matching function that uses a sort of DFA algorithm (not a true
   43 FSM). This is NOT Perl-compatible, but it has advantages in certain
   44 applications. */
   45 
   46 
   47 #define NLBLOCK md             /* Block containing newline information */
   48 #define PSSTART start_subject  /* Field containing processed string start */
   49 #define PSEND   end_subject    /* Field containing processed string end */
   50 
   51 #include "pcre_internal.h"
   52 
   53 
   54 /* For use to indent debugging output */
   55 
   56 #define SP "                   "
   57 
   58 
   59 
   60 /*************************************************
   61 *      Code parameters and static tables         *
   62 *************************************************/
   63 
   64 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
   65 into others, under special conditions. A gap of 20 between the blocks should be
   66 enough. */
   67 
   68 #define OP_PROP_EXTRA 100
   69 #define OP_EXTUNI_EXTRA 120
   70 #define OP_ANYNL_EXTRA 140
   71 
   72 
   73 /* This table identifies those opcodes that are followed immediately by a
   74 character that is to be tested in some way. This makes is possible to
   75 centralize the loading of these characters. In the case of Type * etc, the
   76 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
   77 small value. */
   78 
   79 static uschar coptable[] = {
   80   0,                             /* End                                    */
   81   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
   82   0, 0,                          /* Any, Anybyte                           */
   83   0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */
   84   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
   85   1,                             /* Char                                   */
   86   1,                             /* Charnc                                 */
   87   1,                             /* not                                    */
   88   /* Positive single-char repeats                                          */
   89   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
   90   3, 3, 3,                       /* upto, minupto, exact                   */
   91   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
   92   /* Negative single-char repeats - only for chars < 256                   */
   93   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
   94   3, 3, 3,                       /* NOT upto, minupto, exact               */
   95   1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */
   96   /* Positive type repeats                                                 */
   97   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
   98   3, 3, 3,                       /* Type upto, minupto, exact              */
   99   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
  100   /* Character class & ref repeats                                         */
  101   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
  102   0, 0,                          /* CRRANGE, CRMINRANGE                    */
  103   0,                             /* CLASS                                  */
  104   0,                             /* NCLASS                                 */
  105   0,                             /* XCLASS - variable length               */
  106   0,                             /* REF                                    */
  107   0,                             /* RECURSE                                */
  108   0,                             /* CALLOUT                                */
  109   0,                             /* Alt                                    */
  110   0,                             /* Ket                                    */
  111   0,                             /* KetRmax                                */
  112   0,                             /* KetRmin                                */
  113   0,                             /* Assert                                 */
  114   0,                             /* Assert not                             */
  115   0,                             /* Assert behind                          */
  116   0,                             /* Assert behind not                      */
  117   0,                             /* Reverse                                */
  118   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
  119   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
  120   0,                             /* CREF                                   */
  121   0,                             /* RREF                                   */
  122   0,                             /* DEF                                    */
  123   0, 0                           /* BRAZERO, BRAMINZERO                    */
  124 };
  125 
  126 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
  127 and \w */
  128 
  129 static uschar toptable1[] = {
  130   0, 0, 0, 0, 0,
  131   ctype_digit, ctype_digit,
  132   ctype_space, ctype_space,
  133   ctype_word,  ctype_word,
  134   0                               /* OP_ANY */
  135 };
  136 
  137 static uschar toptable2[] = {
  138   0, 0, 0, 0, 0,
  139   ctype_digit, 0,
  140   ctype_space, 0,
  141   ctype_word,  0,
  142   1                               /* OP_ANY */
  143 };
  144 
  145 
  146 /* Structure for holding data about a particular state, which is in effect the
  147 current data for an active path through the match tree. It must consist
  148 entirely of ints because the working vector we are passed, and which we put
  149 these structures in, is a vector of ints. */
  150 
  151 typedef struct stateblock {
  152   int offset;                     /* Offset to opcode */
  153   int count;                      /* Count for repeats */
  154   int ims;                        /* ims flag bits */
  155   int data;                       /* Some use extra data */
  156 } stateblock;
  157 
  158 #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
  159 
  160 
  161 #ifdef DEBUG
  162 /*************************************************
  163 *             Print character string             *
  164 *************************************************/
  165 
  166 /* Character string printing function for debugging.
  167 
  168 Arguments:
  169   p            points to string
  170   length       number of bytes
  171   f            where to print
  172 
  173 Returns:       nothing
  174 */
  175 
  176 static void
  177 pchars(unsigned char *p, int length, FILE *f)
  178 {
  179 int c;
  180 while (length-- > 0)
  181   {
  182   if (isprint(c = *(p++)))
  183     fprintf(f, "%c", c);
  184   else
  185     fprintf(f, "\\x%02x", c);
  186   }
  187 }
  188 #endif
  189 
  190 
  191 
  192 /*************************************************
  193 *    Execute a Regular Expression - DFA engine   *
  194 *************************************************/
  195 
  196 /* This internal function applies a compiled pattern to a subject string,
  197 starting at a given point, using a DFA engine. This function is called from the
  198 external one, possibly multiple times if the pattern is not anchored. The
  199 function calls itself recursively for some kinds of subpattern.
  200 
  201 Arguments:
  202   md                the match_data block with fixed information
  203   this_start_code   the opening bracket of this subexpression's code
  204   current_subject   where we currently are in the subject string
  205   start_offset      start offset in the subject string
  206   offsets           vector to contain the matching string offsets
  207   offsetcount       size of same
  208   workspace         vector of workspace
  209   wscount           size of same
  210   ims               the current ims flags
  211   rlevel            function call recursion level
  212   recursing         regex recursive call level
  213 
  214 Returns:            > 0 => number of match offset pairs placed in offsets
  215                     = 0 => offsets overflowed; longest matches are present
  216                      -1 => failed to match
  217                    < -1 => some kind of unexpected problem
  218 
  219 The following macros are used for adding states to the two state vectors (one
  220 for the current character, one for the following character). */
  221 
  222 #define ADD_ACTIVE(x,y) \
  223   if (active_count++ < wscount) \
  224     { \
  225     next_active_state->offset = (x); \
  226     next_active_state->count  = (y); \
  227     next_active_state->ims    = ims; \
  228     next_active_state++; \
  229     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
  230     } \
  231   else return PCRE_ERROR_DFA_WSSIZE
  232 
  233 #define ADD_ACTIVE_DATA(x,y,z) \
  234   if (active_count++ < wscount) \
  235     { \
  236     next_active_state->offset = (x); \
  237     next_active_state->count  = (y); \
  238     next_active_state->ims    = ims; \
  239     next_active_state->data   = (z); \
  240     next_active_state++; \
  241     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
  242     } \
  243   else return PCRE_ERROR_DFA_WSSIZE
  244 
  245 #define ADD_NEW(x,y) \
  246   if (new_count++ < wscount) \
  247     { \
  248     next_new_state->offset = (x); \
  249     next_new_state->count  = (y); \
  250     next_new_state->ims    = ims; \
  251     next_new_state++; \
  252     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
  253     } \
  254   else return PCRE_ERROR_DFA_WSSIZE
  255 
  256 #define ADD_NEW_DATA(x,y,z) \
  257   if (new_count++ < wscount) \
  258     { \
  259     next_new_state->offset = (x); \
  260     next_new_state->count  = (y); \
  261     next_new_state->ims    = ims; \
  262     next_new_state->data   = (z); \
  263     next_new_state++; \
  264     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
  265     } \
  266   else return PCRE_ERROR_DFA_WSSIZE
  267 
  268 /* And now, here is the code */
  269 
  270 static int
  271 internal_dfa_exec(
  272   dfa_match_data *md,
  273   const uschar *this_start_code,
  274   const uschar *current_subject,
  275   int start_offset,
  276   int *offsets,
  277   int offsetcount,
  278   int *workspace,
  279   int wscount,
  280   int ims,
  281   int  rlevel,
  282   int  recursing)
  283 {
  284 stateblock *active_states, *new_states, *temp_states;
  285 stateblock *next_active_state, *next_new_state;
  286 
  287 const uschar *ctypes, *lcc, *fcc;
  288 const uschar *ptr;
  289 const uschar *end_code, *first_op;
  290 
  291 int active_count, new_count, match_count;
  292 
  293 /* Some fields in the md block are frequently referenced, so we load them into
  294 independent variables in the hope that this will perform better. */
  295 
  296 const uschar *start_subject = md->start_subject;
  297 const uschar *end_subject = md->end_subject;
  298 const uschar *start_code = md->start_code;
  299 
  300 #ifdef SUPPORT_UTF8
  301 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
  302 #else
  303 BOOL utf8 = FALSE;
  304 #endif
  305 
  306 rlevel++;
  307 offsetcount &= (-2);
  308 
  309 wscount -= 2;
  310 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
  311           (2 * INTS_PER_STATEBLOCK);
  312 
  313 DPRINTF(("\n%.*s---------------------\n"
  314   "%.*sCall to internal_dfa_exec f=%d r=%d\n",
  315   rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
  316 
  317 ctypes = md->tables + ctypes_offset;
  318 lcc = md->tables + lcc_offset;
  319 fcc = md->tables + fcc_offset;
  320 
  321 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
  322 
  323 active_states = (stateblock *)(workspace + 2);
  324 next_new_state = new_states = active_states + wscount;
  325 new_count = 0;
  326 
  327 first_op = this_start_code + 1 + LINK_SIZE +
  328   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
  329 
  330 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
  331 the alternative states onto the list, and find out where the end is. This
  332 makes it possible to use this function recursively, when we want to stop at a
  333 matching internal ket rather than at the end.
  334 
  335 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
  336 a backward assertion. In that case, we have to find out the maximum amount to
  337 move back, and set up each alternative appropriately. */
  338 
  339 if (*first_op == OP_REVERSE)
  340   {
  341   int max_back = 0;
  342   int gone_back;
  343 
  344   end_code = this_start_code;
  345   do
  346     {
  347     int back = GET(end_code, 2+LINK_SIZE);
  348     if (back > max_back) max_back = back;
  349     end_code += GET(end_code, 1);
  350     }
  351   while (*end_code == OP_ALT);
  352 
  353   /* If we can't go back the amount required for the longest lookbehind
  354   pattern, go back as far as we can; some alternatives may still be viable. */
  355 
  356 #ifdef SUPPORT_UTF8
  357   /* In character mode we have to step back character by character */
  358 
  359   if (utf8)
  360     {
  361     for (gone_back = 0; gone_back < max_back; gone_back++)
  362       {
  363       if (current_subject <= start_subject) break;
  364       current_subject--;
  365       while (current_subject > start_subject &&
  366              (*current_subject & 0xc0) == 0x80)
  367         current_subject--;
  368       }
  369     }
  370   else
  371 #endif
  372 
  373   /* In byte-mode we can do this quickly. */
  374 
  375     {
  376     gone_back = (current_subject - max_back < start_subject)?
  377       current_subject - start_subject : max_back;
  378     current_subject -= gone_back;
  379     }
  380 
  381   /* Now we can process the individual branches. */
  382 
  383   end_code = this_start_code;
  384   do
  385     {
  386     int back = GET(end_code, 2+LINK_SIZE);
  387     if (back <= gone_back)
  388       {
  389       int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
  390       ADD_NEW_DATA(-bstate, 0, gone_back - back);
  391       }
  392     end_code += GET(end_code, 1);
  393     }
  394   while (*end_code == OP_ALT);
  395  }
  396 
  397 /* This is the code for a "normal" subpattern (not a backward assertion). The
  398 start of a whole pattern is always one of these. If we are at the top level,
  399 we may be asked to restart matching from the same point that we reached for a
  400 previous partial match. We still have to scan through the top-level branches to
  401 find the end state. */
  402 
  403 else
  404   {
  405   end_code = this_start_code;
  406 
  407   /* Restarting */
  408 
  409   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
  410     {
  411     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
  412     new_count = workspace[1];
  413     if (!workspace[0])
  414       memcpy(new_states, active_states, new_count * sizeof(stateblock));
  415     }
  416 
  417   /* Not restarting */
  418 
  419   else
  420     {
  421     int length = 1 + LINK_SIZE +
  422       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
  423     do
  424       {
  425       ADD_NEW(end_code - start_code + length, 0);
  426       end_code += GET(end_code, 1);
  427       length = 1 + LINK_SIZE;
  428       }
  429     while (*end_code == OP_ALT);
  430     }
  431   }
  432 
  433 workspace[0] = 0;    /* Bit indicating which vector is current */
  434 
  435 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
  436 
  437 /* Loop for scanning the subject */
  438 
  439 ptr = current_subject;
  440 for (;;)
  441   {
  442   int i, j;
  443   int clen, dlen;
  444   unsigned int c, d;
  445 
  446   /* Make the new state list into the active state list and empty the
  447   new state list. */
  448 
  449   temp_states = active_states;
  450   active_states = new_states;
  451   new_states = temp_states;
  452   active_count = new_count;
  453   new_count = 0;
  454 
  455   workspace[0] ^= 1;              /* Remember for the restarting feature */
  456   workspace[1] = active_count;
  457 
  458 #ifdef DEBUG
  459   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
  460   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
  461   printf("\"\n");
  462 
  463   printf("%.*sActive states: ", rlevel*2-2, SP);
  464   for (i = 0; i < active_count; i++)
  465     printf("%d/%d ", active_states[i].offset, active_states[i].count);
  466   printf("\n");
  467 #endif
  468 
  469   /* Set the pointers for adding new states */
  470 
  471   next_active_state = active_states + active_count;
  472   next_new_state = new_states;
  473 
  474   /* Load the current character from the subject outside the loop, as many
  475   different states may want to look at it, and we assume that at least one
  476   will. */
  477 
  478   if (ptr < end_subject)
  479     {
  480     clen = 1;        /* Number of bytes in the character */
  481 #ifdef SUPPORT_UTF8
  482     if (utf8) { GETCHARLEN(c, ptr, clen); } else
  483 #endif  /* SUPPORT_UTF8 */
  484     c = *ptr;
  485     }
  486   else
  487     {
  488     clen = 0;        /* This indicates the end of the subject */
  489     c = NOTACHAR;    /* This value should never actually be used */
  490     }
  491 
  492   /* Scan up the active states and act on each one. The result of an action
  493   may be to add more states to the currently active list (e.g. on hitting a
  494   parenthesis) or it may be to put states on the new list, for considering
  495   when we move the character pointer on. */
  496 
  497   for (i = 0; i < active_count; i++)
  498     {
  499     stateblock *current_state = active_states + i;
  500     const uschar *code;
  501     int state_offset = current_state->offset;
  502     int count, codevalue;
  503     int chartype, script;
  504 
  505 #ifdef DEBUG
  506     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
  507     if (clen == 0) printf("EOL\n");
  508       else if (c > 32 && c < 127) printf("'%c'\n", c);
  509         else printf("0x%02x\n", c);
  510 #endif
  511 
  512     /* This variable is referred to implicitly in the ADD_xxx macros. */
  513 
  514     ims = current_state->ims;
  515 
  516     /* A negative offset is a special case meaning "hold off going to this
  517     (negated) state until the number of characters in the data field have
  518     been skipped". */
  519 
  520     if (state_offset < 0)
  521       {
  522       if (current_state->data > 0)
  523         {
  524         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
  525         ADD_NEW_DATA(state_offset, current_state->count,
  526           current_state->data - 1);
  527         continue;
  528         }
  529       else
  530         {
  531         current_state->offset = state_offset = -state_offset;
  532         }
  533       }
  534 
  535     /* Check for a duplicate state with the same count, and skip if found. */
  536 
  537     for (j = 0; j < i; j++)
  538       {
  539       if (active_states[j].offset == state_offset &&
  540           active_states[j].count == current_state->count)
  541         {
  542         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
  543         goto NEXT_ACTIVE_STATE;
  544         }
  545       }
  546 
  547     /* The state offset is the offset to the opcode */
  548 
  549     code = start_code + state_offset;
  550     codevalue = *code;
  551 
  552     /* If this opcode is followed by an inline character, load it. It is
  553     tempting to test for the presence of a subject character here, but that
  554     is wrong, because sometimes zero repetitions of the subject are
  555     permitted.
  556 
  557     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
  558     argument that is not a data character - but is always one byte long.
  559     Unfortunately, we have to take special action to deal with  \P, \p, and
  560     \X in this case. To keep the other cases fast, convert these ones to new
  561     opcodes. */
  562 
  563     if (coptable[codevalue] > 0)
  564       {
  565       dlen = 1;
  566 #ifdef SUPPORT_UTF8
  567       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
  568 #endif  /* SUPPORT_UTF8 */
  569       d = code[coptable[codevalue]];
  570       if (codevalue >= OP_TYPESTAR)
  571         {
  572         switch(d)
  573           {
  574           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
  575           case OP_NOTPROP:
  576           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
  577           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
  578           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
  579           default: break;
  580           }
  581         }
  582       }
  583     else
  584       {
  585       dlen = 0;         /* Not strictly necessary, but compilers moan */
  586       d = NOTACHAR;     /* if these variables are not set. */
  587       }
  588 
  589 
  590     /* Now process the individual opcodes */
  591 
  592     switch (codevalue)
  593       {
  594 
  595 /* ========================================================================== */
  596       /* Reached a closing bracket. If not at the end of the pattern, carry
  597       on with the next opcode. Otherwise, unless we have an empty string and
  598       PCRE_NOTEMPTY is set, save the match data, shifting up all previous
  599       matches so we always have the longest first. */
  600 
  601       case OP_KET:
  602       case OP_KETRMIN:
  603       case OP_KETRMAX:
  604       if (code != end_code)
  605         {
  606         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
  607         if (codevalue != OP_KET)
  608           {
  609           ADD_ACTIVE(state_offset - GET(code, 1), 0);
  610           }
  611         }
  612       else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
  613         {
  614         if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
  615           else if (match_count > 0 && ++match_count * 2 >= offsetcount)
  616             match_count = 0;
  617         count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
  618         if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
  619         if (offsetcount >= 2)
  620           {
  621           offsets[0] = current_subject - start_subject;
  622           offsets[1] = ptr - start_subject;
  623           DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
  624             offsets[1] - offsets[0], current_subject));
  625           }
  626         if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
  627           {
  628           DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
  629             "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
  630             match_count, rlevel*2-2, SP));
  631           return match_count;
  632           }
  633         }
  634       break;
  635 
  636 /* ========================================================================== */
  637       /* These opcodes add to the current list of states without looking
  638       at the current character. */
  639 
  640       /*-----------------------------------------------------------------*/
  641       case OP_ALT:
  642       do { code += GET(code, 1); } while (*code == OP_ALT);
  643       ADD_ACTIVE(code - start_code, 0);
  644       break;
  645 
  646       /*-----------------------------------------------------------------*/
  647       case OP_BRA:
  648       case OP_SBRA:
  649       do
  650         {
  651         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
  652         code += GET(code, 1);
  653         }
  654       while (*code == OP_ALT);
  655       break;
  656 
  657       /*-----------------------------------------------------------------*/
  658       case OP_CBRA:
  659       case OP_SCBRA:
  660       ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
  661       code += GET(code, 1);
  662       while (*code == OP_ALT)
  663         {
  664         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
  665         code += GET(code, 1);
  666         }
  667       break;
  668 
  669       /*-----------------------------------------------------------------*/
  670       case OP_BRAZERO:
  671       case OP_BRAMINZERO:
  672       ADD_ACTIVE(state_offset + 1, 0);
  673       code += 1 + GET(code, 2);
  674       while (*code == OP_ALT) code += GET(code, 1);
  675       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
  676       break;
  677 
  678       /*-----------------------------------------------------------------*/
  679       case OP_CIRC:
  680       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
  681           ((ims & PCRE_MULTILINE) != 0 &&
  682             ptr != end_subject &&
  683             WAS_NEWLINE(ptr)))
  684         { ADD_ACTIVE(state_offset + 1, 0); }
  685       break;
  686 
  687       /*-----------------------------------------------------------------*/
  688       case OP_EOD:
  689       if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
  690       break;
  691 
  692       /*-----------------------------------------------------------------*/
  693       case OP_OPT:
  694       ims = code[1];
  695       ADD_ACTIVE(state_offset + 2, 0);
  696       break;
  697 
  698       /*-----------------------------------------------------------------*/
  699       case OP_SOD:
  700       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
  701       break;
  702 
  703       /*-----------------------------------------------------------------*/
  704       case OP_SOM:
  705       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
  706       break;
  707 
  708 
  709 /* ========================================================================== */
  710       /* These opcodes inspect the next subject character, and sometimes
  711       the previous one as well, but do not have an argument. The variable
  712       clen contains the length of the current character and is zero if we are
  713       at the end of the subject. */
  714 
  715       /*-----------------------------------------------------------------*/
  716       case OP_ANY:
  717       if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
  718         { ADD_NEW(state_offset + 1, 0); }
  719       break;
  720 
  721       /*-----------------------------------------------------------------*/
  722       case OP_EODN:
  723       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
  724         { ADD_ACTIVE(state_offset + 1, 0); }
  725       break;
  726 
  727       /*-----------------------------------------------------------------*/
  728       case OP_DOLL:
  729       if ((md->moptions & PCRE_NOTEOL) == 0)
  730         {
  731         if (clen == 0 ||
  732             (IS_NEWLINE(ptr) &&
  733                ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
  734             ))
  735           { ADD_ACTIVE(state_offset + 1, 0); }
  736         }
  737       else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
  738         { ADD_ACTIVE(state_offset + 1, 0); }
  739       break;
  740 
  741       /*-----------------------------------------------------------------*/
  742 
  743       case OP_DIGIT:
  744       case OP_WHITESPACE:
  745       case OP_WORDCHAR:
  746       if (clen > 0 && c < 256 &&
  747             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
  748         { ADD_NEW(state_offset + 1, 0); }
  749       break;
  750 
  751       /*-----------------------------------------------------------------*/
  752       case OP_NOT_DIGIT:
  753       case OP_NOT_WHITESPACE:
  754       case OP_NOT_WORDCHAR:
  755       if (clen > 0 && (c >= 256 ||
  756             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
  757         { ADD_NEW(state_offset + 1, 0); }
  758       break;
  759 
  760       /*-----------------------------------------------------------------*/
  761       case OP_WORD_BOUNDARY:
  762       case OP_NOT_WORD_BOUNDARY:
  763         {
  764         int left_word, right_word;
  765 
  766         if (ptr > start_subject)
  767           {
  768           const uschar *temp = ptr - 1;
  769 #ifdef SUPPORT_UTF8
  770           if (utf8) BACKCHAR(temp);
  771 #endif
  772           GETCHARTEST(d, temp);
  773           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
  774           }
  775         else left_word = 0;
  776 
  777         if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
  778           else right_word = 0;
  779 
  780         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
  781           { ADD_ACTIVE(state_offset + 1, 0); }
  782         }
  783       break;
  784 
  785 
  786 #ifdef SUPPORT_UCP
  787 
  788       /*-----------------------------------------------------------------*/
  789       /* Check the next character by Unicode property. We will get here only
  790       if the support is in the binary; otherwise a compile-time error occurs.
  791       */
  792 
  793       case OP_PROP:
  794       case OP_NOTPROP:
  795       if (clen > 0)
  796         {
  797         BOOL OK;
  798         int category = _pcre_ucp_findprop(c, &chartype, &script);
  799         switch(code[1])
  800           {
  801           case PT_ANY:
  802           OK = TRUE;
  803           break;
  804 
  805           case PT_LAMP:
  806           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
  807           break;
  808 
  809           case PT_GC:
  810           OK = category == code[2];
  811           break;
  812 
  813           case PT_PC:
  814           OK = chartype == code[2];
  815           break;
  816 
  817           case PT_SC:
  818           OK = script == code[2];
  819           break;
  820 
  821           /* Should never occur, but keep compilers from grumbling. */
  822 
  823           default:
  824           OK = codevalue != OP_PROP;
  825           break;
  826           }
  827 
  828         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
  829         }
  830       break;
  831 #endif
  832 
  833 
  834 
  835 /* ========================================================================== */
  836       /* These opcodes likewise inspect the subject character, but have an
  837       argument that is not a data character. It is one of these opcodes:
  838       OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
  839       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
  840 
  841       case OP_TYPEPLUS:
  842       case OP_TYPEMINPLUS:
  843       case OP_TYPEPOSPLUS:
  844       count = current_state->count;  /* Already matched */
  845       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
  846       if (clen > 0)
  847         {
  848         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
  849             (c < 256 &&
  850               (d != OP_ANY ||
  851                (ims & PCRE_DOTALL) != 0 ||
  852                !IS_NEWLINE(ptr)
  853               ) &&
  854               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
  855           {
  856           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
  857             {
  858             active_count--;            /* Remove non-match possibility */
  859             next_active_state--;
  860             }
  861           count++;
  862           ADD_NEW(state_offset, count);
  863           }
  864         }
  865       break;
  866 
  867       /*-----------------------------------------------------------------*/
  868       case OP_TYPEQUERY:
  869       case OP_TYPEMINQUERY:
  870       case OP_TYPEPOSQUERY:
  871       ADD_ACTIVE(state_offset + 2, 0);
  872       if (clen > 0)
  873         {
  874         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
  875             (c < 256 &&
  876               (d != OP_ANY ||
  877                (ims & PCRE_DOTALL) != 0 ||
  878                !IS_NEWLINE(ptr)
  879               ) &&
  880               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
  881           {
  882           if (codevalue == OP_TYPEPOSQUERY)
  883             {
  884             active_count--;            /* Remove non-match possibility */
  885             next_active_state--;
  886             }
  887           ADD_NEW(state_offset + 2, 0);
  888           }
  889         }
  890       break;
  891 
  892       /*-----------------------------------------------------------------*/
  893       case OP_TYPESTAR:
  894       case OP_TYPEMINSTAR:
  895       case OP_TYPEPOSSTAR:
  896       ADD_ACTIVE(state_offset + 2, 0);
  897       if (clen > 0)
  898         {
  899         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
  900             (c < 256 &&
  901               (d != OP_ANY ||
  902                (ims & PCRE_DOTALL) != 0 ||
  903                !IS_NEWLINE(ptr)
  904               ) &&
  905               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
  906           {
  907           if (codevalue == OP_TYPEPOSSTAR)
  908             {
  909             active_count--;            /* Remove non-match possibility */
  910             next_active_state--;
  911             }
  912           ADD_NEW(state_offset, 0);
  913           }
  914         }
  915       break;
  916 
  917       /*-----------------------------------------------------------------*/
  918       case OP_TYPEEXACT:
  919       count = current_state->count;  /* Number already matched */
  920       if (clen > 0)
  921         {
  922         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
  923             (c < 256 &&
  924               (d != OP_ANY ||
  925                (ims & PCRE_DOTALL) != 0 ||
  926                !IS_NEWLINE(ptr)
  927               ) &&
  928               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
  929           {
  930           if (++count >= GET2(code, 1))
  931             { ADD_NEW(state_offset + 4, 0); }
  932           else
  933             { ADD_NEW(state_offset, count); }
  934           }
  935         }
  936       break;
  937 
  938       /*-----------------------------------------------------------------*/
  939       case OP_TYPEUPTO:
  940       case OP_TYPEMINUPTO:
  941       case OP_TYPEPOSUPTO:
  942       ADD_ACTIVE(state_offset + 4, 0);
  943       count = current_state->count;  /* Number already matched */
  944       if (clen > 0)
  945         {
  946         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
  947             (c < 256 &&
  948               (d != OP_ANY ||
  949                (ims & PCRE_DOTALL) != 0 ||
  950                !IS_NEWLINE(ptr)
  951               ) &&
  952               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
  953           {
  954           if (codevalue == OP_TYPEPOSUPTO)
  955             {
  956             active_count--;           /* Remove non-match possibility */
  957             next_active_state--;
  958             }
  959           if (++count >= GET2(code, 1))
  960             { ADD_NEW(state_offset + 4, 0); }
  961           else
  962             { ADD_NEW(state_offset, count); }
  963           }
  964         }
  965       break;
  966 
  967 /* ========================================================================== */
  968       /* These are virtual opcodes that are used when something like
  969       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
  970       argument. It keeps the code above fast for the other cases. The argument
  971       is in the d variable. */
  972 
  973       case OP_PROP_EXTRA + OP_TYPEPLUS:
  974       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
  975       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
  976       count = current_state->count;           /* Already matched */
  977       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
  978       if (clen > 0)
  979         {
  980         BOOL OK;
  981         int category = _pcre_ucp_findprop(c, &chartype, &script);
  982         switch(code[2])
  983           {
  984           case PT_ANY:
  985           OK = TRUE;
  986           break;
  987 
  988           case PT_LAMP:
  989           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
  990           break;
  991 
  992           case PT_GC:
  993           OK = category == code[3];
  994           break;
  995 
  996           case PT_PC:
  997           OK = chartype == code[3];
  998           break;
  999 
 1000           case PT_SC:
 1001           OK = script == code[3];
 1002           break;
 1003 
 1004           /* Should never occur, but keep compilers from grumbling. */
 1005 
 1006           default:
 1007           OK = codevalue != OP_PROP;
 1008           break;
 1009           }
 1010 
 1011         if (OK == (d == OP_PROP))
 1012           {
 1013           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
 1014             {
 1015             active_count--;           /* Remove non-match possibility */
 1016             next_active_state--;
 1017             }
 1018           count++;
 1019           ADD_NEW(state_offset, count);
 1020           }
 1021         }
 1022       break;
 1023 
 1024       /*-----------------------------------------------------------------*/
 1025       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
 1026       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
 1027       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
 1028       count = current_state->count;  /* Already matched */
 1029       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 1030       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
 1031         {
 1032         const uschar *nptr = ptr + clen;
 1033         int ncount = 0;
 1034         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
 1035           {
 1036           active_count--;           /* Remove non-match possibility */
 1037           next_active_state--;
 1038           }
 1039         while (nptr < end_subject)
 1040           {
 1041           int nd;
 1042           int ndlen = 1;
 1043           GETCHARLEN(nd, nptr, ndlen);
 1044           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
 1045           ncount++;
 1046           nptr += ndlen;
 1047           }
 1048         count++;
 1049         ADD_NEW_DATA(-state_offset, count, ncount);
 1050         }
 1051       break;
 1052 
 1053       /*-----------------------------------------------------------------*/
 1054       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
 1055       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
 1056       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
 1057       count = current_state->count;  /* Already matched */
 1058       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
 1059       if (clen > 0)
 1060         {
 1061         int ncount = 0;
 1062         switch (c)
 1063           {
 1064           case 0x000d:
 1065           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
 1066           /* Fall through */
 1067           case 0x000a:
 1068           case 0x000b:
 1069           case 0x000c:
 1070           case 0x0085:
 1071           case 0x2028:
 1072           case 0x2029:
 1073           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
 1074             {
 1075             active_count--;           /* Remove non-match possibility */
 1076             next_active_state--;
 1077             }
 1078           count++;
 1079           ADD_NEW_DATA(-state_offset, count, ncount);
 1080           break;
 1081           default:
 1082           break;
 1083           }
 1084         }
 1085       break;
 1086 
 1087       /*-----------------------------------------------------------------*/
 1088       case OP_PROP_EXTRA + OP_TYPEQUERY:
 1089       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
 1090       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
 1091       count = 4;
 1092       goto QS1;
 1093 
 1094       case OP_PROP_EXTRA + OP_TYPESTAR:
 1095       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
 1096       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
 1097       count = 0;
 1098 
 1099       QS1:
 1100 
 1101       ADD_ACTIVE(state_offset + 4, 0);
 1102       if (clen > 0)
 1103         {
 1104         BOOL OK;
 1105         int category = _pcre_ucp_findprop(c, &chartype, &script);
 1106         switch(code[2])
 1107           {
 1108           case PT_ANY:
 1109           OK = TRUE;
 1110           break;
 1111 
 1112           case PT_LAMP:
 1113           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
 1114           break;
 1115 
 1116           case PT_GC:
 1117           OK = category == code[3];
 1118           break;
 1119 
 1120           case PT_PC:
 1121           OK = chartype == code[3];
 1122           break;
 1123 
 1124           case PT_SC:
 1125           OK = script == code[3];
 1126           break;
 1127 
 1128           /* Should never occur, but keep compilers from grumbling. */
 1129 
 1130           default:
 1131           OK = codevalue != OP_PROP;
 1132           break;
 1133           }
 1134 
 1135         if (OK == (d == OP_PROP))
 1136           {
 1137           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
 1138               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
 1139             {
 1140             active_count--;           /* Remove non-match possibility */
 1141             next_active_state--;
 1142             }
 1143           ADD_NEW(state_offset + count, 0);
 1144           }
 1145         }
 1146       break;
 1147 
 1148       /*-----------------------------------------------------------------*/
 1149       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
 1150       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
 1151       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
 1152       count = 2;
 1153       goto QS2;
 1154 
 1155       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
 1156       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
 1157       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
 1158       count = 0;
 1159 
 1160       QS2:
 1161 
 1162       ADD_ACTIVE(state_offset + 2, 0);
 1163       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
 1164         {
 1165         const uschar *nptr = ptr + clen;
 1166         int ncount = 0;
 1167         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
 1168             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
 1169           {
 1170           active_count--;           /* Remove non-match possibility */
 1171           next_active_state--;
 1172           }
 1173         while (nptr < end_subject)
 1174           {
 1175           int nd;
 1176           int ndlen = 1;
 1177           GETCHARLEN(nd, nptr, ndlen);
 1178           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
 1179           ncount++;
 1180           nptr += ndlen;
 1181           }
 1182         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
 1183         }
 1184       break;
 1185 
 1186       /*-----------------------------------------------------------------*/
 1187       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
 1188       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
 1189       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
 1190       count = 2;
 1191       goto QS3;
 1192 
 1193       case OP_ANYNL_EXTRA + OP_TYPESTAR:
 1194       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
 1195       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
 1196       count = 0;
 1197 
 1198       QS3:
 1199       ADD_ACTIVE(state_offset + 2, 0);
 1200       if (clen > 0)
 1201         {
 1202         int ncount = 0;
 1203         switch (c)
 1204           {
 1205           case 0x000d:
 1206           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
 1207           /* Fall through */
 1208           case 0x000a:
 1209           case 0x000b:
 1210           case 0x000c:
 1211           case 0x0085:
 1212           case 0x2028:
 1213           case 0x2029:
 1214           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
 1215               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
 1216             {
 1217             active_count--;           /* Remove non-match possibility */
 1218             next_active_state--;
 1219             }
 1220           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
 1221           break;
 1222           default:
 1223           break;
 1224           }
 1225         }
 1226       break;
 1227 
 1228       /*-----------------------------------------------------------------*/
 1229       case OP_PROP_EXTRA + OP_TYPEEXACT:
 1230       case OP_PROP_EXTRA + OP_TYPEUPTO:
 1231       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
 1232       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
 1233       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
 1234         { ADD_ACTIVE(state_offset + 6, 0); }
 1235       count = current_state->count;  /* Number already matched */
 1236       if (clen > 0)
 1237         {
 1238         BOOL OK;
 1239         int category = _pcre_ucp_findprop(c, &chartype, &script);
 1240         switch(code[4])
 1241           {
 1242           case PT_ANY:
 1243           OK = TRUE;
 1244           break;
 1245 
 1246           case PT_LAMP:
 1247           OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
 1248           break;
 1249 
 1250           case PT_GC:
 1251           OK = category == code[5];
 1252           break;
 1253 
 1254           case PT_PC:
 1255           OK = chartype == code[5];
 1256           break;
 1257 
 1258           case PT_SC:
 1259           OK = script == code[5];
 1260           break;
 1261 
 1262           /* Should never occur, but keep compilers from grumbling. */
 1263 
 1264           default:
 1265           OK = codevalue != OP_PROP;
 1266           break;
 1267           }
 1268 
 1269         if (OK == (d == OP_PROP))
 1270           {
 1271           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
 1272             {
 1273             active_count--;           /* Remove non-match possibility */
 1274             next_active_state--;
 1275             }
 1276           if (++count >= GET2(code, 1))
 1277             { ADD_NEW(state_offset + 6, 0); }
 1278           else
 1279             { ADD_NEW(state_offset, count); }
 1280           }
 1281         }
 1282       break;
 1283 
 1284       /*-----------------------------------------------------------------*/
 1285       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
 1286       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
 1287       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
 1288       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
 1289       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
 1290         { ADD_ACTIVE(state_offset + 4, 0); }
 1291       count = current_state->count;  /* Number already matched */
 1292       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
 1293         {
 1294         const uschar *nptr = ptr + clen;
 1295         int ncount = 0;
 1296         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
 1297           {
 1298           active_count--;           /* Remove non-match possibility */
 1299           next_active_state--;
 1300           }
 1301         while (nptr < end_subject)
 1302           {
 1303           int nd;
 1304           int ndlen = 1;
 1305           GETCHARLEN(nd, nptr, ndlen);
 1306           if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
 1307           ncount++;
 1308           nptr += ndlen;
 1309           }
 1310         if (++count >= GET2(code, 1))
 1311           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
 1312         else
 1313           { ADD_NEW_DATA(-state_offset, count, ncount); }
 1314         }
 1315       break;
 1316 
 1317       /*-----------------------------------------------------------------*/
 1318       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
 1319       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
 1320       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
 1321       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
 1322       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
 1323         { ADD_ACTIVE(state_offset + 4, 0); }
 1324       count = current_state->count;  /* Number already matched */
 1325       if (clen > 0)
 1326         {
 1327         int ncount = 0;
 1328         switch (c)
 1329           {
 1330           case 0x000d:
 1331           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
 1332           /* Fall through */
 1333           case 0x000a:
 1334           case 0x000b:
 1335           case 0x000c:
 1336           case 0x0085:
 1337           case 0x2028:
 1338           case 0x2029:
 1339           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
 1340             {
 1341             active_count--;           /* Remove non-match possibility */
 1342             next_active_state--;
 1343             }
 1344           if (++count >= GET2(code, 1))
 1345             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
 1346           else
 1347             { ADD_NEW_DATA(-state_offset, count, ncount); }
 1348           break;
 1349           default:
 1350           break;
 1351           }
 1352         }
 1353       break;
 1354 
 1355 /* ========================================================================== */
 1356       /* These opcodes are followed by a character that is usually compared
 1357       to the current subject character; it is loaded into d. We still get
 1358       here even if there is no subject character, because in some cases zero
 1359       repetitions are permitted. */
 1360 
 1361       /*-----------------------------------------------------------------*/
 1362       case OP_CHAR:
 1363       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
 1364       break;
 1365 
 1366       /*-----------------------------------------------------------------*/
 1367       case OP_CHARNC:
 1368       if (clen == 0) break;
 1369 
 1370 #ifdef SUPPORT_UTF8
 1371       if (utf8)
 1372         {
 1373         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
 1374           {
 1375           unsigned int othercase;
 1376           if (c < 128) othercase = fcc[c]; else
 1377 
 1378           /* If we have Unicode property support, we can use it to test the
 1379           other case of the character. */
 1380 
 1381 #ifdef SUPPORT_UCP
 1382           othercase = _pcre_ucp_othercase(c);
 1383 #else
 1384           othercase = NOTACHAR;
 1385 #endif
 1386 
 1387           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
 1388           }
 1389         }
 1390       else
 1391 #endif  /* SUPPORT_UTF8 */
 1392 
 1393       /* Non-UTF-8 mode */
 1394         {
 1395         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
 1396         }
 1397       break;
 1398 
 1399 
 1400 #ifdef SUPPORT_UCP
 1401       /*-----------------------------------------------------------------*/
 1402       /* This is a tricky one because it can match more than one character.
 1403       Find out how many characters to skip, and then set up a negative state
 1404       to wait for them to pass before continuing. */
 1405 
 1406       case OP_EXTUNI:
 1407       if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
 1408         {
 1409         const uschar *nptr = ptr + clen;
 1410         int ncount = 0;
 1411         while (nptr < end_subject)
 1412           {
 1413           int nclen = 1;
 1414           GETCHARLEN(c, nptr, nclen);
 1415           if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
 1416           ncount++;
 1417           nptr += nclen;
 1418           }
 1419         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
 1420         }
 1421       break;
 1422 #endif
 1423 
 1424       /*-----------------------------------------------------------------*/
 1425       /* This is tricky like EXTUNI because it too can match more than one
 1426       character (when CR is followed by LF). In this case, set up a negative
 1427       state to wait for one character to pass before continuing. */
 1428 
 1429       case OP_ANYNL:
 1430       if (clen > 0) switch(c)
 1431         {
 1432         case 0x000a:
 1433         case 0x000b:
 1434         case 0x000c:
 1435         case 0x0085:
 1436         case 0x2028:
 1437         case 0x2029:
 1438         ADD_NEW(state_offset + 1, 0);
 1439         break;
 1440         case 0x000d:
 1441         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
 1442           {
 1443           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
 1444           }
 1445         else
 1446           {
 1447           ADD_NEW(state_offset + 1, 0);
 1448           }
 1449         break;
 1450         }
 1451       break;
 1452 
 1453       /*-----------------------------------------------------------------*/
 1454       /* Match a negated single character. This is only used for one-byte
 1455       characters, that is, we know that d < 256. The character we are
 1456       checking (c) can be multibyte. */
 1457 
 1458       case OP_NOT:
 1459       if (clen > 0)
 1460         {
 1461         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
 1462         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
 1463         }
 1464       break;
 1465 
 1466       /*-----------------------------------------------------------------*/
 1467       case OP_PLUS:
 1468       case OP_MINPLUS:
 1469       case OP_POSPLUS:
 1470       case OP_NOTPLUS:
 1471       case OP_NOTMINPLUS:
 1472       case OP_NOTPOSPLUS:
 1473       count = current_state->count;  /* Already matched */
 1474       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
 1475       if (clen > 0)
 1476         {
 1477         unsigned int otherd = NOTACHAR;
 1478         if ((ims & PCRE_CASELESS) != 0)
 1479           {
 1480 #ifdef SUPPORT_UTF8
 1481           if (utf8 && d >= 128)
 1482             {
 1483 #ifdef SUPPORT_UCP
 1484             otherd = _pcre_ucp_othercase(d);
 1485 #endif  /* SUPPORT_UCP */
 1486             }
 1487           else
 1488 #endif  /* SUPPORT_UTF8 */
 1489           otherd = fcc[d];
 1490           }
 1491         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 1492           {
 1493           if (count > 0 &&
 1494               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
 1495             {
 1496             active_count--;             /* Remove non-match possibility */
 1497             next_active_state--;
 1498             }
 1499           count++;
 1500           ADD_NEW(state_offset, count);
 1501           }
 1502         }
 1503       break;
 1504 
 1505       /*-----------------------------------------------------------------*/
 1506       case OP_QUERY:
 1507       case OP_MINQUERY:
 1508       case OP_POSQUERY:
 1509       case OP_NOTQUERY:
 1510       case OP_NOTMINQUERY:
 1511       case OP_NOTPOSQUERY:
 1512       ADD_ACTIVE(state_offset + dlen + 1, 0);
 1513       if (clen > 0)
 1514         {
 1515         unsigned int otherd = NOTACHAR;
 1516         if ((ims & PCRE_CASELESS) != 0)
 1517           {
 1518 #ifdef SUPPORT_UTF8
 1519           if (utf8 && d >= 128)
 1520             {
 1521 #ifdef SUPPORT_UCP
 1522             otherd = _pcre_ucp_othercase(d);
 1523 #endif  /* SUPPORT_UCP */
 1524             }
 1525           else
 1526 #endif  /* SUPPORT_UTF8 */
 1527           otherd = fcc[d];
 1528           }
 1529         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 1530           {
 1531           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
 1532             {
 1533             active_count--;            /* Remove non-match possibility */
 1534             next_active_state--;
 1535             }
 1536           ADD_NEW(state_offset + dlen + 1, 0);
 1537           }
 1538         }
 1539       break;
 1540 
 1541       /*-----------------------------------------------------------------*/
 1542       case OP_STAR:
 1543       case OP_MINSTAR:
 1544       case OP_POSSTAR:
 1545       case OP_NOTSTAR:
 1546       case OP_NOTMINSTAR:
 1547       case OP_NOTPOSSTAR:
 1548       ADD_ACTIVE(state_offset + dlen + 1, 0);
 1549       if (clen > 0)
 1550         {
 1551         unsigned int otherd = NOTACHAR;
 1552         if ((ims & PCRE_CASELESS) != 0)
 1553           {
 1554 #ifdef SUPPORT_UTF8
 1555           if (utf8 && d >= 128)
 1556             {
 1557 #ifdef SUPPORT_UCP
 1558             otherd = _pcre_ucp_othercase(d);
 1559 #endif  /* SUPPORT_UCP */
 1560             }
 1561           else
 1562 #endif  /* SUPPORT_UTF8 */
 1563           otherd = fcc[d];
 1564           }
 1565         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 1566           {
 1567           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
 1568             {
 1569             active_count--;            /* Remove non-match possibility */
 1570             next_active_state--;
 1571             }
 1572           ADD_NEW(state_offset, 0);
 1573           }
 1574         }
 1575       break;
 1576 
 1577       /*-----------------------------------------------------------------*/
 1578       case OP_EXACT:
 1579       case OP_NOTEXACT:
 1580       count = current_state->count;  /* Number already matched */
 1581       if (clen > 0)
 1582         {
 1583         unsigned int otherd = NOTACHAR;
 1584         if ((ims & PCRE_CASELESS) != 0)
 1585           {
 1586 #ifdef SUPPORT_UTF8
 1587           if (utf8 && d >= 128)
 1588             {
 1589 #ifdef SUPPORT_UCP
 1590             otherd = _pcre_ucp_othercase(d);
 1591 #endif  /* SUPPORT_UCP */
 1592             }
 1593           else
 1594 #endif  /* SUPPORT_UTF8 */
 1595           otherd = fcc[d];
 1596           }
 1597         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 1598           {
 1599           if (++count >= GET2(code, 1))
 1600             { ADD_NEW(state_offset + dlen + 3, 0); }
 1601           else
 1602             { ADD_NEW(state_offset, count); }
 1603           }
 1604         }
 1605       break;
 1606 
 1607       /*-----------------------------------------------------------------*/
 1608       case OP_UPTO:
 1609       case OP_MINUPTO:
 1610       case OP_POSUPTO:
 1611       case OP_NOTUPTO:
 1612       case OP_NOTMINUPTO:
 1613       case OP_NOTPOSUPTO:
 1614       ADD_ACTIVE(state_offset + dlen + 3, 0);
 1615       count = current_state->count;  /* Number already matched */
 1616       if (clen > 0)
 1617         {
 1618         unsigned int otherd = NOTACHAR;
 1619         if ((ims & PCRE_CASELESS) != 0)
 1620           {
 1621 #ifdef SUPPORT_UTF8
 1622           if (utf8 && d >= 128)
 1623             {
 1624 #ifdef SUPPORT_UCP
 1625             otherd = _pcre_ucp_othercase(d);
 1626 #endif  /* SUPPORT_UCP */
 1627             }
 1628           else
 1629 #endif  /* SUPPORT_UTF8 */
 1630           otherd = fcc[d];
 1631           }
 1632         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
 1633           {
 1634           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
 1635             {
 1636             active_count--;             /* Remove non-match possibility */
 1637             next_active_state--;
 1638             }
 1639           if (++count >= GET2(code, 1))
 1640             { ADD_NEW(state_offset + dlen + 3, 0); }
 1641           else
 1642             { ADD_NEW(state_offset, count); }
 1643           }
 1644         }
 1645       break;
 1646 
 1647 
 1648 /* ========================================================================== */
 1649       /* These are the class-handling opcodes */
 1650 
 1651       case OP_CLASS:
 1652       case OP_NCLASS:
 1653       case OP_XCLASS:
 1654         {
 1655         BOOL isinclass = FALSE;
 1656         int next_state_offset;
 1657         const uschar *ecode;
 1658 
 1659         /* For a simple class, there is always just a 32-byte table, and we
 1660         can set isinclass from it. */
 1661 
 1662         if (codevalue != OP_XCLASS)
 1663           {
 1664           ecode = code + 33;
 1665           if (clen > 0)
 1666             {
 1667             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
 1668               ((code[1 + c/8] & (1 << (c&7))) != 0);
 1669             }
 1670           }
 1671 
 1672         /* An extended class may have a table or a list of single characters,
 1673         ranges, or both, and it may be positive or negative. There's a
 1674         function that sorts all this out. */
 1675 
 1676         else
 1677          {
 1678          ecode = code + GET(code, 1);
 1679          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
 1680          }
 1681 
 1682         /* At this point, isinclass is set for all kinds of class, and ecode
 1683         points to the byte after the end of the class. If there is a
 1684         quantifier, this is where it will be. */
 1685 
 1686         next_state_offset = ecode - start_code;
 1687 
 1688         switch (*ecode)
 1689           {
 1690           case OP_CRSTAR:
 1691           case OP_CRMINSTAR:
 1692           ADD_ACTIVE(next_state_offset + 1, 0);
 1693           if (isinclass) { ADD_NEW(state_offset, 0); }
 1694           break;
 1695 
 1696           case OP_CRPLUS:
 1697           case OP_CRMINPLUS:
 1698           count = current_state->count;  /* Already matched */
 1699           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
 1700           if (isinclass) { count++; ADD_NEW(state_offset, count); }
 1701           break;
 1702 
 1703           case OP_CRQUERY:
 1704           case OP_CRMINQUERY:
 1705           ADD_ACTIVE(next_state_offset + 1, 0);
 1706           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
 1707           break;
 1708 
 1709           case OP_CRRANGE:
 1710           case OP_CRMINRANGE:
 1711           count = current_state->count;  /* Already matched */
 1712           if (count >= GET2(ecode, 1))
 1713             { ADD_ACTIVE(next_state_offset + 5, 0); }
 1714           if (isinclass)
 1715             {
 1716             int max = GET2(ecode, 3);
 1717             if (++count >= max && max != 0)   /* Max 0 => no limit */
 1718               { ADD_NEW(next_state_offset + 5, 0); }
 1719             else
 1720               { ADD_NEW(state_offset, count); }
 1721             }
 1722           break;
 1723 
 1724           default:
 1725           if (isinclass) { ADD_NEW(next_state_offset, 0); }
 1726           break;
 1727           }
 1728         }
 1729       break;
 1730 
 1731 /* ========================================================================== */
 1732       /* These are the opcodes for fancy brackets of various kinds. We have
 1733       to use recursion in order to handle them. */
 1734 
 1735       case OP_ASSERT:
 1736       case OP_ASSERT_NOT:
 1737       case OP_ASSERTBACK:
 1738       case OP_ASSERTBACK_NOT:
 1739         {
 1740         int rc;
 1741         int local_offsets[2];
 1742         int local_workspace[1000];
 1743         const uschar *endasscode = code + GET(code, 1);
 1744 
 1745         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
 1746 
 1747         rc = internal_dfa_exec(
 1748           md,                                   /* static match data */
 1749           code,                                 /* this subexpression's code */
 1750           ptr,                                  /* where we currently are */
 1751           ptr - start_subject,                  /* start offset */
 1752           local_offsets,                        /* offset vector */
 1753           sizeof(local_offsets)/sizeof(int),    /* size of same */
 1754           local_workspace,                      /* workspace vector */
 1755           sizeof(local_workspace)/sizeof(int),  /* size of same */
 1756           ims,                                  /* the current ims flags */
 1757           rlevel,                               /* function recursion level */
 1758           recursing);                           /* pass on regex recursion */
 1759 
 1760         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
 1761             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
 1762         }
 1763       break;
 1764 
 1765       /*-----------------------------------------------------------------*/
 1766       case OP_COND:
 1767       case OP_SCOND:
 1768         {
 1769         int local_offsets[1000];
 1770         int local_workspace[1000];
 1771         int condcode = code[LINK_SIZE+1];
 1772 
 1773         /* Back reference conditions are not supported */
 1774 
 1775         if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
 1776 
 1777         /* The DEFINE condition is always false */
 1778 
 1779         if (condcode == OP_DEF)
 1780           {
 1781           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
 1782           }
 1783 
 1784         /* The only supported version of OP_RREF is for the value RREF_ANY,
 1785         which means "test if in any recursion". We can't test for specifically
 1786         recursed groups. */
 1787 
 1788         else if (condcode == OP_RREF)
 1789           {
 1790           int value = GET2(code, LINK_SIZE+2);
 1791           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
 1792           if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
 1793             else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
 1794           }
 1795 
 1796         /* Otherwise, the condition is an assertion */
 1797 
 1798         else
 1799           {
 1800           int rc;
 1801           const uschar *asscode = code + LINK_SIZE + 1;
 1802           const uschar *endasscode = asscode + GET(asscode, 1);
 1803 
 1804           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
 1805 
 1806           rc = internal_dfa_exec(
 1807             md,                                   /* fixed match data */
 1808             asscode,                              /* this subexpression's code */
 1809             ptr,                                  /* where we currently are */
 1810             ptr - start_subject,                  /* start offset */
 1811             local_offsets,                        /* offset vector */
 1812             sizeof(local_offsets)/sizeof(int),    /* size of same */
 1813             local_workspace,                      /* workspace vector */
 1814             sizeof(local_workspace)/sizeof(int),  /* size of same */
 1815             ims,                                  /* the current ims flags */
 1816             rlevel,                               /* function recursion level */
 1817             recursing);                           /* pass on regex recursion */
 1818 
 1819           if ((rc >= 0) ==
 1820                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
 1821             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
 1822           else
 1823             { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
 1824           }
 1825         }
 1826       break;
 1827 
 1828       /*-----------------------------------------------------------------*/
 1829       case OP_RECURSE:
 1830         {
 1831         int local_offsets[1000];
 1832         int local_workspace[1000];
 1833         int rc;
 1834 
 1835         DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
 1836           recursing + 1));
 1837 
 1838         rc = internal_dfa_exec(
 1839           md,                                   /* fixed match data */
 1840           start_code + GET(code, 1),            /* this subexpression's code */
 1841           ptr,                                  /* where we currently are */
 1842           ptr - start_subject,                  /* start offset */
 1843           local_offsets,                        /* offset vector */
 1844           sizeof(local_offsets)/sizeof(int),    /* size of same */
 1845           local_workspace,                      /* workspace vector */
 1846           sizeof(local_workspace)/sizeof(int),  /* size of same */
 1847           ims,                                  /* the current ims flags */
 1848           rlevel,                               /* function recursion level */
 1849           recursing + 1);                       /* regex recurse level */
 1850 
 1851         DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
 1852           recursing + 1, rc));
 1853 
 1854         /* Ran out of internal offsets */
 1855 
 1856         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
 1857 
 1858         /* For each successful matched substring, set up the next state with a
 1859         count of characters to skip before trying it. Note that the count is in
 1860         characters, not bytes. */
 1861 
 1862         if (rc > 0)
 1863           {
 1864           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
 1865             {
 1866             const uschar *p = start_subject + local_offsets[rc];
 1867             const uschar *pp = start_subject + local_offsets[rc+1];
 1868             int charcount = local_offsets[rc+1] - local_offsets[rc];
 1869             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
 1870             if (charcount > 0)
 1871               {
 1872               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
 1873               }
 1874             else
 1875               {
 1876               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
 1877               }
 1878             }
 1879           }
 1880         else if (rc != PCRE_ERROR_NOMATCH) return rc;
 1881         }
 1882       break;
 1883 
 1884       /*-----------------------------------------------------------------*/
 1885       case OP_ONCE:
 1886         {
 1887         int local_offsets[2];
 1888         int local_workspace[1000];
 1889 
 1890         int rc = internal_dfa_exec(
 1891           md,                                   /* fixed match data */
 1892           code,                                 /* this subexpression's code */
 1893           ptr,                                  /* where we currently are */
 1894           ptr - start_subject,                  /* start offset */
 1895           local_offsets,                        /* offset vector */
 1896           sizeof(local_offsets)/sizeof(int),    /* size of same */
 1897           local_workspace,                      /* workspace vector */
 1898           sizeof(local_workspace)/sizeof(int),  /* size of same */
 1899           ims,                                  /* the current ims flags */
 1900           rlevel,                               /* function recursion level */
 1901           recursing);                           /* pass on regex recursion */
 1902 
 1903         if (rc >= 0)
 1904           {
 1905           const uschar *end_subpattern = code;
 1906           int charcount = local_offsets[1] - local_offsets[0];
 1907           int next_state_offset, repeat_state_offset;
 1908 
 1909           do { end_subpattern += GET(end_subpattern, 1); }
 1910             while (*end_subpattern == OP_ALT);
 1911           next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
 1912 
 1913           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
 1914           arrange for the repeat state also to be added to the relevant list.
 1915           Calculate the offset, or set -1 for no repeat. */
 1916 
 1917           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
 1918                                  *end_subpattern == OP_KETRMIN)?
 1919             end_subpattern - start_code - GET(end_subpattern, 1) : -1;
 1920 
 1921           /* If we have matched an empty string, add the next state at the
 1922           current character pointer. This is important so that the duplicate
 1923           checking kicks in, which is what breaks infinite loops that match an
 1924           empty string. */
 1925 
 1926           if (charcount == 0)
 1927             {
 1928             ADD_ACTIVE(next_state_offset, 0);
 1929             }
 1930 
 1931           /* Optimization: if there are no more active states, and there
 1932           are no new states yet set up, then skip over the subject string
 1933           right here, to save looping. Otherwise, set up the new state to swing
 1934           into action when the end of the substring is reached. */
 1935 
 1936           else if (i + 1 >= active_count && new_count == 0)
 1937             {
 1938             ptr += charcount;
 1939             clen = 0;
 1940             ADD_NEW(next_state_offset, 0);
 1941 
 1942             /* If we are adding a repeat state at the new character position,
 1943             we must fudge things so that it is the only current state.
 1944             Otherwise, it might be a duplicate of one we processed before, and
 1945             that would cause it to be skipped. */
 1946 
 1947             if (repeat_state_offset >= 0)
 1948               {
 1949               next_active_state = active_states;
 1950               active_count = 0;
 1951               i = -1;
 1952               ADD_ACTIVE(repeat_state_offset, 0);
 1953               }
 1954             }
 1955           else
 1956             {
 1957             const uschar *p = start_subject + local_offsets[0];
 1958             const uschar *pp = start_subject + local_offsets[1];
 1959             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
 1960             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
 1961             if (repeat_state_offset >= 0)
 1962               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
 1963             }
 1964 
 1965           }
 1966         else if (rc != PCRE_ERROR_NOMATCH) return rc;
 1967         }
 1968       break;
 1969 
 1970 
 1971 /* ========================================================================== */
 1972       /* Handle callouts */
 1973 
 1974       case OP_CALLOUT:
 1975       if (pcre_callout != NULL)
 1976         {
 1977         int rrc;
 1978         pcre_callout_block cb;
 1979         cb.version          = 1;   /* Version 1 of the callout block */
 1980         cb.callout_number   = code[1];
 1981         cb.offset_vector    = offsets;
 1982         cb.subject          = (PCRE_SPTR)start_subject;
 1983         cb.subject_length   = end_subject - start_subject;
 1984         cb.start_match      = current_subject - start_subject;
 1985         cb.current_position = ptr - start_subject;
 1986         cb.pattern_position = GET(code, 2);
 1987         cb.next_item_length = GET(code, 2 + LINK_SIZE);
 1988         cb.capture_top      = 1;
 1989         cb.capture_last     = -1;
 1990         cb.callout_data     = md->callout_data;
 1991         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
 1992         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
 1993         }
 1994       break;
 1995 
 1996 
 1997 /* ========================================================================== */
 1998       default:        /* Unsupported opcode */
 1999       return PCRE_ERROR_DFA_UITEM;
 2000       }
 2001 
 2002     NEXT_ACTIVE_STATE: continue;
 2003 
 2004     }      /* End of loop scanning active states */
 2005 
 2006   /* We have finished the processing at the current subject character. If no
 2007   new states have been set for the next character, we have found all the
 2008   matches that we are going to find. If we are at the top level and partial
 2009   matching has been requested, check for appropriate conditions. */
 2010 
 2011   if (new_count <= 0)
 2012     {
 2013     if (match_count < 0 &&                     /* No matches found */
 2014         rlevel == 1 &&                         /* Top level match function */
 2015         (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */
 2016         ptr >= end_subject &&                  /* Reached end of subject */
 2017         ptr > current_subject)                 /* Matched non-empty string */
 2018       {
 2019       if (offsetcount >= 2)
 2020         {
 2021         offsets[0] = current_subject - start_subject;
 2022         offsets[1] = end_subject - start_subject;
 2023         }
 2024       match_count = PCRE_ERROR_PARTIAL;
 2025       }
 2026 
 2027     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
 2028       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
 2029       rlevel*2-2, SP));
 2030     break;        /* In effect, "return", but see the comment below */
 2031     }
 2032 
 2033   /* One or more states are active for the next character. */
 2034 
 2035   ptr += clen;    /* Advance to next subject character */
 2036   }               /* Loop to move along the subject string */
 2037 
 2038 /* Control gets here from "break" a few lines above. We do it this way because
 2039 if we use "return" above, we have compiler trouble. Some compilers warn if
 2040 there's nothing here because they think the function doesn't return a value. On
 2041 the other hand, if we put a dummy statement here, some more clever compilers
 2042 complain that it can't be reached. Sigh. */
 2043 
 2044 return match_count;
 2045 }
 2046 
 2047 
 2048 
 2049 
 2050 /*************************************************
 2051 *    Execute a Regular Expression - DFA engine   *
 2052 *************************************************/
 2053 
 2054 /* This external function applies a compiled re to a subject string using a DFA
 2055 engine. This function calls the internal function multiple times if the pattern
 2056 is not anchored.
 2057 
 2058 Arguments:
 2059   argument_re     points to the compiled expression
 2060   extra_data      points to extra data or is NULL (not currently used)
 2061   subject         points to the subject string
 2062   length          length of subject string (may contain binary zeros)
 2063   start_offset    where to start in the subject string
 2064   options         option bits
 2065   offsets         vector of match offsets
 2066   offsetcount     size of same
 2067   workspace       workspace vector
 2068   wscount         size of same
 2069 
 2070 Returns:          > 0 => number of match offset pairs placed in offsets
 2071                   = 0 => offsets overflowed; longest matches are present
 2072                    -1 => failed to match
 2073                  < -1 => some kind of unexpected problem
 2074 */
 2075 
 2076 PCRE_DATA_SCOPE int
 2077 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
 2078   const char *subject, int length, int start_offset, int options, int *offsets,
 2079   int offsetcount, int *workspace, int wscount)
 2080 {
 2081 real_pcre *re = (real_pcre *)argument_re;
 2082 dfa_match_data match_block;
 2083 dfa_match_data *md = &match_block;
 2084 BOOL utf8, anchored, startline, firstline;
 2085 const uschar *current_subject, *end_subject, *lcc;
 2086 
 2087 pcre_study_data internal_study;
 2088 const pcre_study_data *study = NULL;
 2089 real_pcre internal_re;
 2090 
 2091 const uschar *req_byte_ptr;
 2092 const uschar *start_bits = NULL;
 2093 BOOL first_byte_caseless = FALSE;
 2094 BOOL req_byte_caseless = FALSE;
 2095 int first_byte = -1;
 2096 int req_byte = -1;
 2097 int req_byte2 = -1;
 2098 int newline;
 2099 
 2100 /* Plausibility checks */
 2101 
 2102 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
 2103 if (re == NULL || subject == NULL || workspace == NULL ||
 2104    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
 2105 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
 2106 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
 2107 
 2108 /* We need to find the pointer to any study data before we test for byte
 2109 flipping, so we scan the extra_data block first. This may set two fields in the
 2110 match block, so we must initialize them beforehand. However, the other fields
 2111 in the match block must not be set until after the byte flipping. */
 2112 
 2113 md->tables = re->tables;
 2114 md->callout_data = NULL;
 2115 
 2116 if (extra_data != NULL)
 2117   {
 2118   unsigned int flags = extra_data->flags;
 2119   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
 2120     study = (const pcre_study_data *)extra_data->study_data;
 2121   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
 2122   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
 2123     return PCRE_ERROR_DFA_UMLIMIT;
 2124   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
 2125     md->callout_data = extra_data->callout_data;
 2126   if ((flags & PCRE_EXTRA_TABLES) != 0)
 2127     md->tables = extra_data->tables;
 2128   }
 2129 
 2130 /* Check that the first field in the block is the magic number. If it is not,
 2131 test for a regex that was compiled on a host of opposite endianness. If this is
 2132 the case, flipped values are put in internal_re and internal_study if there was
 2133 study data too. */
 2134 
 2135 if (re->magic_number != MAGIC_NUMBER)
 2136   {
 2137   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
 2138   if (re == NULL) return PCRE_ERROR_BADMAGIC;
 2139   if (study != NULL) study = &internal_study;
 2140   }
 2141 
 2142 /* Set some local values */
 2143 
 2144 current_subject = (const unsigned char *)subject + start_offset;
 2145 end_subject = (const unsigned char *)subject + length;
 2146 req_byte_ptr = current_subject - 1;
 2147 
 2148 #ifdef SUPPORT_UTF8
 2149 utf8 = (re->options & PCRE_UTF8) != 0;
 2150 #else
 2151 utf8 = FALSE;
 2152 #endif
 2153 
 2154 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
 2155   (re->options & PCRE_ANCHORED) != 0;
 2156 
 2157 /* The remaining fixed data for passing around. */
 2158 
 2159 md->start_code = (const uschar *)argument_re +
 2160     re->name_table_offset + re->name_count * re->name_entry_size;
 2161 md->start_subject = (const unsigned char *)subject;
 2162 md->end_subject = end_subject;
 2163 md->moptions = options;
 2164 md->poptions = re->options;
 2165 
 2166 /* Handle different types of newline. The two bits give four cases. If nothing
 2167 is set at run time, whatever was used at compile time applies. */
 2168 
 2169 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
 2170          PCRE_NEWLINE_BITS)
 2171   {
 2172   case 0: newline = NEWLINE; break;   /* Compile-time default */
 2173   case PCRE_NEWLINE_CR: newline = '\r'; break;
 2174   case PCRE_NEWLINE_LF: newline = '\n'; break;
 2175   case PCRE_NEWLINE_CR+
 2176        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
 2177   case PCRE_NEWLINE_ANY: newline = -1; break;
 2178   default: return PCRE_ERROR_BADNEWLINE;
 2179   }
 2180 
 2181 if (newline < 0)
 2182   {
 2183   md->nltype = NLTYPE_ANY;
 2184   }
 2185 else
 2186   {
 2187   md->nltype = NLTYPE_FIXED;
 2188   if (newline > 255)
 2189     {
 2190     md->nllen = 2;
 2191     md->nl[0] = (newline >> 8) & 255;
 2192     md->nl[1] = newline & 255;
 2193     }
 2194   else
 2195     {
 2196     md->nllen = 1;
 2197     md->nl[0] = newline;
 2198     }
 2199   }
 2200 
 2201 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
 2202 back the character offset. */
 2203 
 2204 #ifdef SUPPORT_UTF8
 2205 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
 2206   {
 2207   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
 2208     return PCRE_ERROR_BADUTF8;
 2209   if (start_offset > 0 && start_offset < length)
 2210     {
 2211     int tb = ((uschar *)subject)[start_offset];
 2212     if (tb > 127)
 2213       {
 2214       tb &= 0xc0;
 2215       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
 2216       }
 2217     }
 2218   }
 2219 #endif
 2220 
 2221 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
 2222 is a feature that makes it possible to save compiled regex and re-use them
 2223 in other programs later. */
 2224 
 2225 if (md->tables == NULL) md->tables = _pcre_default_tables;
 2226 
 2227 /* The lower casing table and the "must be at the start of a line" flag are
 2228 used in a loop when finding where to start. */
 2229 
 2230 lcc = md->tables + lcc_offset;
 2231 startline = (re->options & PCRE_STARTLINE) != 0;
 2232 firstline = (re->options & PCRE_FIRSTLINE) != 0;
 2233 
 2234 /* Set up the first character to match, if available. The first_byte value is
 2235 never set for an anchored regular expression, but the anchoring may be forced
 2236 at run time, so we have to test for anchoring. The first char may be unset for
 2237 an unanchored pattern, of course. If there's no first char and the pattern was
 2238 studied, there may be a bitmap of possible first characters. */
 2239 
 2240 if (!anchored)
 2241   {
 2242   if ((re->options & PCRE_FIRSTSET) != 0)
 2243     {
 2244     first_byte = re->first_byte & 255;
 2245     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
 2246       first_byte = lcc[first_byte];
 2247     }
 2248   else
 2249     {
 2250     if (startline && study != NULL &&
 2251          (study->options & PCRE_STUDY_MAPPED) != 0)
 2252       start_bits = study->start_bits;
 2253     }
 2254   }
 2255 
 2256 /* For anchored or unanchored matches, there may be a "last known required
 2257 character" set. */
 2258 
 2259 if ((re->options & PCRE_REQCHSET) != 0)
 2260   {
 2261   req_byte = re->req_byte & 255;
 2262   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
 2263   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
 2264   }
 2265 
 2266 /* Call the main matching function, looping for a non-anchored regex after a
 2267 failed match. Unless restarting, optimize by moving to the first match
 2268 character if possible, when not anchored. Then unless wanting a partial match,
 2269 check for a required later character. */
 2270 
 2271 for (;;)
 2272   {
 2273   int rc;
 2274 
 2275   if ((options & PCRE_DFA_RESTART) == 0)
 2276     {
 2277     const uschar *save_end_subject = end_subject;
 2278 
 2279     /* Advance to a unique first char if possible. If firstline is TRUE, the
 2280     start of the match is constrained to the first line of a multiline string.
 2281     Implement this by temporarily adjusting end_subject so that we stop
 2282     scanning at a newline. If the match fails at the newline, later code breaks
 2283     this loop. */
 2284 
 2285     if (firstline)
 2286       {
 2287       const uschar *t = current_subject;
 2288       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
 2289       end_subject = t;
 2290       }
 2291 
 2292     if (first_byte >= 0)
 2293       {
 2294       if (first_byte_caseless)
 2295         while (current_subject < end_subject &&
 2296                lcc[*current_subject] != first_byte)
 2297           current_subject++;
 2298       else
 2299         while (current_subject < end_subject && *current_subject != first_byte)
 2300           current_subject++;
 2301       }
 2302 
 2303     /* Or to just after a linebreak for a multiline match if possible */
 2304 
 2305     else if (startline)
 2306       {
 2307       if (current_subject > md->start_subject + start_offset)
 2308         {
 2309         while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
 2310           current_subject++;
 2311         }
 2312       }
 2313 
 2314     /* Or to a non-unique first char after study */
 2315 
 2316     else if (start_bits != NULL)
 2317       {
 2318       while (current_subject < end_subject)
 2319         {
 2320         register unsigned int c = *current_subject;
 2321         if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
 2322           else break;
 2323         }
 2324       }
 2325 
 2326     /* Restore fudged end_subject */
 2327 
 2328     end_subject = save_end_subject;
 2329     }
 2330 
 2331   /* If req_byte is set, we know that that character must appear in the subject
 2332   for the match to succeed. If the first character is set, req_byte must be
 2333   later in the subject; otherwise the test starts at the match point. This
 2334   optimization can save a huge amount of work in patterns with nested unlimited
 2335   repeats that aren't going to match. Writing separate code for cased/caseless
 2336   versions makes it go faster, as does using an autoincrement and backing off
 2337   on a match.
 2338 
 2339   HOWEVER: when the subject string is very, very long, searching to its end can
 2340   take a long time, and give bad performance on quite ordinary patterns. This
 2341   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
 2342   don't do this when the string is sufficiently long.
 2343 
 2344   ALSO: this processing is disabled when partial matching is requested.
 2345   */
 2346 
 2347   if (req_byte >= 0 &&
 2348       end_subject - current_subject < REQ_BYTE_MAX &&
 2349       (options & PCRE_PARTIAL) == 0)
 2350     {
 2351     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
 2352 
 2353     /* We don't need to repeat the search if we haven't yet reached the
 2354     place we found it at last time. */
 2355 
 2356     if (p > req_byte_ptr)
 2357       {
 2358       if (req_byte_caseless)
 2359         {
 2360         while (p < end_subject)
 2361           {
 2362           register int pp = *p++;
 2363           if (pp == req_byte || pp == req_byte2) { p--; break; }
 2364           }
 2365         }
 2366       else
 2367         {
 2368         while (p < end_subject)
 2369           {
 2370           if (*p++ == req_byte) { p--; break; }
 2371           }
 2372         }
 2373 
 2374       /* If we can't find the required character, break the matching loop,
 2375       which will cause a return or PCRE_ERROR_NOMATCH. */
 2376 
 2377       if (p >= end_subject) break;
 2378 
 2379       /* If we have found the required character, save the point where we
 2380       found it, so that we don't search again next time round the loop if
 2381       the start hasn't passed this character yet. */
 2382 
 2383       req_byte_ptr = p;
 2384       }
 2385     }
 2386 
 2387   /* OK, now we can do the business */
 2388 
 2389   rc = internal_dfa_exec(
 2390     md,                                /* fixed match data */
 2391     md->start_code,                    /* this subexpression's code */
 2392     current_subject,                   /* where we currently are */
 2393     start_offset,                      /* start offset in subject */
 2394     offsets,                           /* offset vector */
 2395     offsetcount,                       /* size of same */
 2396     workspace,                         /* workspace vector */
 2397     wscount,                           /* size of same */
 2398     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
 2399     0,                                 /* function recurse level */
 2400     0);                                /* regex recurse level */
 2401 
 2402   /* Anything other than "no match" means we are done, always; otherwise, carry
 2403   on only if not anchored. */
 2404 
 2405   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
 2406 
 2407   /* Advance to the next subject character unless we are at the end of a line
 2408   and firstline is set. */
 2409 
 2410   if (firstline && IS_NEWLINE(current_subject)) break;
 2411   current_subject++;
 2412   if (utf8)
 2413     {
 2414     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
 2415       current_subject++;
 2416     }
 2417   if (current_subject > end_subject) break;
 2418 
 2419   /* If we have just passed a CR and the newline option is CRLF or ANY, and we
 2420   are now at a LF, advance the match position by one more character. */
 2421 
 2422   if (current_subject[-1] == '\r' &&
 2423        (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
 2424        current_subject < end_subject &&
 2425        *current_subject == '\n')
 2426     current_subject++;
 2427 
 2428   }   /* "Bumpalong" loop */
 2429 
 2430 return PCRE_ERROR_NOMATCH;
 2431 }
 2432 
 2433 /* End of pcre_dfa_exec.c */