"Fossies" - the Fresh Open Source Software Archive

Member "pcre-8.42/pcre_study.c" (28 Feb 2016, 48586 Bytes) of package /linux/misc/pcre-8.42.tar.bz2:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "pcre_study.c" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 8.38_vs_8.39.

    1 /*************************************************
    2 *      Perl-Compatible Regular Expressions       *
    3 *************************************************/
    4 
    5 /* PCRE is a library of functions to support regular expressions whose syntax
    6 and semantics are as close as possible to those of the Perl 5 language.
    7 
    8                        Written by Philip Hazel
    9            Copyright (c) 1997-2012 University of Cambridge
   10 
   11 -----------------------------------------------------------------------------
   12 Redistribution and use in source and binary forms, with or without
   13 modification, are permitted provided that the following conditions are met:
   14 
   15     * Redistributions of source code must retain the above copyright notice,
   16       this list of conditions and the following disclaimer.
   17 
   18     * Redistributions in binary form must reproduce the above copyright
   19       notice, this list of conditions and the following disclaimer in the
   20       documentation and/or other materials provided with the distribution.
   21 
   22     * Neither the name of the University of Cambridge nor the names of its
   23       contributors may be used to endorse or promote products derived from
   24       this software without specific prior written permission.
   25 
   26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   36 POSSIBILITY OF SUCH DAMAGE.
   37 -----------------------------------------------------------------------------
   38 */
   39 
   40 
   41 /* This module contains the external function pcre_study(), along with local
   42 supporting functions. */
   43 
   44 
   45 #ifdef HAVE_CONFIG_H
   46 #include "config.h"
   47 #endif
   48 
   49 #include "pcre_internal.h"
   50 
   51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
   52 
   53 /* Returns from set_start_bits() */
   54 
   55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
   56 
   57 
   58 
   59 /*************************************************
   60 *   Find the minimum subject length for a group  *
   61 *************************************************/
   62 
   63 /* Scan a parenthesized group and compute the minimum length of subject that
   64 is needed to match it. This is a lower bound; it does not mean there is a
   65 string of that length that matches. In UTF8 mode, the result is in characters
   66 rather than bytes.
   67 
   68 Arguments:
   69   re              compiled pattern block
   70   code            pointer to start of group (the bracket)
   71   startcode       pointer to start of the whole pattern's code
   72   options         the compiling options
   73   recurses        chain of recurse_check to catch mutual recursion
   74   countptr        pointer to call count (to catch over complexity)
   75 
   76 Returns:   the minimum length
   77            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
   78            -2 internal error (missing capturing bracket)
   79            -3 internal error (opcode not listed)
   80 */
   81 
   82 static int
   83 find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
   84   const pcre_uchar *startcode, int options, recurse_check *recurses,
   85   int *countptr)
   86 {
   87 int length = -1;
   88 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
   89 BOOL utf = (options & PCRE_UTF8) != 0;
   90 BOOL had_recurse = FALSE;
   91 recurse_check this_recurse;
   92 register int branchlength = 0;
   93 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
   94 
   95 if ((*countptr)++ > 1000) return -1;   /* too complex */
   96 
   97 if (*code == OP_CBRA || *code == OP_SCBRA ||
   98     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
   99 
  100 /* Scan along the opcodes for this branch. If we get to the end of the
  101 branch, check the length against that of the other branches. */
  102 
  103 for (;;)
  104   {
  105   int d, min;
  106   pcre_uchar *cs, *ce;
  107   register pcre_uchar op = *cc;
  108 
  109   switch (op)
  110     {
  111     case OP_COND:
  112     case OP_SCOND:
  113 
  114     /* If there is only one branch in a condition, the implied branch has zero
  115     length, so we don't add anything. This covers the DEFINE "condition"
  116     automatically. */
  117 
  118     cs = cc + GET(cc, 1);
  119     if (*cs != OP_ALT)
  120       {
  121       cc = cs + 1 + LINK_SIZE;
  122       break;
  123       }
  124 
  125     /* Otherwise we can fall through and treat it the same as any other
  126     subpattern. */
  127 
  128     case OP_CBRA:
  129     case OP_SCBRA:
  130     case OP_BRA:
  131     case OP_SBRA:
  132     case OP_CBRAPOS:
  133     case OP_SCBRAPOS:
  134     case OP_BRAPOS:
  135     case OP_SBRAPOS:
  136     case OP_ONCE:
  137     case OP_ONCE_NC:
  138     d = find_minlength(re, cc, startcode, options, recurses, countptr);
  139     if (d < 0) return d;
  140     branchlength += d;
  141     do cc += GET(cc, 1); while (*cc == OP_ALT);
  142     cc += 1 + LINK_SIZE;
  143     break;
  144 
  145     /* ACCEPT makes things far too complicated; we have to give up. */
  146 
  147     case OP_ACCEPT:
  148     case OP_ASSERT_ACCEPT:
  149     return -1;
  150 
  151     /* Reached end of a branch; if it's a ket it is the end of a nested
  152     call. If it's ALT it is an alternation in a nested call. If it is END it's
  153     the end of the outer call. All can be handled by the same code. If an
  154     ACCEPT was previously encountered, use the length that was in force at that
  155     time, and pass back the shortest ACCEPT length. */
  156 
  157     case OP_ALT:
  158     case OP_KET:
  159     case OP_KETRMAX:
  160     case OP_KETRMIN:
  161     case OP_KETRPOS:
  162     case OP_END:
  163     if (length < 0 || (!had_recurse && branchlength < length))
  164       length = branchlength;
  165     if (op != OP_ALT) return length;
  166     cc += 1 + LINK_SIZE;
  167     branchlength = 0;
  168     had_recurse = FALSE;
  169     break;
  170 
  171     /* Skip over assertive subpatterns */
  172 
  173     case OP_ASSERT:
  174     case OP_ASSERT_NOT:
  175     case OP_ASSERTBACK:
  176     case OP_ASSERTBACK_NOT:
  177     do cc += GET(cc, 1); while (*cc == OP_ALT);
  178     /* Fall through */
  179 
  180     /* Skip over things that don't match chars */
  181 
  182     case OP_REVERSE:
  183     case OP_CREF:
  184     case OP_DNCREF:
  185     case OP_RREF:
  186     case OP_DNRREF:
  187     case OP_DEF:
  188     case OP_CALLOUT:
  189     case OP_SOD:
  190     case OP_SOM:
  191     case OP_EOD:
  192     case OP_EODN:
  193     case OP_CIRC:
  194     case OP_CIRCM:
  195     case OP_DOLL:
  196     case OP_DOLLM:
  197     case OP_NOT_WORD_BOUNDARY:
  198     case OP_WORD_BOUNDARY:
  199     cc += PRIV(OP_lengths)[*cc];
  200     break;
  201 
  202     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
  203 
  204     case OP_BRAZERO:
  205     case OP_BRAMINZERO:
  206     case OP_BRAPOSZERO:
  207     case OP_SKIPZERO:
  208     cc += PRIV(OP_lengths)[*cc];
  209     do cc += GET(cc, 1); while (*cc == OP_ALT);
  210     cc += 1 + LINK_SIZE;
  211     break;
  212 
  213     /* Handle literal characters and + repetitions */
  214 
  215     case OP_CHAR:
  216     case OP_CHARI:
  217     case OP_NOT:
  218     case OP_NOTI:
  219     case OP_PLUS:
  220     case OP_PLUSI:
  221     case OP_MINPLUS:
  222     case OP_MINPLUSI:
  223     case OP_POSPLUS:
  224     case OP_POSPLUSI:
  225     case OP_NOTPLUS:
  226     case OP_NOTPLUSI:
  227     case OP_NOTMINPLUS:
  228     case OP_NOTMINPLUSI:
  229     case OP_NOTPOSPLUS:
  230     case OP_NOTPOSPLUSI:
  231     branchlength++;
  232     cc += 2;
  233 #ifdef SUPPORT_UTF
  234     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
  235 #endif
  236     break;
  237 
  238     case OP_TYPEPLUS:
  239     case OP_TYPEMINPLUS:
  240     case OP_TYPEPOSPLUS:
  241     branchlength++;
  242     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
  243     break;
  244 
  245     /* Handle exact repetitions. The count is already in characters, but we
  246     need to skip over a multibyte character in UTF8 mode.  */
  247 
  248     case OP_EXACT:
  249     case OP_EXACTI:
  250     case OP_NOTEXACT:
  251     case OP_NOTEXACTI:
  252     branchlength += GET2(cc,1);
  253     cc += 2 + IMM2_SIZE;
  254 #ifdef SUPPORT_UTF
  255     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
  256 #endif
  257     break;
  258 
  259     case OP_TYPEEXACT:
  260     branchlength += GET2(cc,1);
  261     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
  262       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
  263     break;
  264 
  265     /* Handle single-char non-literal matchers */
  266 
  267     case OP_PROP:
  268     case OP_NOTPROP:
  269     cc += 2;
  270     /* Fall through */
  271 
  272     case OP_NOT_DIGIT:
  273     case OP_DIGIT:
  274     case OP_NOT_WHITESPACE:
  275     case OP_WHITESPACE:
  276     case OP_NOT_WORDCHAR:
  277     case OP_WORDCHAR:
  278     case OP_ANY:
  279     case OP_ALLANY:
  280     case OP_EXTUNI:
  281     case OP_HSPACE:
  282     case OP_NOT_HSPACE:
  283     case OP_VSPACE:
  284     case OP_NOT_VSPACE:
  285     branchlength++;
  286     cc++;
  287     break;
  288 
  289     /* "Any newline" might match two characters, but it also might match just
  290     one. */
  291 
  292     case OP_ANYNL:
  293     branchlength += 1;
  294     cc++;
  295     break;
  296 
  297     /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
  298     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
  299     appear, but leave the code, just in case.) */
  300 
  301     case OP_ANYBYTE:
  302 #ifdef SUPPORT_UTF
  303     if (utf) return -1;
  304 #endif
  305     branchlength++;
  306     cc++;
  307     break;
  308 
  309     /* For repeated character types, we have to test for \p and \P, which have
  310     an extra two bytes of parameters. */
  311 
  312     case OP_TYPESTAR:
  313     case OP_TYPEMINSTAR:
  314     case OP_TYPEQUERY:
  315     case OP_TYPEMINQUERY:
  316     case OP_TYPEPOSSTAR:
  317     case OP_TYPEPOSQUERY:
  318     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
  319     cc += PRIV(OP_lengths)[op];
  320     break;
  321 
  322     case OP_TYPEUPTO:
  323     case OP_TYPEMINUPTO:
  324     case OP_TYPEPOSUPTO:
  325     if (cc[1 + IMM2_SIZE] == OP_PROP
  326       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
  327     cc += PRIV(OP_lengths)[op];
  328     break;
  329 
  330     /* Check a class for variable quantification */
  331 
  332     case OP_CLASS:
  333     case OP_NCLASS:
  334 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
  335     case OP_XCLASS:
  336     /* The original code caused an unsigned overflow in 64 bit systems,
  337     so now we use a conditional statement. */
  338     if (op == OP_XCLASS)
  339       cc += GET(cc, 1);
  340     else
  341       cc += PRIV(OP_lengths)[OP_CLASS];
  342 #else
  343     cc += PRIV(OP_lengths)[OP_CLASS];
  344 #endif
  345 
  346     switch (*cc)
  347       {
  348       case OP_CRPLUS:
  349       case OP_CRMINPLUS:
  350       case OP_CRPOSPLUS:
  351       branchlength++;
  352       /* Fall through */
  353 
  354       case OP_CRSTAR:
  355       case OP_CRMINSTAR:
  356       case OP_CRQUERY:
  357       case OP_CRMINQUERY:
  358       case OP_CRPOSSTAR:
  359       case OP_CRPOSQUERY:
  360       cc++;
  361       break;
  362 
  363       case OP_CRRANGE:
  364       case OP_CRMINRANGE:
  365       case OP_CRPOSRANGE:
  366       branchlength += GET2(cc,1);
  367       cc += 1 + 2 * IMM2_SIZE;
  368       break;
  369 
  370       default:
  371       branchlength++;
  372       break;
  373       }
  374     break;
  375 
  376     /* Backreferences and subroutine calls are treated in the same way: we find
  377     the minimum length for the subpattern. A recursion, however, causes an
  378     a flag to be set that causes the length of this branch to be ignored. The
  379     logic is that a recursion can only make sense if there is another
  380     alternation that stops the recursing. That will provide the minimum length
  381     (when no recursion happens). A backreference within the group that it is
  382     referencing behaves in the same way.
  383 
  384     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
  385     matches an empty string (by default it causes a matching failure), so in
  386     that case we must set the minimum length to zero. */
  387 
  388     case OP_DNREF:     /* Duplicate named pattern back reference */
  389     case OP_DNREFI:
  390     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
  391       {
  392       int count = GET2(cc, 1+IMM2_SIZE);
  393       pcre_uchar *slot = (pcre_uchar *)re +
  394         re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
  395       d = INT_MAX;
  396       while (count-- > 0)
  397         {
  398         ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
  399         if (cs == NULL) return -2;
  400         do ce += GET(ce, 1); while (*ce == OP_ALT);
  401         if (cc > cs && cc < ce)     /* Simple recursion */
  402           {
  403           d = 0;
  404           had_recurse = TRUE;
  405           break;
  406           }
  407         else
  408           {
  409           recurse_check *r = recurses;
  410           for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
  411           if (r != NULL)           /* Mutual recursion */
  412             {
  413             d = 0;
  414             had_recurse = TRUE;
  415             break;
  416             }
  417           else
  418             {
  419             int dd;
  420             this_recurse.prev = recurses;
  421             this_recurse.group = cs;
  422             dd = find_minlength(re, cs, startcode, options, &this_recurse,
  423               countptr);
  424             if (dd < d) d = dd;
  425             }
  426           }
  427         slot += re->name_entry_size;
  428         }
  429       }
  430     else d = 0;
  431     cc += 1 + 2*IMM2_SIZE;
  432     goto REPEAT_BACK_REFERENCE;
  433 
  434     case OP_REF:      /* Single back reference */
  435     case OP_REFI:
  436     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
  437       {
  438       ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
  439       if (cs == NULL) return -2;
  440       do ce += GET(ce, 1); while (*ce == OP_ALT);
  441       if (cc > cs && cc < ce)    /* Simple recursion */
  442         {
  443         d = 0;
  444         had_recurse = TRUE;
  445         }
  446       else
  447         {
  448         recurse_check *r = recurses;
  449         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
  450         if (r != NULL)           /* Mutual recursion */
  451           {
  452           d = 0;
  453           had_recurse = TRUE;
  454           }
  455         else
  456           {
  457           this_recurse.prev = recurses;
  458           this_recurse.group = cs;
  459           d = find_minlength(re, cs, startcode, options, &this_recurse,
  460             countptr);
  461           }
  462         }
  463       }
  464     else d = 0;
  465     cc += 1 + IMM2_SIZE;
  466 
  467     /* Handle repeated back references */
  468 
  469     REPEAT_BACK_REFERENCE:
  470     switch (*cc)
  471       {
  472       case OP_CRSTAR:
  473       case OP_CRMINSTAR:
  474       case OP_CRQUERY:
  475       case OP_CRMINQUERY:
  476       case OP_CRPOSSTAR:
  477       case OP_CRPOSQUERY:
  478       min = 0;
  479       cc++;
  480       break;
  481 
  482       case OP_CRPLUS:
  483       case OP_CRMINPLUS:
  484       case OP_CRPOSPLUS:
  485       min = 1;
  486       cc++;
  487       break;
  488 
  489       case OP_CRRANGE:
  490       case OP_CRMINRANGE:
  491       case OP_CRPOSRANGE:
  492       min = GET2(cc, 1);
  493       cc += 1 + 2 * IMM2_SIZE;
  494       break;
  495 
  496       default:
  497       min = 1;
  498       break;
  499       }
  500 
  501     branchlength += min * d;
  502     break;
  503 
  504     /* We can easily detect direct recursion, but not mutual recursion. This is
  505     caught by a recursion depth count. */
  506 
  507     case OP_RECURSE:
  508     cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
  509     do ce += GET(ce, 1); while (*ce == OP_ALT);
  510     if (cc > cs && cc < ce)    /* Simple recursion */
  511       had_recurse = TRUE;
  512     else
  513       {
  514       recurse_check *r = recurses;
  515       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
  516       if (r != NULL)           /* Mutual recursion */
  517         had_recurse = TRUE;
  518       else
  519         {
  520         this_recurse.prev = recurses;
  521         this_recurse.group = cs;
  522         branchlength += find_minlength(re, cs, startcode, options,
  523           &this_recurse, countptr);
  524         }
  525       }
  526     cc += 1 + LINK_SIZE;
  527     break;
  528 
  529     /* Anything else does not or need not match a character. We can get the
  530     item's length from the table, but for those that can match zero occurrences
  531     of a character, we must take special action for UTF-8 characters. As it
  532     happens, the "NOT" versions of these opcodes are used at present only for
  533     ASCII characters, so they could be omitted from this list. However, in
  534     future that may change, so we include them here so as not to leave a
  535     gotcha for a future maintainer. */
  536 
  537     case OP_UPTO:
  538     case OP_UPTOI:
  539     case OP_NOTUPTO:
  540     case OP_NOTUPTOI:
  541     case OP_MINUPTO:
  542     case OP_MINUPTOI:
  543     case OP_NOTMINUPTO:
  544     case OP_NOTMINUPTOI:
  545     case OP_POSUPTO:
  546     case OP_POSUPTOI:
  547     case OP_NOTPOSUPTO:
  548     case OP_NOTPOSUPTOI:
  549 
  550     case OP_STAR:
  551     case OP_STARI:
  552     case OP_NOTSTAR:
  553     case OP_NOTSTARI:
  554     case OP_MINSTAR:
  555     case OP_MINSTARI:
  556     case OP_NOTMINSTAR:
  557     case OP_NOTMINSTARI:
  558     case OP_POSSTAR:
  559     case OP_POSSTARI:
  560     case OP_NOTPOSSTAR:
  561     case OP_NOTPOSSTARI:
  562 
  563     case OP_QUERY:
  564     case OP_QUERYI:
  565     case OP_NOTQUERY:
  566     case OP_NOTQUERYI:
  567     case OP_MINQUERY:
  568     case OP_MINQUERYI:
  569     case OP_NOTMINQUERY:
  570     case OP_NOTMINQUERYI:
  571     case OP_POSQUERY:
  572     case OP_POSQUERYI:
  573     case OP_NOTPOSQUERY:
  574     case OP_NOTPOSQUERYI:
  575 
  576     cc += PRIV(OP_lengths)[op];
  577 #ifdef SUPPORT_UTF
  578     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
  579 #endif
  580     break;
  581 
  582     /* Skip these, but we need to add in the name length. */
  583 
  584     case OP_MARK:
  585     case OP_PRUNE_ARG:
  586     case OP_SKIP_ARG:
  587     case OP_THEN_ARG:
  588     cc += PRIV(OP_lengths)[op] + cc[1];
  589     break;
  590 
  591     /* The remaining opcodes are just skipped over. */
  592 
  593     case OP_CLOSE:
  594     case OP_COMMIT:
  595     case OP_FAIL:
  596     case OP_PRUNE:
  597     case OP_SET_SOM:
  598     case OP_SKIP:
  599     case OP_THEN:
  600     cc += PRIV(OP_lengths)[op];
  601     break;
  602 
  603     /* This should not occur: we list all opcodes explicitly so that when
  604     new ones get added they are properly considered. */
  605 
  606     default:
  607     return -3;
  608     }
  609   }
  610 /* Control never gets here */
  611 }
  612 
  613 
  614 
  615 /*************************************************
  616 *      Set a bit and maybe its alternate case    *
  617 *************************************************/
  618 
  619 /* Given a character, set its first byte's bit in the table, and also the
  620 corresponding bit for the other version of a letter if we are caseless. In
  621 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
  622 when Unicode property support is available.
  623 
  624 Arguments:
  625   start_bits    points to the bit map
  626   p             points to the character
  627   caseless      the caseless flag
  628   cd            the block with char table pointers
  629   utf           TRUE for UTF-8 / UTF-16 / UTF-32 mode
  630 
  631 Returns:        pointer after the character
  632 */
  633 
  634 static const pcre_uchar *
  635 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
  636   compile_data *cd, BOOL utf)
  637 {
  638 pcre_uint32 c = *p;
  639 
  640 #ifdef COMPILE_PCRE8
  641 SET_BIT(c);
  642 
  643 #ifdef SUPPORT_UTF
  644 if (utf && c > 127)
  645   {
  646   GETCHARINC(c, p);
  647 #ifdef SUPPORT_UCP
  648   if (caseless)
  649     {
  650     pcre_uchar buff[6];
  651     c = UCD_OTHERCASE(c);
  652     (void)PRIV(ord2utf)(c, buff);
  653     SET_BIT(buff[0]);
  654     }
  655 #endif  /* Not SUPPORT_UCP */
  656   return p;
  657   }
  658 #else   /* Not SUPPORT_UTF */
  659 (void)(utf);   /* Stops warning for unused parameter */
  660 #endif  /* SUPPORT_UTF */
  661 
  662 /* Not UTF-8 mode, or character is less than 127. */
  663 
  664 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
  665 return p + 1;
  666 #endif  /* COMPILE_PCRE8 */
  667 
  668 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
  669 if (c > 0xff)
  670   {
  671   c = 0xff;
  672   caseless = FALSE;
  673   }
  674 SET_BIT(c);
  675 
  676 #ifdef SUPPORT_UTF
  677 if (utf && c > 127)
  678   {
  679   GETCHARINC(c, p);
  680 #ifdef SUPPORT_UCP
  681   if (caseless)
  682     {
  683     c = UCD_OTHERCASE(c);
  684     if (c > 0xff)
  685       c = 0xff;
  686     SET_BIT(c);
  687     }
  688 #endif  /* SUPPORT_UCP */
  689   return p;
  690   }
  691 #else   /* Not SUPPORT_UTF */
  692 (void)(utf);   /* Stops warning for unused parameter */
  693 #endif  /* SUPPORT_UTF */
  694 
  695 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
  696 return p + 1;
  697 #endif
  698 }
  699 
  700 
  701 
  702 /*************************************************
  703 *     Set bits for a positive character type     *
  704 *************************************************/
  705 
  706 /* This function sets starting bits for a character type. In UTF-8 mode, we can
  707 only do a direct setting for bytes less than 128, as otherwise there can be
  708 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
  709 environment, the tables will only recognize ASCII characters anyway, but in at
  710 least one Windows environment, some higher bytes bits were set in the tables.
  711 So we deal with that case by considering the UTF-8 encoding.
  712 
  713 Arguments:
  714   start_bits     the starting bitmap
  715   cbit type      the type of character wanted
  716   table_limit    32 for non-UTF-8; 16 for UTF-8
  717   cd             the block with char table pointers
  718 
  719 Returns:         nothing
  720 */
  721 
  722 static void
  723 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
  724   compile_data *cd)
  725 {
  726 register pcre_uint32 c;
  727 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
  728 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
  729 if (table_limit == 32) return;
  730 for (c = 128; c < 256; c++)
  731   {
  732   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
  733     {
  734     pcre_uchar buff[6];
  735     (void)PRIV(ord2utf)(c, buff);
  736     SET_BIT(buff[0]);
  737     }
  738   }
  739 #endif
  740 }
  741 
  742 
  743 /*************************************************
  744 *     Set bits for a negative character type     *
  745 *************************************************/
  746 
  747 /* This function sets starting bits for a negative character type such as \D.
  748 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
  749 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
  750 Unlike in the positive case, where we can set appropriate starting bits for
  751 specific high-valued UTF-8 characters, in this case we have to set the bits for
  752 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
  753 0xc0 (192) for simplicity.
  754 
  755 Arguments:
  756   start_bits     the starting bitmap
  757   cbit type      the type of character wanted
  758   table_limit    32 for non-UTF-8; 16 for UTF-8
  759   cd             the block with char table pointers
  760 
  761 Returns:         nothing
  762 */
  763 
  764 static void
  765 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
  766   compile_data *cd)
  767 {
  768 register pcre_uint32 c;
  769 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
  770 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
  771 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
  772 #endif
  773 }
  774 
  775 
  776 
  777 /*************************************************
  778 *          Create bitmap of starting bytes       *
  779 *************************************************/
  780 
  781 /* This function scans a compiled unanchored expression recursively and
  782 attempts to build a bitmap of the set of possible starting bytes. As time goes
  783 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
  784 useful for parenthesized groups in patterns such as (a*)b where the group
  785 provides some optional starting bytes but scanning must continue at the outer
  786 level to find at least one mandatory byte. At the outermost level, this
  787 function fails unless the result is SSB_DONE.
  788 
  789 Arguments:
  790   code         points to an expression
  791   start_bits   points to a 32-byte table, initialized to 0
  792   utf          TRUE if in UTF-8 / UTF-16 / UTF-32 mode
  793   cd           the block with char table pointers
  794 
  795 Returns:       SSB_FAIL     => Failed to find any starting bytes
  796                SSB_DONE     => Found mandatory starting bytes
  797                SSB_CONTINUE => Found optional starting bytes
  798                SSB_UNKNOWN  => Hit an unrecognized opcode
  799 */
  800 
  801 static int
  802 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
  803   compile_data *cd)
  804 {
  805 register pcre_uint32 c;
  806 int yield = SSB_DONE;
  807 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
  808 int table_limit = utf? 16:32;
  809 #else
  810 int table_limit = 32;
  811 #endif
  812 
  813 #if 0
  814 /* ========================================================================= */
  815 /* The following comment and code was inserted in January 1999. In May 2006,
  816 when it was observed to cause compiler warnings about unused values, I took it
  817 out again. If anybody is still using OS/2, they will have to put it back
  818 manually. */
  819 
  820 /* This next statement and the later reference to dummy are here in order to
  821 trick the optimizer of the IBM C compiler for OS/2 into generating correct
  822 code. Apparently IBM isn't going to fix the problem, and we would rather not
  823 disable optimization (in this module it actually makes a big difference, and
  824 the pcre module can use all the optimization it can get). */
  825 
  826 volatile int dummy;
  827 /* ========================================================================= */
  828 #endif
  829 
  830 do
  831   {
  832   BOOL try_next = TRUE;
  833   const pcre_uchar *tcode = code + 1 + LINK_SIZE;
  834 
  835   if (*code == OP_CBRA || *code == OP_SCBRA ||
  836       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
  837 
  838   while (try_next)    /* Loop for items in this branch */
  839     {
  840     int rc;
  841 
  842     switch(*tcode)
  843       {
  844       /* If we reach something we don't understand, it means a new opcode has
  845       been created that hasn't been added to this code. Hopefully this problem
  846       will be discovered during testing. */
  847 
  848       default:
  849       return SSB_UNKNOWN;
  850 
  851       /* Fail for a valid opcode that implies no starting bits. */
  852 
  853       case OP_ACCEPT:
  854       case OP_ASSERT_ACCEPT:
  855       case OP_ALLANY:
  856       case OP_ANY:
  857       case OP_ANYBYTE:
  858       case OP_CIRC:
  859       case OP_CIRCM:
  860       case OP_CLOSE:
  861       case OP_COMMIT:
  862       case OP_COND:
  863       case OP_CREF:
  864       case OP_DEF:
  865       case OP_DNCREF:
  866       case OP_DNREF:
  867       case OP_DNREFI:
  868       case OP_DNRREF:
  869       case OP_DOLL:
  870       case OP_DOLLM:
  871       case OP_END:
  872       case OP_EOD:
  873       case OP_EODN:
  874       case OP_EXTUNI:
  875       case OP_FAIL:
  876       case OP_MARK:
  877       case OP_NOT:
  878       case OP_NOTEXACT:
  879       case OP_NOTEXACTI:
  880       case OP_NOTI:
  881       case OP_NOTMINPLUS:
  882       case OP_NOTMINPLUSI:
  883       case OP_NOTMINQUERY:
  884       case OP_NOTMINQUERYI:
  885       case OP_NOTMINSTAR:
  886       case OP_NOTMINSTARI:
  887       case OP_NOTMINUPTO:
  888       case OP_NOTMINUPTOI:
  889       case OP_NOTPLUS:
  890       case OP_NOTPLUSI:
  891       case OP_NOTPOSPLUS:
  892       case OP_NOTPOSPLUSI:
  893       case OP_NOTPOSQUERY:
  894       case OP_NOTPOSQUERYI:
  895       case OP_NOTPOSSTAR:
  896       case OP_NOTPOSSTARI:
  897       case OP_NOTPOSUPTO:
  898       case OP_NOTPOSUPTOI:
  899       case OP_NOTPROP:
  900       case OP_NOTQUERY:
  901       case OP_NOTQUERYI:
  902       case OP_NOTSTAR:
  903       case OP_NOTSTARI:
  904       case OP_NOTUPTO:
  905       case OP_NOTUPTOI:
  906       case OP_NOT_HSPACE:
  907       case OP_NOT_VSPACE:
  908       case OP_PRUNE:
  909       case OP_PRUNE_ARG:
  910       case OP_RECURSE:
  911       case OP_REF:
  912       case OP_REFI:
  913       case OP_REVERSE:
  914       case OP_RREF:
  915       case OP_SCOND:
  916       case OP_SET_SOM:
  917       case OP_SKIP:
  918       case OP_SKIP_ARG:
  919       case OP_SOD:
  920       case OP_SOM:
  921       case OP_THEN:
  922       case OP_THEN_ARG:
  923       return SSB_FAIL;
  924 
  925       /* A "real" property test implies no starting bits, but the fake property
  926       PT_CLIST identifies a list of characters. These lists are short, as they
  927       are used for characters with more than one "other case", so there is no
  928       point in recognizing them for OP_NOTPROP. */
  929 
  930       case OP_PROP:
  931       if (tcode[1] != PT_CLIST) return SSB_FAIL;
  932         {
  933         const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
  934         while ((c = *p++) < NOTACHAR)
  935           {
  936 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
  937           if (utf)
  938             {
  939             pcre_uchar buff[6];
  940             (void)PRIV(ord2utf)(c, buff);
  941             c = buff[0];
  942             }
  943 #endif
  944           if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
  945           }
  946         }
  947       try_next = FALSE;
  948       break;
  949 
  950       /* We can ignore word boundary tests. */
  951 
  952       case OP_WORD_BOUNDARY:
  953       case OP_NOT_WORD_BOUNDARY:
  954       tcode++;
  955       break;
  956 
  957       /* If we hit a bracket or a positive lookahead assertion, recurse to set
  958       bits from within the subpattern. If it can't find anything, we have to
  959       give up. If it finds some mandatory character(s), we are done for this
  960       branch. Otherwise, carry on scanning after the subpattern. */
  961 
  962       case OP_BRA:
  963       case OP_SBRA:
  964       case OP_CBRA:
  965       case OP_SCBRA:
  966       case OP_BRAPOS:
  967       case OP_SBRAPOS:
  968       case OP_CBRAPOS:
  969       case OP_SCBRAPOS:
  970       case OP_ONCE:
  971       case OP_ONCE_NC:
  972       case OP_ASSERT:
  973       rc = set_start_bits(tcode, start_bits, utf, cd);
  974       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
  975       if (rc == SSB_DONE) try_next = FALSE; else
  976         {
  977         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
  978         tcode += 1 + LINK_SIZE;
  979         }
  980       break;
  981 
  982       /* If we hit ALT or KET, it means we haven't found anything mandatory in
  983       this branch, though we might have found something optional. For ALT, we
  984       continue with the next alternative, but we have to arrange that the final
  985       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
  986       return SSB_CONTINUE: if this is the top level, that indicates failure,
  987       but after a nested subpattern, it causes scanning to continue. */
  988 
  989       case OP_ALT:
  990       yield = SSB_CONTINUE;
  991       try_next = FALSE;
  992       break;
  993 
  994       case OP_KET:
  995       case OP_KETRMAX:
  996       case OP_KETRMIN:
  997       case OP_KETRPOS:
  998       return SSB_CONTINUE;
  999 
 1000       /* Skip over callout */
 1001 
 1002       case OP_CALLOUT:
 1003       tcode += 2 + 2*LINK_SIZE;
 1004       break;
 1005 
 1006       /* Skip over lookbehind and negative lookahead assertions */
 1007 
 1008       case OP_ASSERT_NOT:
 1009       case OP_ASSERTBACK:
 1010       case OP_ASSERTBACK_NOT:
 1011       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
 1012       tcode += 1 + LINK_SIZE;
 1013       break;
 1014 
 1015       /* BRAZERO does the bracket, but carries on. */
 1016 
 1017       case OP_BRAZERO:
 1018       case OP_BRAMINZERO:
 1019       case OP_BRAPOSZERO:
 1020       rc = set_start_bits(++tcode, start_bits, utf, cd);
 1021       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
 1022 /* =========================================================================
 1023       See the comment at the head of this function concerning the next line,
 1024       which was an old fudge for the benefit of OS/2.
 1025       dummy = 1;
 1026   ========================================================================= */
 1027       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
 1028       tcode += 1 + LINK_SIZE;
 1029       break;
 1030 
 1031       /* SKIPZERO skips the bracket. */
 1032 
 1033       case OP_SKIPZERO:
 1034       tcode++;
 1035       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
 1036       tcode += 1 + LINK_SIZE;
 1037       break;
 1038 
 1039       /* Single-char * or ? sets the bit and tries the next item */
 1040 
 1041       case OP_STAR:
 1042       case OP_MINSTAR:
 1043       case OP_POSSTAR:
 1044       case OP_QUERY:
 1045       case OP_MINQUERY:
 1046       case OP_POSQUERY:
 1047       tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
 1048       break;
 1049 
 1050       case OP_STARI:
 1051       case OP_MINSTARI:
 1052       case OP_POSSTARI:
 1053       case OP_QUERYI:
 1054       case OP_MINQUERYI:
 1055       case OP_POSQUERYI:
 1056       tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
 1057       break;
 1058 
 1059       /* Single-char upto sets the bit and tries the next */
 1060 
 1061       case OP_UPTO:
 1062       case OP_MINUPTO:
 1063       case OP_POSUPTO:
 1064       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
 1065       break;
 1066 
 1067       case OP_UPTOI:
 1068       case OP_MINUPTOI:
 1069       case OP_POSUPTOI:
 1070       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
 1071       break;
 1072 
 1073       /* At least one single char sets the bit and stops */
 1074 
 1075       case OP_EXACT:
 1076       tcode += IMM2_SIZE;
 1077       /* Fall through */
 1078       case OP_CHAR:
 1079       case OP_PLUS:
 1080       case OP_MINPLUS:
 1081       case OP_POSPLUS:
 1082       (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
 1083       try_next = FALSE;
 1084       break;
 1085 
 1086       case OP_EXACTI:
 1087       tcode += IMM2_SIZE;
 1088       /* Fall through */
 1089       case OP_CHARI:
 1090       case OP_PLUSI:
 1091       case OP_MINPLUSI:
 1092       case OP_POSPLUSI:
 1093       (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
 1094       try_next = FALSE;
 1095       break;
 1096 
 1097       /* Special spacing and line-terminating items. These recognize specific
 1098       lists of characters. The difference between VSPACE and ANYNL is that the
 1099       latter can match the two-character CRLF sequence, but that is not
 1100       relevant for finding the first character, so their code here is
 1101       identical. */
 1102 
 1103       case OP_HSPACE:
 1104       SET_BIT(CHAR_HT);
 1105       SET_BIT(CHAR_SPACE);
 1106 #ifdef SUPPORT_UTF
 1107       if (utf)
 1108         {
 1109 #ifdef COMPILE_PCRE8
 1110         SET_BIT(0xC2);  /* For U+00A0 */
 1111         SET_BIT(0xE1);  /* For U+1680, U+180E */
 1112         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
 1113         SET_BIT(0xE3);  /* For U+3000 */
 1114 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
 1115         SET_BIT(0xA0);
 1116         SET_BIT(0xFF);  /* For characters > 255 */
 1117 #endif  /* COMPILE_PCRE[8|16|32] */
 1118         }
 1119       else
 1120 #endif /* SUPPORT_UTF */
 1121         {
 1122 #ifndef EBCDIC
 1123         SET_BIT(0xA0);
 1124 #endif  /* Not EBCDIC */
 1125 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
 1126         SET_BIT(0xFF);  /* For characters > 255 */
 1127 #endif  /* COMPILE_PCRE[16|32] */
 1128         }
 1129       try_next = FALSE;
 1130       break;
 1131 
 1132       case OP_ANYNL:
 1133       case OP_VSPACE:
 1134       SET_BIT(CHAR_LF);
 1135       SET_BIT(CHAR_VT);
 1136       SET_BIT(CHAR_FF);
 1137       SET_BIT(CHAR_CR);
 1138 #ifdef SUPPORT_UTF
 1139       if (utf)
 1140         {
 1141 #ifdef COMPILE_PCRE8
 1142         SET_BIT(0xC2);  /* For U+0085 */
 1143         SET_BIT(0xE2);  /* For U+2028, U+2029 */
 1144 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
 1145         SET_BIT(CHAR_NEL);
 1146         SET_BIT(0xFF);  /* For characters > 255 */
 1147 #endif  /* COMPILE_PCRE[8|16|32] */
 1148         }
 1149       else
 1150 #endif /* SUPPORT_UTF */
 1151         {
 1152         SET_BIT(CHAR_NEL);
 1153 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
 1154         SET_BIT(0xFF);  /* For characters > 255 */
 1155 #endif
 1156         }
 1157       try_next = FALSE;
 1158       break;
 1159 
 1160       /* Single character types set the bits and stop. Note that if PCRE_UCP
 1161       is set, we do not see these op codes because \d etc are converted to
 1162       properties. Therefore, these apply in the case when only characters less
 1163       than 256 are recognized to match the types. */
 1164 
 1165       case OP_NOT_DIGIT:
 1166       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
 1167       try_next = FALSE;
 1168       break;
 1169 
 1170       case OP_DIGIT:
 1171       set_type_bits(start_bits, cbit_digit, table_limit, cd);
 1172       try_next = FALSE;
 1173       break;
 1174 
 1175       /* The cbit_space table has vertical tab as whitespace; we no longer
 1176       have to play fancy tricks because Perl added VT to its whitespace at
 1177       release 5.18. PCRE added it at release 8.34. */
 1178 
 1179       case OP_NOT_WHITESPACE:
 1180       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
 1181       try_next = FALSE;
 1182       break;
 1183 
 1184       case OP_WHITESPACE:
 1185       set_type_bits(start_bits, cbit_space, table_limit, cd);
 1186       try_next = FALSE;
 1187       break;
 1188 
 1189       case OP_NOT_WORDCHAR:
 1190       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
 1191       try_next = FALSE;
 1192       break;
 1193 
 1194       case OP_WORDCHAR:
 1195       set_type_bits(start_bits, cbit_word, table_limit, cd);
 1196       try_next = FALSE;
 1197       break;
 1198 
 1199       /* One or more character type fudges the pointer and restarts, knowing
 1200       it will hit a single character type and stop there. */
 1201 
 1202       case OP_TYPEPLUS:
 1203       case OP_TYPEMINPLUS:
 1204       case OP_TYPEPOSPLUS:
 1205       tcode++;
 1206       break;
 1207 
 1208       case OP_TYPEEXACT:
 1209       tcode += 1 + IMM2_SIZE;
 1210       break;
 1211 
 1212       /* Zero or more repeats of character types set the bits and then
 1213       try again. */
 1214 
 1215       case OP_TYPEUPTO:
 1216       case OP_TYPEMINUPTO:
 1217       case OP_TYPEPOSUPTO:
 1218       tcode += IMM2_SIZE;  /* Fall through */
 1219 
 1220       case OP_TYPESTAR:
 1221       case OP_TYPEMINSTAR:
 1222       case OP_TYPEPOSSTAR:
 1223       case OP_TYPEQUERY:
 1224       case OP_TYPEMINQUERY:
 1225       case OP_TYPEPOSQUERY:
 1226       switch(tcode[1])
 1227         {
 1228         default:
 1229         case OP_ANY:
 1230         case OP_ALLANY:
 1231         return SSB_FAIL;
 1232 
 1233         case OP_HSPACE:
 1234         SET_BIT(CHAR_HT);
 1235         SET_BIT(CHAR_SPACE);
 1236 #ifdef SUPPORT_UTF
 1237         if (utf)
 1238           {
 1239 #ifdef COMPILE_PCRE8
 1240           SET_BIT(0xC2);  /* For U+00A0 */
 1241           SET_BIT(0xE1);  /* For U+1680, U+180E */
 1242           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
 1243           SET_BIT(0xE3);  /* For U+3000 */
 1244 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
 1245           SET_BIT(0xA0);
 1246           SET_BIT(0xFF);  /* For characters > 255 */
 1247 #endif  /* COMPILE_PCRE[8|16|32] */
 1248           }
 1249         else
 1250 #endif /* SUPPORT_UTF */
 1251 #ifndef EBCDIC
 1252           SET_BIT(0xA0);
 1253 #endif  /* Not EBCDIC */
 1254         break;
 1255 
 1256         case OP_ANYNL:
 1257         case OP_VSPACE:
 1258         SET_BIT(CHAR_LF);
 1259         SET_BIT(CHAR_VT);
 1260         SET_BIT(CHAR_FF);
 1261         SET_BIT(CHAR_CR);
 1262 #ifdef SUPPORT_UTF
 1263         if (utf)
 1264           {
 1265 #ifdef COMPILE_PCRE8
 1266           SET_BIT(0xC2);  /* For U+0085 */
 1267           SET_BIT(0xE2);  /* For U+2028, U+2029 */
 1268 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
 1269           SET_BIT(CHAR_NEL);
 1270           SET_BIT(0xFF);  /* For characters > 255 */
 1271 #endif  /* COMPILE_PCRE16 */
 1272           }
 1273         else
 1274 #endif /* SUPPORT_UTF */
 1275           SET_BIT(CHAR_NEL);
 1276         break;
 1277 
 1278         case OP_NOT_DIGIT:
 1279         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
 1280         break;
 1281 
 1282         case OP_DIGIT:
 1283         set_type_bits(start_bits, cbit_digit, table_limit, cd);
 1284         break;
 1285 
 1286         /* The cbit_space table has vertical tab as whitespace; we no longer
 1287         have to play fancy tricks because Perl added VT to its whitespace at
 1288         release 5.18. PCRE added it at release 8.34. */
 1289 
 1290         case OP_NOT_WHITESPACE:
 1291         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
 1292         break;
 1293 
 1294         case OP_WHITESPACE:
 1295         set_type_bits(start_bits, cbit_space, table_limit, cd);
 1296         break;
 1297 
 1298         case OP_NOT_WORDCHAR:
 1299         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
 1300         break;
 1301 
 1302         case OP_WORDCHAR:
 1303         set_type_bits(start_bits, cbit_word, table_limit, cd);
 1304         break;
 1305         }
 1306 
 1307       tcode += 2;
 1308       break;
 1309 
 1310       /* Character class where all the information is in a bit map: set the
 1311       bits and either carry on or not, according to the repeat count. If it was
 1312       a negative class, and we are operating with UTF-8 characters, any byte
 1313       with a value >= 0xc4 is a potentially valid starter because it starts a
 1314       character with a value > 255. */
 1315 
 1316 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 1317       case OP_XCLASS:
 1318       if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
 1319         return SSB_FAIL;
 1320       /* All bits are set. */
 1321       if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
 1322         return SSB_FAIL;
 1323 #endif
 1324       /* Fall through */
 1325 
 1326       case OP_NCLASS:
 1327 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 1328       if (utf)
 1329         {
 1330         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
 1331         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
 1332         }
 1333 #endif
 1334 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
 1335       SET_BIT(0xFF);                         /* For characters > 255 */
 1336 #endif
 1337       /* Fall through */
 1338 
 1339       case OP_CLASS:
 1340         {
 1341         pcre_uint8 *map;
 1342 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 1343         map = NULL;
 1344         if (*tcode == OP_XCLASS)
 1345           {
 1346           if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
 1347             map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
 1348           tcode += GET(tcode, 1);
 1349           }
 1350         else
 1351 #endif
 1352           {
 1353           tcode++;
 1354           map = (pcre_uint8 *)tcode;
 1355           tcode += 32 / sizeof(pcre_uchar);
 1356           }
 1357 
 1358         /* In UTF-8 mode, the bits in a bit map correspond to character
 1359         values, not to byte values. However, the bit map we are constructing is
 1360         for byte values. So we have to do a conversion for characters whose
 1361         value is > 127. In fact, there are only two possible starting bytes for
 1362         characters in the range 128 - 255. */
 1363 
 1364 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 1365         if (map != NULL)
 1366 #endif
 1367           {
 1368 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 1369           if (utf)
 1370             {
 1371             for (c = 0; c < 16; c++) start_bits[c] |= map[c];
 1372             for (c = 128; c < 256; c++)
 1373               {
 1374               if ((map[c/8] & (1 << (c&7))) != 0)
 1375                 {
 1376                 int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
 1377                 start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
 1378                 c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
 1379                 }
 1380               }
 1381             }
 1382           else
 1383 #endif
 1384             {
 1385             /* In non-UTF-8 mode, the two bit maps are completely compatible. */
 1386             for (c = 0; c < 32; c++) start_bits[c] |= map[c];
 1387             }
 1388           }
 1389 
 1390         /* Advance past the bit map, and act on what follows. For a zero
 1391         minimum repeat, continue; otherwise stop processing. */
 1392 
 1393         switch (*tcode)
 1394           {
 1395           case OP_CRSTAR:
 1396           case OP_CRMINSTAR:
 1397           case OP_CRQUERY:
 1398           case OP_CRMINQUERY:
 1399           case OP_CRPOSSTAR:
 1400           case OP_CRPOSQUERY:
 1401           tcode++;
 1402           break;
 1403 
 1404           case OP_CRRANGE:
 1405           case OP_CRMINRANGE:
 1406           case OP_CRPOSRANGE:
 1407           if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
 1408             else try_next = FALSE;
 1409           break;
 1410 
 1411           default:
 1412           try_next = FALSE;
 1413           break;
 1414           }
 1415         }
 1416       break; /* End of bitmap class handling */
 1417 
 1418       }      /* End of switch */
 1419     }        /* End of try_next loop */
 1420 
 1421   code += GET(code, 1);   /* Advance to next branch */
 1422   }
 1423 while (*code == OP_ALT);
 1424 return yield;
 1425 }
 1426 
 1427 
 1428 
 1429 
 1430 
 1431 /*************************************************
 1432 *          Study a compiled expression           *
 1433 *************************************************/
 1434 
 1435 /* This function is handed a compiled expression that it must study to produce
 1436 information that will speed up the matching. It returns a pcre[16]_extra block
 1437 which then gets handed back to pcre_exec().
 1438 
 1439 Arguments:
 1440   re        points to the compiled expression
 1441   options   contains option bits
 1442   errorptr  points to where to place error messages;
 1443             set NULL unless error
 1444 
 1445 Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
 1446               the appropriate flags set;
 1447             NULL on error or if no optimization possible
 1448 */
 1449 
 1450 #if defined COMPILE_PCRE8
 1451 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
 1452 pcre_study(const pcre *external_re, int options, const char **errorptr)
 1453 #elif defined COMPILE_PCRE16
 1454 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
 1455 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
 1456 #elif defined COMPILE_PCRE32
 1457 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
 1458 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
 1459 #endif
 1460 {
 1461 int min;
 1462 int count = 0;
 1463 BOOL bits_set = FALSE;
 1464 pcre_uint8 start_bits[32];
 1465 PUBL(extra) *extra = NULL;
 1466 pcre_study_data *study;
 1467 const pcre_uint8 *tables;
 1468 pcre_uchar *code;
 1469 compile_data compile_block;
 1470 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
 1471 
 1472 
 1473 *errorptr = NULL;
 1474 
 1475 if (re == NULL || re->magic_number != MAGIC_NUMBER)
 1476   {
 1477   *errorptr = "argument is not a compiled regular expression";
 1478   return NULL;
 1479   }
 1480 
 1481 if ((re->flags & PCRE_MODE) == 0)
 1482   {
 1483 #if defined COMPILE_PCRE8
 1484   *errorptr = "argument not compiled in 8 bit mode";
 1485 #elif defined COMPILE_PCRE16
 1486   *errorptr = "argument not compiled in 16 bit mode";
 1487 #elif defined COMPILE_PCRE32
 1488   *errorptr = "argument not compiled in 32 bit mode";
 1489 #endif
 1490   return NULL;
 1491   }
 1492 
 1493 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
 1494   {
 1495   *errorptr = "unknown or incorrect option bit(s) set";
 1496   return NULL;
 1497   }
 1498 
 1499 code = (pcre_uchar *)re + re->name_table_offset +
 1500   (re->name_count * re->name_entry_size);
 1501 
 1502 /* For an anchored pattern, or an unanchored pattern that has a first char, or
 1503 a multiline pattern that matches only at "line starts", there is no point in
 1504 seeking a list of starting bytes. */
 1505 
 1506 if ((re->options & PCRE_ANCHORED) == 0 &&
 1507     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
 1508   {
 1509   int rc;
 1510 
 1511   /* Set the character tables in the block that is passed around */
 1512 
 1513   tables = re->tables;
 1514 
 1515 #if defined COMPILE_PCRE8
 1516   if (tables == NULL)
 1517     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
 1518     (void *)(&tables));
 1519 #elif defined COMPILE_PCRE16
 1520   if (tables == NULL)
 1521     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
 1522     (void *)(&tables));
 1523 #elif defined COMPILE_PCRE32
 1524   if (tables == NULL)
 1525     (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
 1526     (void *)(&tables));
 1527 #endif
 1528 
 1529   compile_block.lcc = tables + lcc_offset;
 1530   compile_block.fcc = tables + fcc_offset;
 1531   compile_block.cbits = tables + cbits_offset;
 1532   compile_block.ctypes = tables + ctypes_offset;
 1533 
 1534   /* See if we can find a fixed set of initial characters for the pattern. */
 1535 
 1536   memset(start_bits, 0, 32 * sizeof(pcre_uint8));
 1537   rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
 1538     &compile_block);
 1539   bits_set = rc == SSB_DONE;
 1540   if (rc == SSB_UNKNOWN)
 1541     {
 1542     *errorptr = "internal error: opcode not recognized";
 1543     return NULL;
 1544     }
 1545   }
 1546 
 1547 /* Find the minimum length of subject string. */
 1548 
 1549 switch(min = find_minlength(re, code, code, re->options, NULL, &count))
 1550   {
 1551   case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
 1552   case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
 1553   default: break;
 1554   }
 1555 
 1556 /* If a set of starting bytes has been identified, or if the minimum length is
 1557 greater than zero, or if JIT optimization has been requested, or if
 1558 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
 1559 pcre_study_data block. The study data is put in the latter, which is pointed to
 1560 by the former, which may also get additional data set later by the calling
 1561 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
 1562 save it in a field for returning via the pcre_fullinfo() function so that if it
 1563 becomes variable in the future, we don't have to change that code. */
 1564 
 1565 if (bits_set || min > 0 || (options & (
 1566 #ifdef SUPPORT_JIT
 1567     PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
 1568     PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
 1569 #endif
 1570     PCRE_STUDY_EXTRA_NEEDED)) != 0)
 1571   {
 1572   extra = (PUBL(extra) *)(PUBL(malloc))
 1573     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
 1574   if (extra == NULL)
 1575     {
 1576     *errorptr = "failed to get memory";
 1577     return NULL;
 1578     }
 1579 
 1580   study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
 1581   extra->flags = PCRE_EXTRA_STUDY_DATA;
 1582   extra->study_data = study;
 1583 
 1584   study->size = sizeof(pcre_study_data);
 1585   study->flags = 0;
 1586 
 1587   /* Set the start bits always, to avoid unset memory errors if the
 1588   study data is written to a file, but set the flag only if any of the bits
 1589   are set, to save time looking when none are. */
 1590 
 1591   if (bits_set)
 1592     {
 1593     study->flags |= PCRE_STUDY_MAPPED;
 1594     memcpy(study->start_bits, start_bits, sizeof(start_bits));
 1595     }
 1596   else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
 1597 
 1598 #ifdef PCRE_DEBUG
 1599   if (bits_set)
 1600     {
 1601     pcre_uint8 *ptr = start_bits;
 1602     int i;
 1603 
 1604     printf("Start bits:\n");
 1605     for (i = 0; i < 32; i++)
 1606       printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
 1607     }
 1608 #endif
 1609 
 1610   /* Always set the minlength value in the block, because the JIT compiler
 1611   makes use of it. However, don't set the bit unless the length is greater than
 1612   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
 1613   checking the zero case. */
 1614 
 1615   if (min > 0)
 1616     {
 1617     study->flags |= PCRE_STUDY_MINLEN;
 1618     study->minlength = min;
 1619     }
 1620   else study->minlength = 0;
 1621 
 1622   /* If JIT support was compiled and requested, attempt the JIT compilation.
 1623   If no starting bytes were found, and the minimum length is zero, and JIT
 1624   compilation fails, abandon the extra block and return NULL, unless
 1625   PCRE_STUDY_EXTRA_NEEDED is set. */
 1626 
 1627 #ifdef SUPPORT_JIT
 1628   extra->executable_jit = NULL;
 1629   if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
 1630     PRIV(jit_compile)(re, extra, JIT_COMPILE);
 1631   if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
 1632     PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
 1633   if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
 1634     PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
 1635 
 1636   if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
 1637       (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
 1638     {
 1639 #if defined COMPILE_PCRE8
 1640     pcre_free_study(extra);
 1641 #elif defined COMPILE_PCRE16
 1642     pcre16_free_study(extra);
 1643 #elif defined COMPILE_PCRE32
 1644     pcre32_free_study(extra);
 1645 #endif
 1646     extra = NULL;
 1647     }
 1648 #endif
 1649   }
 1650 
 1651 return extra;
 1652 }
 1653 
 1654 
 1655 /*************************************************
 1656 *          Free the study data                   *
 1657 *************************************************/
 1658 
 1659 /* This function frees the memory that was obtained by pcre_study().
 1660 
 1661 Argument:   a pointer to the pcre[16]_extra block
 1662 Returns:    nothing
 1663 */
 1664 
 1665 #if defined COMPILE_PCRE8
 1666 PCRE_EXP_DEFN void
 1667 pcre_free_study(pcre_extra *extra)
 1668 #elif defined COMPILE_PCRE16
 1669 PCRE_EXP_DEFN void
 1670 pcre16_free_study(pcre16_extra *extra)
 1671 #elif defined COMPILE_PCRE32
 1672 PCRE_EXP_DEFN void
 1673 pcre32_free_study(pcre32_extra *extra)
 1674 #endif
 1675 {
 1676 if (extra == NULL)
 1677   return;
 1678 #ifdef SUPPORT_JIT
 1679 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
 1680      extra->executable_jit != NULL)
 1681   PRIV(jit_free)(extra->executable_jit);
 1682 #endif
 1683 PUBL(free)(extra);
 1684 }
 1685 
 1686 /* End of pcre_study.c */