"Fossies" - the Fresh Open Source Software Archive

Member "AutoHotkey_L-1.1.33.09/source/lib_pcre/pcre/pcre_study.c" (8 May 2021, 42467 Bytes) of package /windows/misc/AutoHotkey_L-1.1.33.09.zip:


As a special service, "Fossies" has tried to format the requested source page as HTML, using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "pcre_study.c", see the Fossies "Dox" file reference documentation.

    1 /*************************************************
    2 *      Perl-Compatible Regular Expressions       *
    3 *************************************************/
    4 
    5 /* PCRE is a library of functions to support regular expressions whose syntax
    6 and semantics are as close as possible to those of the Perl 5 language.
    7 
    8                        Written by Philip Hazel
    9            Copyright (c) 1997-2012 University of Cambridge
   10 
   11 -----------------------------------------------------------------------------
   12 Redistribution and use in source and binary forms, with or without
   13 modification, are permitted provided that the following conditions are met:
   14 
   15     * Redistributions of source code must retain the above copyright notice,
   16       this list of conditions and the following disclaimer.
   17 
   18     * Redistributions in binary form must reproduce the above copyright
   19       notice, this list of conditions and the following disclaimer in the
   20       documentation and/or other materials provided with the distribution.
   21 
   22     * Neither the name of the University of Cambridge nor the names of its
   23       contributors may be used to endorse or promote products derived from
   24       this software without specific prior written permission.
   25 
   26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   36 POSSIBILITY OF SUCH DAMAGE.
   37 -----------------------------------------------------------------------------
   38 */
   39 
   40 
   41 /* This module contains the external function pcre_study(), along with local
   42 supporting functions. */
   43 
   44 
   45 #ifdef HAVE_CONFIG_H
   46 #include "config.h"
   47 #endif
   48 
   49 #include "pcre_internal.h"
   50 
   51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
   52 
   53 /* Returns from set_start_bits() */
   54 
   55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
   56 
   57 
   58 
   59 /*************************************************
   60 *   Find the minimum subject length for a group  *
   61 *************************************************/
   62 
   63 /* Scan a parenthesized group and compute the minimum length of subject that
   64 is needed to match it. This is a lower bound; it does not mean there is a
   65 string of that length that matches. In UTF8 mode, the result is in characters
   66 rather than bytes.
   67 
   68 Arguments:
   69   code            pointer to start of group (the bracket)
   70   startcode       pointer to start of the whole pattern
   71   options         the compiling options
   72   int             RECURSE depth
   73 
   74 Returns:   the minimum length
   75            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
   76            -2 internal error (missing capturing bracket)
   77            -3 internal error (opcode not listed)
   78 */
   79 
   80 static int
   81 find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
   82   int recurse_depth)
   83 {
   84 int length = -1;
   85 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
   86 BOOL utf = UTF_ENABLED((options & PCRE_UTF8) != 0);
   87 BOOL had_recurse = FALSE;
   88 register int branchlength = 0;
   89 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
   90 
   91 if (*code == OP_CBRA || *code == OP_SCBRA ||
   92     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
   93 
   94 /* Scan along the opcodes for this branch. If we get to the end of the
   95 branch, check the length against that of the other branches. */
   96 
   97 for (;;)
   98   {
   99   int d, min;
  100   pcre_uchar *cs, *ce;
  101   register int op = *cc;
  102 
  103   switch (op)
  104     {
  105     case OP_COND:
  106     case OP_SCOND:
  107 
  108     /* If there is only one branch in a condition, the implied branch has zero
  109     length, so we don't add anything. This covers the DEFINE "condition"
  110     automatically. */
  111 
  112     cs = cc + GET(cc, 1);
  113     if (*cs != OP_ALT)
  114       {
  115       cc = cs + 1 + LINK_SIZE;
  116       break;
  117       }
  118 
  119     /* Otherwise we can fall through and treat it the same as any other
  120     subpattern. */
  121 
  122     case OP_CBRA:
  123     case OP_SCBRA:
  124     case OP_BRA:
  125     case OP_SBRA:
  126     case OP_CBRAPOS:
  127     case OP_SCBRAPOS:
  128     case OP_BRAPOS:
  129     case OP_SBRAPOS:
  130     case OP_ONCE:
  131     case OP_ONCE_NC:
  132     d = find_minlength(cc, startcode, options, recurse_depth);
  133     if (d < 0) return d;
  134     branchlength += d;
  135     do cc += GET(cc, 1); while (*cc == OP_ALT);
  136     cc += 1 + LINK_SIZE;
  137     break;
  138 
  139     /* ACCEPT makes things far too complicated; we have to give up. */
  140 
  141     case OP_ACCEPT:
  142     case OP_ASSERT_ACCEPT:
  143     return -1;
  144 
  145     /* Reached end of a branch; if it's a ket it is the end of a nested
  146     call. If it's ALT it is an alternation in a nested call. If it is END it's
  147     the end of the outer call. All can be handled by the same code. If an
  148     ACCEPT was previously encountered, use the length that was in force at that
  149     time, and pass back the shortest ACCEPT length. */
  150 
  151     case OP_ALT:
  152     case OP_KET:
  153     case OP_KETRMAX:
  154     case OP_KETRMIN:
  155     case OP_KETRPOS:
  156     case OP_END:
  157     if (length < 0 || (!had_recurse && branchlength < length))
  158       length = branchlength;
  159     if (op != OP_ALT) return length;
  160     cc += 1 + LINK_SIZE;
  161     branchlength = 0;
  162     had_recurse = FALSE;
  163     break;
  164 
  165     /* Skip over assertive subpatterns */
  166 
  167     case OP_ASSERT:
  168     case OP_ASSERT_NOT:
  169     case OP_ASSERTBACK:
  170     case OP_ASSERTBACK_NOT:
  171     do cc += GET(cc, 1); while (*cc == OP_ALT);
  172     /* Fall through */
  173 
  174     /* Skip over things that don't match chars */
  175 
  176     case OP_REVERSE:
  177     case OP_CREF:
  178     case OP_NCREF:
  179     case OP_RREF:
  180     case OP_NRREF:
  181     case OP_DEF:
  182     case OP_CALLOUT:
  183     case OP_SOD:
  184     case OP_SOM:
  185     case OP_EOD:
  186     case OP_EODN:
  187     case OP_CIRC:
  188     case OP_CIRCM:
  189     case OP_DOLL:
  190     case OP_DOLLM:
  191     case OP_NOT_WORD_BOUNDARY:
  192     case OP_WORD_BOUNDARY:
  193     cc += PRIV(OP_lengths)[*cc];
  194     break;
  195 
  196     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
  197 
  198     case OP_BRAZERO:
  199     case OP_BRAMINZERO:
  200     case OP_BRAPOSZERO:
  201     case OP_SKIPZERO:
  202     cc += PRIV(OP_lengths)[*cc];
  203     do cc += GET(cc, 1); while (*cc == OP_ALT);
  204     cc += 1 + LINK_SIZE;
  205     break;
  206 
  207     /* Handle literal characters and + repetitions */
  208 
  209     case OP_CHAR:
  210     case OP_CHARI:
  211     case OP_NOT:
  212     case OP_NOTI:
  213     case OP_PLUS:
  214     case OP_PLUSI:
  215     case OP_MINPLUS:
  216     case OP_MINPLUSI:
  217     case OP_POSPLUS:
  218     case OP_POSPLUSI:
  219     case OP_NOTPLUS:
  220     case OP_NOTPLUSI:
  221     case OP_NOTMINPLUS:
  222     case OP_NOTMINPLUSI:
  223     case OP_NOTPOSPLUS:
  224     case OP_NOTPOSPLUSI:
  225     branchlength++;
  226     cc += 2;
  227 #ifdef SUPPORT_UTF
  228     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
  229 #endif
  230     break;
  231 
  232     case OP_TYPEPLUS:
  233     case OP_TYPEMINPLUS:
  234     case OP_TYPEPOSPLUS:
  235     branchlength++;
  236     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
  237     break;
  238 
  239     /* Handle exact repetitions. The count is already in characters, but we
  240     need to skip over a multibyte character in UTF8 mode.  */
  241 
  242     case OP_EXACT:
  243     case OP_EXACTI:
  244     case OP_NOTEXACT:
  245     case OP_NOTEXACTI:
  246     branchlength += GET2(cc,1);
  247     cc += 2 + IMM2_SIZE;
  248 #ifdef SUPPORT_UTF
  249     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
  250 #endif
  251     break;
  252 
  253     case OP_TYPEEXACT:
  254     branchlength += GET2(cc,1);
  255     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
  256       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
  257     break;
  258 
  259     /* Handle single-char non-literal matchers */
  260 
  261     case OP_PROP:
  262     case OP_NOTPROP:
  263     cc += 2;
  264     /* Fall through */
  265 
  266     case OP_NOT_DIGIT:
  267     case OP_DIGIT:
  268     case OP_NOT_WHITESPACE:
  269     case OP_WHITESPACE:
  270     case OP_NOT_WORDCHAR:
  271     case OP_WORDCHAR:
  272     case OP_ANY:
  273     case OP_ALLANY:
  274     case OP_EXTUNI:
  275     case OP_HSPACE:
  276     case OP_NOT_HSPACE:
  277     case OP_VSPACE:
  278     case OP_NOT_VSPACE:
  279     branchlength++;
  280     cc++;
  281     break;
  282 
  283     /* "Any newline" might match two characters, but it also might match just
  284     one. */
  285 
  286     case OP_ANYNL:
  287     branchlength += 1;
  288     cc++;
  289     break;
  290 
  291     /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
  292     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
  293     appear, but leave the code, just in case.) */
  294 
  295     case OP_ANYBYTE:
  296 #ifdef SUPPORT_UTF
  297     if (utf) return -1;
  298 #endif
  299     branchlength++;
  300     cc++;
  301     break;
  302 
  303     /* For repeated character types, we have to test for \p and \P, which have
  304     an extra two bytes of parameters. */
  305 
  306     case OP_TYPESTAR:
  307     case OP_TYPEMINSTAR:
  308     case OP_TYPEQUERY:
  309     case OP_TYPEMINQUERY:
  310     case OP_TYPEPOSSTAR:
  311     case OP_TYPEPOSQUERY:
  312     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
  313     cc += PRIV(OP_lengths)[op];
  314     break;
  315 
  316     case OP_TYPEUPTO:
  317     case OP_TYPEMINUPTO:
  318     case OP_TYPEPOSUPTO:
  319     if (cc[1 + IMM2_SIZE] == OP_PROP
  320       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
  321     cc += PRIV(OP_lengths)[op];
  322     break;
  323 
  324     /* Check a class for variable quantification */
  325 
  326 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
  327     case OP_XCLASS:
  328     cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
  329     /* Fall through */
  330 #endif
  331 
  332     case OP_CLASS:
  333     case OP_NCLASS:
  334     cc += PRIV(OP_lengths)[OP_CLASS];
  335 
  336     switch (*cc)
  337       {
  338       case OP_CRPLUS:
  339       case OP_CRMINPLUS:
  340       branchlength++;
  341       /* Fall through */
  342 
  343       case OP_CRSTAR:
  344       case OP_CRMINSTAR:
  345       case OP_CRQUERY:
  346       case OP_CRMINQUERY:
  347       cc++;
  348       break;
  349 
  350       case OP_CRRANGE:
  351       case OP_CRMINRANGE:
  352       branchlength += GET2(cc,1);
  353       cc += 1 + 2 * IMM2_SIZE;
  354       break;
  355 
  356       default:
  357       branchlength++;
  358       break;
  359       }
  360     break;
  361 
  362     /* Backreferences and subroutine calls are treated in the same way: we find
  363     the minimum length for the subpattern. A recursion, however, causes an
  364     a flag to be set that causes the length of this branch to be ignored. The
  365     logic is that a recursion can only make sense if there is another
  366     alternation that stops the recursing. That will provide the minimum length
  367     (when no recursion happens). A backreference within the group that it is
  368     referencing behaves in the same way.
  369 
  370     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
  371     matches an empty string (by default it causes a matching failure), so in
  372     that case we must set the minimum length to zero. */
  373 
  374     case OP_REF:
  375     case OP_REFI:
  376     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
  377       {
  378       ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
  379       if (cs == NULL) return -2;
  380       do ce += GET(ce, 1); while (*ce == OP_ALT);
  381       if (cc > cs && cc < ce)
  382         {
  383         d = 0;
  384         had_recurse = TRUE;
  385         }
  386       else
  387         {
  388         d = find_minlength(cs, startcode, options, recurse_depth);
  389         }
  390       }
  391     else d = 0;
  392     cc += 1 + IMM2_SIZE;
  393 
  394     /* Handle repeated back references */
  395 
  396     switch (*cc)
  397       {
  398       case OP_CRSTAR:
  399       case OP_CRMINSTAR:
  400       case OP_CRQUERY:
  401       case OP_CRMINQUERY:
  402       min = 0;
  403       cc++;
  404       break;
  405 
  406       case OP_CRPLUS:
  407       case OP_CRMINPLUS:
  408       min = 1;
  409       cc++;
  410       break;
  411 
  412       case OP_CRRANGE:
  413       case OP_CRMINRANGE:
  414       min = GET2(cc, 1);
  415       cc += 1 + 2 * IMM2_SIZE;
  416       break;
  417 
  418       default:
  419       min = 1;
  420       break;
  421       }
  422 
  423     branchlength += min * d;
  424     break;
  425 
  426     /* We can easily detect direct recursion, but not mutual recursion. This is
  427     caught by a recursion depth count. */
  428 
  429     case OP_RECURSE:
  430     cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
  431     do ce += GET(ce, 1); while (*ce == OP_ALT);
  432     if ((cc > cs && cc < ce) || recurse_depth > 10)
  433       had_recurse = TRUE;
  434     else
  435       {
  436       branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
  437       }
  438     cc += 1 + LINK_SIZE;
  439     break;
  440 
  441     /* Anything else does not or need not match a character. We can get the
  442     item's length from the table, but for those that can match zero occurrences
  443     of a character, we must take special action for UTF-8 characters. As it
  444     happens, the "NOT" versions of these opcodes are used at present only for
  445     ASCII characters, so they could be omitted from this list. However, in
  446     future that may change, so we include them here so as not to leave a
  447     gotcha for a future maintainer. */
  448 
  449     case OP_UPTO:
  450     case OP_UPTOI:
  451     case OP_NOTUPTO:
  452     case OP_NOTUPTOI:
  453     case OP_MINUPTO:
  454     case OP_MINUPTOI:
  455     case OP_NOTMINUPTO:
  456     case OP_NOTMINUPTOI:
  457     case OP_POSUPTO:
  458     case OP_POSUPTOI:
  459     case OP_NOTPOSUPTO:
  460     case OP_NOTPOSUPTOI:
  461 
  462     case OP_STAR:
  463     case OP_STARI:
  464     case OP_NOTSTAR:
  465     case OP_NOTSTARI:
  466     case OP_MINSTAR:
  467     case OP_MINSTARI:
  468     case OP_NOTMINSTAR:
  469     case OP_NOTMINSTARI:
  470     case OP_POSSTAR:
  471     case OP_POSSTARI:
  472     case OP_NOTPOSSTAR:
  473     case OP_NOTPOSSTARI:
  474 
  475     case OP_QUERY:
  476     case OP_QUERYI:
  477     case OP_NOTQUERY:
  478     case OP_NOTQUERYI:
  479     case OP_MINQUERY:
  480     case OP_MINQUERYI:
  481     case OP_NOTMINQUERY:
  482     case OP_NOTMINQUERYI:
  483     case OP_POSQUERY:
  484     case OP_POSQUERYI:
  485     case OP_NOTPOSQUERY:
  486     case OP_NOTPOSQUERYI:
  487 
  488     cc += PRIV(OP_lengths)[op];
  489 #ifdef SUPPORT_UTF
  490     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
  491 #endif
  492     break;
  493 
  494     /* Skip these, but we need to add in the name length. */
  495 
  496     case OP_MARK:
  497     case OP_PRUNE_ARG:
  498     case OP_SKIP_ARG:
  499     case OP_THEN_ARG:
  500     cc += PRIV(OP_lengths)[op] + cc[1];
  501     break;
  502 
  503     /* The remaining opcodes are just skipped over. */
  504 
  505     case OP_CLOSE:
  506     case OP_COMMIT:
  507     case OP_FAIL:
  508     case OP_PRUNE:
  509     case OP_SET_SOM:
  510     case OP_SKIP:
  511     case OP_THEN:
  512     cc += PRIV(OP_lengths)[op];
  513     break;
  514 
  515     /* This should not occur: we list all opcodes explicitly so that when
  516     new ones get added they are properly considered. */
  517 
  518     default:
  519     return -3;
  520     }
  521   }
  522 /* Control never gets here */
  523 }
  524 
  525 
  526 
  527 /*************************************************
  528 *      Set a bit and maybe its alternate case    *
  529 *************************************************/
  530 
  531 /* Given a character, set its first byte's bit in the table, and also the
  532 corresponding bit for the other version of a letter if we are caseless. In
  533 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
  534 when Unicode property support is available.
  535 
  536 Arguments:
  537   start_bits    points to the bit map
  538   p             points to the character
  539   caseless      the caseless flag
  540   cd            the block with char table pointers
  541   utf           TRUE for UTF-8 / UTF-16 mode
  542 
  543 Returns:        pointer after the character
  544 */
  545 
  546 static const pcre_uchar *
  547 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
  548   compile_data *cd, BOOL utf)
  549 {
  550 unsigned int c = *p;
  551 
  552 #ifdef COMPILE_PCRE8
  553 SET_BIT(c);
  554 
  555 #ifdef SUPPORT_UTF
  556 if (utf && c > 127)
  557   {
  558   GETCHARINC(c, p);
  559 #ifdef SUPPORT_UCP
  560   if (caseless)
  561     {
  562     pcre_uchar buff[6];
  563     c = UCD_OTHERCASE(c);
  564     (void)PRIV(ord2utf)(c, buff);
  565     SET_BIT(buff[0]);
  566     }
  567 #endif
  568   return p;
  569   }
  570 #endif
  571 
  572 /* Not UTF-8 mode, or character is less than 127. */
  573 
  574 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
  575 return p + 1;
  576 #endif
  577 
  578 #ifdef COMPILE_PCRE16
  579 if (c > 0xff)
  580   {
  581   c = 0xff;
  582   caseless = FALSE;
  583   }
  584 SET_BIT(c);
  585 
  586 #ifdef SUPPORT_UTF
  587 if (utf && c > 127)
  588   {
  589   GETCHARINC(c, p);
  590 #ifdef SUPPORT_UCP
  591   if (caseless)
  592     {
  593     c = UCD_OTHERCASE(c);
  594     if (c > 0xff)
  595       c = 0xff;
  596     SET_BIT(c);
  597     }
  598 #endif
  599   return p;
  600   }
  601 #endif
  602 
  603 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
  604 return p + 1;
  605 #endif
  606 }
  607 
  608 
  609 
  610 /*************************************************
  611 *     Set bits for a positive character type     *
  612 *************************************************/
  613 
  614 /* This function sets starting bits for a character type. In UTF-8 mode, we can
  615 only do a direct setting for bytes less than 128, as otherwise there can be
  616 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
  617 environment, the tables will only recognize ASCII characters anyway, but in at
  618 least one Windows environment, some higher bytes bits were set in the tables.
  619 So we deal with that case by considering the UTF-8 encoding.
  620 
  621 Arguments:
  622   start_bits     the starting bitmap
  623   cbit type      the type of character wanted
  624   table_limit    32 for non-UTF-8; 16 for UTF-8
  625   cd             the block with char table pointers
  626 
  627 Returns:         nothing
  628 */
  629 
  630 static void
  631 set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
  632   compile_data *cd)
  633 {
  634 register int c;
  635 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
  636 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
  637 if (table_limit == 32) return;
  638 for (c = 128; c < 256; c++)
  639   {
  640   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
  641     {
  642     pcre_uchar buff[6];
  643     (void)PRIV(ord2utf)(c, buff);
  644     SET_BIT(buff[0]);
  645     }
  646   }
  647 #endif
  648 }
  649 
  650 
  651 /*************************************************
  652 *     Set bits for a negative character type     *
  653 *************************************************/
  654 
  655 /* This function sets starting bits for a negative character type such as \D.
  656 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
  657 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
  658 Unlike in the positive case, where we can set appropriate starting bits for
  659 specific high-valued UTF-8 characters, in this case we have to set the bits for
  660 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
  661 0xc0 (192) for simplicity.
  662 
  663 Arguments:
  664   start_bits     the starting bitmap
  665   cbit type      the type of character wanted
  666   table_limit    32 for non-UTF-8; 16 for UTF-8
  667   cd             the block with char table pointers
  668 
  669 Returns:         nothing
  670 */
  671 
  672 static void
  673 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
  674   compile_data *cd)
  675 {
  676 register int c;
  677 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
  678 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
  679 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
  680 #endif
  681 }
  682 
  683 
  684 
  685 /*************************************************
  686 *          Create bitmap of starting bytes       *
  687 *************************************************/
  688 
  689 /* This function scans a compiled unanchored expression recursively and
  690 attempts to build a bitmap of the set of possible starting bytes. As time goes
  691 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
  692 useful for parenthesized groups in patterns such as (a*)b where the group
  693 provides some optional starting bytes but scanning must continue at the outer
  694 level to find at least one mandatory byte. At the outermost level, this
  695 function fails unless the result is SSB_DONE.
  696 
  697 Arguments:
  698   code         points to an expression
  699   start_bits   points to a 32-byte table, initialized to 0
  700   utf          TRUE if in UTF-8 / UTF-16 mode
  701   cd           the block with char table pointers
  702 
  703 Returns:       SSB_FAIL     => Failed to find any starting bytes
  704                SSB_DONE     => Found mandatory starting bytes
  705                SSB_CONTINUE => Found optional starting bytes
  706                SSB_UNKNOWN  => Hit an unrecognized opcode
  707 */
  708 
  709 static int
  710 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
  711   compile_data *cd)
  712 {
  713 register int c;
  714 int yield = SSB_DONE;
  715 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
  716 int table_limit = utf? 16:32;
  717 #else
  718 int table_limit = 32;
  719 #endif
  720 
  721 #if 0
  722 /* ========================================================================= */
  723 /* The following comment and code was inserted in January 1999. In May 2006,
  724 when it was observed to cause compiler warnings about unused values, I took it
  725 out again. If anybody is still using OS/2, they will have to put it back
  726 manually. */
  727 
  728 /* This next statement and the later reference to dummy are here in order to
  729 trick the optimizer of the IBM C compiler for OS/2 into generating correct
  730 code. Apparently IBM isn't going to fix the problem, and we would rather not
  731 disable optimization (in this module it actually makes a big difference, and
  732 the pcre module can use all the optimization it can get). */
  733 
  734 volatile int dummy;
  735 /* ========================================================================= */
  736 #endif
  737 
  738 do
  739   {
  740   BOOL try_next = TRUE;
  741   const pcre_uchar *tcode = code + 1 + LINK_SIZE;
  742 
  743   if (*code == OP_CBRA || *code == OP_SCBRA ||
  744       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
  745 
  746   while (try_next)    /* Loop for items in this branch */
  747     {
  748     int rc;
  749 
  750     switch(*tcode)
  751       {
  752       /* If we reach something we don't understand, it means a new opcode has
  753       been created that hasn't been added to this code. Hopefully this problem
  754       will be discovered during testing. */
  755 
  756       default:
  757       return SSB_UNKNOWN;
  758 
  759       /* Fail for a valid opcode that implies no starting bits. */
  760 
  761       case OP_ACCEPT:
  762       case OP_ASSERT_ACCEPT:
  763       case OP_ALLANY:
  764       case OP_ANY:
  765       case OP_ANYBYTE:
  766       case OP_CIRC:
  767       case OP_CIRCM:
  768       case OP_CLOSE:
  769       case OP_COMMIT:
  770       case OP_COND:
  771       case OP_CREF:
  772       case OP_DEF:
  773       case OP_DOLL:
  774       case OP_DOLLM:
  775       case OP_END:
  776       case OP_EOD:
  777       case OP_EODN:
  778       case OP_EXTUNI:
  779       case OP_FAIL:
  780       case OP_MARK:
  781       case OP_NCREF:
  782       case OP_NOT:
  783       case OP_NOTEXACT:
  784       case OP_NOTEXACTI:
  785       case OP_NOTI:
  786       case OP_NOTMINPLUS:
  787       case OP_NOTMINPLUSI:
  788       case OP_NOTMINQUERY:
  789       case OP_NOTMINQUERYI:
  790       case OP_NOTMINSTAR:
  791       case OP_NOTMINSTARI:
  792       case OP_NOTMINUPTO:
  793       case OP_NOTMINUPTOI:
  794       case OP_NOTPLUS:
  795       case OP_NOTPLUSI:
  796       case OP_NOTPOSPLUS:
  797       case OP_NOTPOSPLUSI:
  798       case OP_NOTPOSQUERY:
  799       case OP_NOTPOSQUERYI:
  800       case OP_NOTPOSSTAR:
  801       case OP_NOTPOSSTARI:
  802       case OP_NOTPOSUPTO:
  803       case OP_NOTPOSUPTOI:
  804       case OP_NOTPROP:
  805       case OP_NOTQUERY:
  806       case OP_NOTQUERYI:
  807       case OP_NOTSTAR:
  808       case OP_NOTSTARI:
  809       case OP_NOTUPTO:
  810       case OP_NOTUPTOI:
  811       case OP_NOT_HSPACE:
  812       case OP_NOT_VSPACE:
  813       case OP_NRREF:
  814       case OP_PROP:
  815       case OP_PRUNE:
  816       case OP_PRUNE_ARG:
  817       case OP_RECURSE:
  818       case OP_REF:
  819       case OP_REFI:
  820       case OP_REVERSE:
  821       case OP_RREF:
  822       case OP_SCOND:
  823       case OP_SET_SOM:
  824       case OP_SKIP:
  825       case OP_SKIP_ARG:
  826       case OP_SOD:
  827       case OP_SOM:
  828       case OP_THEN:
  829       case OP_THEN_ARG:
  830 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
  831       case OP_XCLASS:
  832 #endif
  833       return SSB_FAIL;
  834 
  835       /* We can ignore word boundary tests. */
  836 
  837       case OP_WORD_BOUNDARY:
  838       case OP_NOT_WORD_BOUNDARY:
  839       tcode++;
  840       break;
  841 
  842       /* If we hit a bracket or a positive lookahead assertion, recurse to set
  843       bits from within the subpattern. If it can't find anything, we have to
  844       give up. If it finds some mandatory character(s), we are done for this
  845       branch. Otherwise, carry on scanning after the subpattern. */
  846 
  847       case OP_BRA:
  848       case OP_SBRA:
  849       case OP_CBRA:
  850       case OP_SCBRA:
  851       case OP_BRAPOS:
  852       case OP_SBRAPOS:
  853       case OP_CBRAPOS:
  854       case OP_SCBRAPOS:
  855       case OP_ONCE:
  856       case OP_ONCE_NC:
  857       case OP_ASSERT:
  858       rc = set_start_bits(tcode, start_bits, utf, cd);
  859       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
  860       if (rc == SSB_DONE) try_next = FALSE; else
  861         {
  862         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
  863         tcode += 1 + LINK_SIZE;
  864         }
  865       break;
  866 
  867       /* If we hit ALT or KET, it means we haven't found anything mandatory in
  868       this branch, though we might have found something optional. For ALT, we
  869       continue with the next alternative, but we have to arrange that the final
  870       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
  871       return SSB_CONTINUE: if this is the top level, that indicates failure,
  872       but after a nested subpattern, it causes scanning to continue. */
  873 
  874       case OP_ALT:
  875       yield = SSB_CONTINUE;
  876       try_next = FALSE;
  877       break;
  878 
  879       case OP_KET:
  880       case OP_KETRMAX:
  881       case OP_KETRMIN:
  882       case OP_KETRPOS:
  883       return SSB_CONTINUE;
  884 
  885       /* Skip over callout */
  886 
  887       case OP_CALLOUT:
  888       tcode += 2 + IMMPTR_SIZE + 2*LINK_SIZE;
  889       break;
  890 
  891       /* Skip over lookbehind and negative lookahead assertions */
  892 
  893       case OP_ASSERT_NOT:
  894       case OP_ASSERTBACK:
  895       case OP_ASSERTBACK_NOT:
  896       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
  897       tcode += 1 + LINK_SIZE;
  898       break;
  899 
  900       /* BRAZERO does the bracket, but carries on. */
  901 
  902       case OP_BRAZERO:
  903       case OP_BRAMINZERO:
  904       case OP_BRAPOSZERO:
  905       rc = set_start_bits(++tcode, start_bits, utf, cd);
  906       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
  907 /* =========================================================================
  908       See the comment at the head of this function concerning the next line,
  909       which was an old fudge for the benefit of OS/2.
  910       dummy = 1;
  911   ========================================================================= */
  912       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
  913       tcode += 1 + LINK_SIZE;
  914       break;
  915 
  916       /* SKIPZERO skips the bracket. */
  917 
  918       case OP_SKIPZERO:
  919       tcode++;
  920       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
  921       tcode += 1 + LINK_SIZE;
  922       break;
  923 
  924       /* Single-char * or ? sets the bit and tries the next item */
  925 
  926       case OP_STAR:
  927       case OP_MINSTAR:
  928       case OP_POSSTAR:
  929       case OP_QUERY:
  930       case OP_MINQUERY:
  931       case OP_POSQUERY:
  932       tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
  933       break;
  934 
  935       case OP_STARI:
  936       case OP_MINSTARI:
  937       case OP_POSSTARI:
  938       case OP_QUERYI:
  939       case OP_MINQUERYI:
  940       case OP_POSQUERYI:
  941       tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
  942       break;
  943 
  944       /* Single-char upto sets the bit and tries the next */
  945 
  946       case OP_UPTO:
  947       case OP_MINUPTO:
  948       case OP_POSUPTO:
  949       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
  950       break;
  951 
  952       case OP_UPTOI:
  953       case OP_MINUPTOI:
  954       case OP_POSUPTOI:
  955       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
  956       break;
  957 
  958       /* At least one single char sets the bit and stops */
  959 
  960       case OP_EXACT:
  961       tcode += IMM2_SIZE;
  962       /* Fall through */
  963       case OP_CHAR:
  964       case OP_PLUS:
  965       case OP_MINPLUS:
  966       case OP_POSPLUS:
  967       (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
  968       try_next = FALSE;
  969       break;
  970 
  971       case OP_EXACTI:
  972       tcode += IMM2_SIZE;
  973       /* Fall through */
  974       case OP_CHARI:
  975       case OP_PLUSI:
  976       case OP_MINPLUSI:
  977       case OP_POSPLUSI:
  978       (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
  979       try_next = FALSE;
  980       break;
  981 
  982       /* Special spacing and line-terminating items. These recognize specific
  983       lists of characters. The difference between VSPACE and ANYNL is that the
  984       latter can match the two-character CRLF sequence, but that is not
  985       relevant for finding the first character, so their code here is
  986       identical. */
  987 
  988       case OP_HSPACE:
  989       SET_BIT(0x09);
  990       SET_BIT(0x20);
  991 #ifdef SUPPORT_UTF
  992       if (utf)
  993         {
  994 #ifdef COMPILE_PCRE8
  995         SET_BIT(0xC2);  /* For U+00A0 */
  996         SET_BIT(0xE1);  /* For U+1680, U+180E */
  997         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
  998         SET_BIT(0xE3);  /* For U+3000 */
  999 #endif
 1000 #ifdef COMPILE_PCRE16
 1001         SET_BIT(0xA0);
 1002         SET_BIT(0xFF);  /* For characters > 255 */
 1003 #endif
 1004         }
 1005       else
 1006 #endif /* SUPPORT_UTF */
 1007         {
 1008         SET_BIT(0xA0);
 1009 #ifdef COMPILE_PCRE16
 1010         SET_BIT(0xFF);  /* For characters > 255 */
 1011 #endif
 1012         }
 1013       try_next = FALSE;
 1014       break;
 1015 
 1016       case OP_ANYNL:
 1017       case OP_VSPACE:
 1018       SET_BIT(0x0A);
 1019       SET_BIT(0x0B);
 1020       SET_BIT(0x0C);
 1021       SET_BIT(0x0D);
 1022 #ifdef SUPPORT_UTF
 1023       if (utf)
 1024         {
 1025 #ifdef COMPILE_PCRE8
 1026         SET_BIT(0xC2);  /* For U+0085 */
 1027         SET_BIT(0xE2);  /* For U+2028, U+2029 */
 1028 #endif
 1029 #ifdef COMPILE_PCRE16
 1030         SET_BIT(0x85);
 1031         SET_BIT(0xFF);  /* For characters > 255 */
 1032 #endif
 1033         }
 1034       else
 1035 #endif /* SUPPORT_UTF */
 1036         {
 1037         SET_BIT(0x85);
 1038 #ifdef COMPILE_PCRE16
 1039         SET_BIT(0xFF);  /* For characters > 255 */
 1040 #endif
 1041         }
 1042       try_next = FALSE;
 1043       break;
 1044 
 1045       /* Single character types set the bits and stop. Note that if PCRE_UCP
 1046       is set, we do not see these op codes because \d etc are converted to
 1047       properties. Therefore, these apply in the case when only characters less
 1048       than 256 are recognized to match the types. */
 1049 
 1050       case OP_NOT_DIGIT:
 1051       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
 1052       try_next = FALSE;
 1053       break;
 1054 
 1055       case OP_DIGIT:
 1056       set_type_bits(start_bits, cbit_digit, table_limit, cd);
 1057       try_next = FALSE;
 1058       break;
 1059 
 1060       /* The cbit_space table has vertical tab as whitespace; we have to
 1061       ensure it is set as not whitespace. */
 1062 
 1063       case OP_NOT_WHITESPACE:
 1064       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
 1065       start_bits[1] |= 0x08;
 1066       try_next = FALSE;
 1067       break;
 1068 
 1069       /* The cbit_space table has vertical tab as whitespace; we have to
 1070       not set it from the table. */
 1071 
 1072       case OP_WHITESPACE:
 1073       c = start_bits[1];    /* Save in case it was already set */
 1074       set_type_bits(start_bits, cbit_space, table_limit, cd);
 1075       start_bits[1] = (start_bits[1] & ~0x08) | c;
 1076       try_next = FALSE;
 1077       break;
 1078 
 1079       case OP_NOT_WORDCHAR:
 1080       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
 1081       try_next = FALSE;
 1082       break;
 1083 
 1084       case OP_WORDCHAR:
 1085       set_type_bits(start_bits, cbit_word, table_limit, cd);
 1086       try_next = FALSE;
 1087       break;
 1088 
 1089       /* One or more character type fudges the pointer and restarts, knowing
 1090       it will hit a single character type and stop there. */
 1091 
 1092       case OP_TYPEPLUS:
 1093       case OP_TYPEMINPLUS:
 1094       case OP_TYPEPOSPLUS:
 1095       tcode++;
 1096       break;
 1097 
 1098       case OP_TYPEEXACT:
 1099       tcode += 1 + IMM2_SIZE;
 1100       break;
 1101 
 1102       /* Zero or more repeats of character types set the bits and then
 1103       try again. */
 1104 
 1105       case OP_TYPEUPTO:
 1106       case OP_TYPEMINUPTO:
 1107       case OP_TYPEPOSUPTO:
 1108       tcode += IMM2_SIZE;  /* Fall through */
 1109 
 1110       case OP_TYPESTAR:
 1111       case OP_TYPEMINSTAR:
 1112       case OP_TYPEPOSSTAR:
 1113       case OP_TYPEQUERY:
 1114       case OP_TYPEMINQUERY:
 1115       case OP_TYPEPOSQUERY:
 1116       switch(tcode[1])
 1117         {
 1118         default:
 1119         case OP_ANY:
 1120         case OP_ALLANY:
 1121         return SSB_FAIL;
 1122 
 1123         case OP_HSPACE:
 1124         SET_BIT(0x09);
 1125         SET_BIT(0x20);
 1126 #ifdef COMPILE_PCRE8
 1127         if (utf)
 1128           {
 1129 #ifdef COMPILE_PCRE8
 1130           SET_BIT(0xC2);  /* For U+00A0 */
 1131           SET_BIT(0xE1);  /* For U+1680, U+180E */
 1132           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
 1133           SET_BIT(0xE3);  /* For U+3000 */
 1134 #endif
 1135 #ifdef COMPILE_PCRE16
 1136           SET_BIT(0xA0);
 1137           SET_BIT(0xFF);  /* For characters > 255 */
 1138 #endif
 1139           }
 1140         else
 1141 #endif /* SUPPORT_UTF */
 1142           SET_BIT(0xA0);
 1143         break;
 1144 
 1145         case OP_ANYNL:
 1146         case OP_VSPACE:
 1147         SET_BIT(0x0A);
 1148         SET_BIT(0x0B);
 1149         SET_BIT(0x0C);
 1150         SET_BIT(0x0D);
 1151 #ifdef COMPILE_PCRE8
 1152         if (utf)
 1153           {
 1154 #ifdef COMPILE_PCRE8
 1155           SET_BIT(0xC2);  /* For U+0085 */
 1156           SET_BIT(0xE2);  /* For U+2028, U+2029 */
 1157 #endif
 1158 #ifdef COMPILE_PCRE16
 1159           SET_BIT(0x85);
 1160           SET_BIT(0xFF);  /* For characters > 255 */
 1161 #endif
 1162           }
 1163         else
 1164 #endif /* SUPPORT_UTF */
 1165           SET_BIT(0x85);
 1166         break;
 1167 
 1168         case OP_NOT_DIGIT:
 1169         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
 1170         break;
 1171 
 1172         case OP_DIGIT:
 1173         set_type_bits(start_bits, cbit_digit, table_limit, cd);
 1174         break;
 1175 
 1176         /* The cbit_space table has vertical tab as whitespace; we have to
 1177         ensure it gets set as not whitespace. */
 1178 
 1179         case OP_NOT_WHITESPACE:
 1180         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
 1181         start_bits[1] |= 0x08;
 1182         break;
 1183 
 1184         /* The cbit_space table has vertical tab as whitespace; we have to
 1185         avoid setting it. */
 1186 
 1187         case OP_WHITESPACE:
 1188         c = start_bits[1];    /* Save in case it was already set */
 1189         set_type_bits(start_bits, cbit_space, table_limit, cd);
 1190         start_bits[1] = (start_bits[1] & ~0x08) | c;
 1191         break;
 1192 
 1193         case OP_NOT_WORDCHAR:
 1194         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
 1195         break;
 1196 
 1197         case OP_WORDCHAR:
 1198         set_type_bits(start_bits, cbit_word, table_limit, cd);
 1199         break;
 1200         }
 1201 
 1202       tcode += 2;
 1203       break;
 1204 
 1205       /* Character class where all the information is in a bit map: set the
 1206       bits and either carry on or not, according to the repeat count. If it was
 1207       a negative class, and we are operating with UTF-8 characters, any byte
 1208       with a value >= 0xc4 is a potentially valid starter because it starts a
 1209       character with a value > 255. */
 1210 
 1211       case OP_NCLASS:
 1212 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 1213       if (utf)
 1214         {
 1215         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
 1216         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
 1217         }
 1218 #endif
 1219 #ifdef COMPILE_PCRE16
 1220       SET_BIT(0xFF);                         /* For characters > 255 */
 1221 #endif
 1222       /* Fall through */
 1223 
 1224       case OP_CLASS:
 1225         {
 1226         pcre_uint8 *map;
 1227         tcode++;
 1228         map = (pcre_uint8 *)tcode;
 1229 
 1230         /* In UTF-8 mode, the bits in a bit map correspond to character
 1231         values, not to byte values. However, the bit map we are constructing is
 1232         for byte values. So we have to do a conversion for characters whose
 1233         value is > 127. In fact, there are only two possible starting bytes for
 1234         characters in the range 128 - 255. */
 1235 
 1236 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 1237         if (utf)
 1238           {
 1239           for (c = 0; c < 16; c++) start_bits[c] |= map[c];
 1240           for (c = 128; c < 256; c++)
 1241             {
 1242             if ((map[c/8] && (1 << (c&7))) != 0)
 1243               {
 1244               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
 1245               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
 1246               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
 1247               }
 1248             }
 1249           }
 1250         else
 1251 #endif
 1252           {
 1253           /* In non-UTF-8 mode, the two bit maps are completely compatible. */
 1254           for (c = 0; c < 32; c++) start_bits[c] |= map[c];
 1255           }
 1256 
 1257         /* Advance past the bit map, and act on what follows. For a zero
 1258         minimum repeat, continue; otherwise stop processing. */
 1259 
 1260         tcode += 32 / sizeof(pcre_uchar);
 1261         switch (*tcode)
 1262           {
 1263           case OP_CRSTAR:
 1264           case OP_CRMINSTAR:
 1265           case OP_CRQUERY:
 1266           case OP_CRMINQUERY:
 1267           tcode++;
 1268           break;
 1269 
 1270           case OP_CRRANGE:
 1271           case OP_CRMINRANGE:
 1272           if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
 1273             else try_next = FALSE;
 1274           break;
 1275 
 1276           default:
 1277           try_next = FALSE;
 1278           break;
 1279           }
 1280         }
 1281       break; /* End of bitmap class handling */
 1282 
 1283       }      /* End of switch */
 1284     }        /* End of try_next loop */
 1285 
 1286   code += GET(code, 1);   /* Advance to next branch */
 1287   }
 1288 while (*code == OP_ALT);
 1289 return yield;
 1290 }
 1291 
 1292 
 1293 
 1294 
 1295 
 1296 /*************************************************
 1297 *          Study a compiled expression           *
 1298 *************************************************/
 1299 
 1300 /* This function is handed a compiled expression that it must study to produce
 1301 information that will speed up the matching. It returns a pcre[16]_extra block
 1302 which then gets handed back to pcre_exec().
 1303 
 1304 Arguments:
 1305   re        points to the compiled expression
 1306   options   contains option bits
 1307   errorptr  points to where to place error messages;
 1308             set NULL unless error
 1309 
 1310 Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
 1311               the appropriate flags set;
 1312             NULL on error or if no optimization possible
 1313 */
 1314 
 1315 #ifdef COMPILE_PCRE8
 1316 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
 1317 pcre_study(const pcre *external_re, int options, const char **errorptr)
 1318 #else
 1319 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
 1320 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
 1321 #endif
 1322 {
 1323 int min;
 1324 BOOL bits_set = FALSE;
 1325 pcre_uint8 start_bits[32];
 1326 PUBL(extra) *extra = NULL;
 1327 pcre_study_data *study;
 1328 const pcre_uint8 *tables;
 1329 pcre_uchar *code;
 1330 compile_data compile_block;
 1331 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
 1332 
 1333 *errorptr = NULL;
 1334 
 1335 if (re == NULL || re->magic_number != MAGIC_NUMBER)
 1336   {
 1337   *errorptr = "argument is not a compiled regular expression";
 1338   return NULL;
 1339   }
 1340 
 1341 if ((re->flags & PCRE_MODE) == 0)
 1342   {
 1343 #ifdef COMPILE_PCRE8
 1344   *errorptr = "argument is compiled in 16 bit mode";
 1345 #else
 1346   *errorptr = "argument is compiled in 8 bit mode";
 1347 #endif
 1348   return NULL;
 1349   }
 1350 
 1351 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
 1352   {
 1353   *errorptr = "unknown or incorrect option bit(s) set";
 1354   return NULL;
 1355   }
 1356 
 1357 code = (pcre_uchar *)re + re->name_table_offset +
 1358   (re->name_count * re->name_entry_size);
 1359 
 1360 /* For an anchored pattern, or an unanchored pattern that has a first char, or
 1361 a multiline pattern that matches only at "line starts", there is no point in
 1362 seeking a list of starting bytes. */
 1363 
 1364 if ((re->options & PCRE_ANCHORED) == 0 &&
 1365     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
 1366   {
 1367   int rc;
 1368 
 1369   /* Set the character tables in the block that is passed around */
 1370 
 1371   tables = re->tables;
 1372 
 1373 #ifdef COMPILE_PCRE8
 1374   if (tables == NULL)
 1375     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
 1376     (void *)(&tables));
 1377 #else
 1378   if (tables == NULL)
 1379     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
 1380     (void *)(&tables));
 1381 #endif
 1382 
 1383   compile_block.lcc = tables + lcc_offset;
 1384   compile_block.fcc = tables + fcc_offset;
 1385   compile_block.cbits = tables + cbits_offset;
 1386   compile_block.ctypes = tables + ctypes_offset;
 1387 
 1388   /* See if we can find a fixed set of initial characters for the pattern. */
 1389 
 1390   memset(start_bits, 0, 32 * sizeof(pcre_uint8));
 1391   rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
 1392     &compile_block);
 1393   bits_set = rc == SSB_DONE;
 1394   if (rc == SSB_UNKNOWN)
 1395     {
 1396     *errorptr = "internal error: opcode not recognized";
 1397     return NULL;
 1398     }
 1399   }
 1400 
 1401 /* Find the minimum length of subject string. */
 1402 
 1403 switch(min = find_minlength(code, code, re->options, 0))
 1404   {
 1405   case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
 1406   case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
 1407   default: break;
 1408   }
 1409 
 1410 /* If a set of starting bytes has been identified, or if the minimum length is
 1411 greater than zero, or if JIT optimization has been requested, get a
 1412 pcre[16]_extra block and a pcre_study_data block. The study data is put in the
 1413 latter, which is pointed to by the former, which may also get additional data
 1414 set later by the calling program. At the moment, the size of pcre_study_data
 1415 is fixed. We nevertheless save it in a field for returning via the
 1416 pcre_fullinfo() function so that if it becomes variable in the future,
 1417 we don't have to change that code. */
 1418 
 1419 if (bits_set || min > 0
 1420 #ifdef SUPPORT_JIT
 1421     || (options & PCRE_STUDY_JIT_COMPILE) != 0
 1422 #endif
 1423   )
 1424   {
 1425   extra = (PUBL(extra) *)(PUBL(malloc))
 1426     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
 1427   if (extra == NULL)
 1428     {
 1429     *errorptr = "failed to get memory";
 1430     return NULL;
 1431     }
 1432 
 1433   study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
 1434   extra->flags = PCRE_EXTRA_STUDY_DATA;
 1435   extra->study_data = study;
 1436 
 1437   study->size = sizeof(pcre_study_data);
 1438   study->flags = 0;
 1439 
 1440   /* Set the start bits always, to avoid unset memory errors if the
 1441   study data is written to a file, but set the flag only if any of the bits
 1442   are set, to save time looking when none are. */
 1443 
 1444   if (bits_set)
 1445     {
 1446     study->flags |= PCRE_STUDY_MAPPED;
 1447     memcpy(study->start_bits, start_bits, sizeof(start_bits));
 1448     }
 1449   else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
 1450 
 1451 #ifdef PCRE_DEBUG
 1452   if (bits_set)
 1453     {
 1454     pcre_uint8 *ptr = start_bits;
 1455     int i;
 1456 
 1457     printf("Start bits:\n");
 1458     for (i = 0; i < 32; i++)
 1459       printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
 1460     }
 1461 #endif
 1462 
 1463   /* Always set the minlength value in the block, because the JIT compiler
 1464   makes use of it. However, don't set the bit unless the length is greater than
 1465   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
 1466   checking the zero case. */
 1467 
 1468   if (min > 0)
 1469     {
 1470     study->flags |= PCRE_STUDY_MINLEN;
 1471     study->minlength = min;
 1472     }
 1473   else study->minlength = 0;
 1474 
 1475   /* If JIT support was compiled and requested, attempt the JIT compilation.
 1476   If no starting bytes were found, and the minimum length is zero, and JIT
 1477   compilation fails, abandon the extra block and return NULL. */
 1478 
 1479 #ifdef SUPPORT_JIT
 1480   extra->executable_jit = NULL;
 1481   if ((options & PCRE_STUDY_JIT_COMPILE) != 0) PRIV(jit_compile)(re, extra);
 1482   if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
 1483     {
 1484 #ifdef COMPILE_PCRE8
 1485     pcre_free_study(extra);
 1486 #endif
 1487 #ifdef COMPILE_PCRE16
 1488     pcre16_free_study(extra);
 1489 #endif
 1490     extra = NULL;
 1491     }
 1492 #endif
 1493   }
 1494 
 1495 return extra;
 1496 }
 1497 
 1498 
 1499 /*************************************************
 1500 *          Free the study data                   *
 1501 *************************************************/
 1502 
 1503 /* This function frees the memory that was obtained by pcre_study().
 1504 
 1505 Argument:   a pointer to the pcre[16]_extra block
 1506 Returns:    nothing
 1507 */
 1508 
 1509 #ifdef COMPILE_PCRE8
 1510 PCRE_EXP_DEFN void
 1511 pcre_free_study(pcre_extra *extra)
 1512 #else
 1513 PCRE_EXP_DEFN void
 1514 pcre16_free_study(pcre16_extra *extra)
 1515 #endif
 1516 {
 1517 if (extra == NULL)
 1518   return;
 1519 #ifdef SUPPORT_JIT
 1520 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
 1521      extra->executable_jit != NULL)
 1522   PRIV(jit_free)(extra->executable_jit);
 1523 #endif
 1524 PUBL(free)(extra);
 1525 }
 1526 
 1527 /* End of pcre_study.c */