"Fossies" - the Fresh Open Source Software Archive

Member "pcre-8.42/pcre_compile.c" (12 Dec 2017, 322188 Bytes) of package /linux/misc/pcre-8.42.tar.bz2:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "pcre_compile.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 8.41_vs_8.42.

    1 /*************************************************
    2 *      Perl-Compatible Regular Expressions       *
    3 *************************************************/
    4 
    5 /* PCRE is a library of functions to support regular expressions whose syntax
    6 and semantics are as close as possible to those of the Perl 5 language.
    7 
    8                        Written by Philip Hazel
    9            Copyright (c) 1997-2016 University of Cambridge
   10 
   11 -----------------------------------------------------------------------------
   12 Redistribution and use in source and binary forms, with or without
   13 modification, are permitted provided that the following conditions are met:
   14 
   15     * Redistributions of source code must retain the above copyright notice,
   16       this list of conditions and the following disclaimer.
   17 
   18     * Redistributions in binary form must reproduce the above copyright
   19       notice, this list of conditions and the following disclaimer in the
   20       documentation and/or other materials provided with the distribution.
   21 
   22     * Neither the name of the University of Cambridge nor the names of its
   23       contributors may be used to endorse or promote products derived from
   24       this software without specific prior written permission.
   25 
   26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   36 POSSIBILITY OF SUCH DAMAGE.
   37 -----------------------------------------------------------------------------
   38 */
   39 
   40 
   41 /* This module contains the external function pcre_compile(), along with
   42 supporting internal functions that are not used by other modules. */
   43 
   44 
   45 #ifdef HAVE_CONFIG_H
   46 #include "config.h"
   47 #endif
   48 
   49 #define NLBLOCK cd             /* Block containing newline information */
   50 #define PSSTART start_pattern  /* Field containing pattern start */
   51 #define PSEND   end_pattern    /* Field containing pattern end */
   52 
   53 #include "pcre_internal.h"
   54 
   55 
   56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
   57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
   58 library. We do not need to select pcre16_printint.c specially, because the
   59 COMPILE_PCREx macro will already be appropriately set. */
   60 
   61 #ifdef PCRE_DEBUG
   62 /* pcre_printint.c should not include any headers */
   63 #define PCRE_INCLUDED
   64 #include "pcre_printint.c"
   65 #undef PCRE_INCLUDED
   66 #endif
   67 
   68 
   /* Macro for setting individual bits in class bitmaps. The bitmap "a" is
   indexed in elements of 8 bits: bit number "b" lives in element (b)/8 at bit
   position (b)&7. Both the bitmap argument and the whole expansion are
   parenthesized, so the macro behaves correctly when "a" is an expression (for
   example SETBIT(map + 1, n)) and when it is used inside a larger expression.
   Note that "b" is evaluated twice, so it must not have side effects. */

   #define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
   72 
   73 /* Maximum length value to check against when making sure that the integer that
   74 holds the compiled pattern length does not overflow. We make it a bit less than
   75 INT_MAX (20 less) to allow for adding in group terminating bytes, so that we
   76 don't have to check them every time. */
   77 
   78 #define OFLOW_MAX (INT_MAX - 20)
   79 
   80 /* Forward declarations to allow mutual recursion. These are prototypes only
      (the parameters are unnamed here); they let the two static functions be
      referenced before the point at which they are defined. */
   81 
   82 static int
   83   add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
   84     const pcre_uint32 *, unsigned int);
   85 
   86 static BOOL
   87   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
   88     pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
   89     compile_data *, int *);
   90 
   91 
   92 
   93 /*************************************************
   94 *      Code parameters and static tables         *
   95 *************************************************/
   96 
   97 /* This value specifies the size of stack workspace that is used during the
   98 first pre-compile phase that determines how much memory is required. The regex
   99 is partly compiled into this space, but the compiled parts are discarded as
  100 soon as they can be, so that hopefully there will never be an overrun. The code
  101 does, however, check for an overrun. The largest amount I've seen used is 218,
  102 so this number is very generous.
  103 
  104 The same workspace is used during the second, actual compile phase for
  105 remembering forward references to groups so that they can be filled in at the
  106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
  107 is 4 there is plenty of room for most patterns. However, the memory can get
  108 filled up by repetitions of forward references, for example patterns like
  109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
  110 that the workspace is expanded using malloc() in this situation. The value
  111 below is therefore a minimum, and we put a maximum on it for safety. The
  112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
  113 kicks in at the same number of forward references in all cases. */
  114 
  115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)             /* The minimum */
  116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)  /* The maximum: 100 times the minimum */
  117 
  118 /* This value determines the size of the initial vector that is used for
  119 remembering named groups during the pre-compile. It is allocated on the stack,
  120 but if it is too small, it is expanded using malloc(), in a similar way to the
  121 workspace. The value is the number of slots in the list. */
  122 
  123 #define NAMED_GROUP_LIST_SIZE  20
  124 
  125 /* The overrun tests check for a slightly smaller size so that they detect the
  126 overrun before it actually does run off the end of the data block. */
  127 
  128 #define WORK_SIZE_SAFETY_MARGIN (100)
  129 
  130 /* Private flags added to firstchar and reqchar. */
  131 
  132 #define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
  133 #define REQ_VARY        (1 << 1)        /* Reqchar followed non-literal item */
  134 /* Negative values for the firstchar and reqchar flags */
  135 #define REQ_UNSET       (-2)
  136 #define REQ_NONE        (-1)
  137 
  138 /* Repeated character flags. NOTE(review): the final character of the
      UTF_LENGTH value below is a lowercase letter 'l' (the suffix that makes the
      constant a long), not the digit 1. */
  139 
  140 #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
  141 
  142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
  143 are simple data values; negative values are for special things like \d and so
  144 on. Zero means further processing is needed (for things like \x), or the escape
  145 is invalid. Entry 0 of the table is for the character '0'. */
  146 
  147 #ifndef EBCDIC
  148 
  149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
  150 in UTF-8 mode. The entries are in ASCII order, two per line; the comment at the
      end of each line shows the characters that the entries on that line are for. */
  151 
  152 static const short int escapes[] = {
  153      0,                       0,                         /* 0 1 */
  154      0,                       0,                         /* 2 3 */
  155      0,                       0,                         /* 4 5 */
  156      0,                       0,                         /* 6 7 */
  157      0,                       0,                         /* 8 9 */
  158      CHAR_COLON,              CHAR_SEMICOLON,            /* : ; */
  159      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,          /* < = */
  160      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,        /* > ? */
  161      CHAR_COMMERCIAL_AT,      -ESC_A,                    /* @ A */
  162      -ESC_B,                  -ESC_C,                    /* B C */
  163      -ESC_D,                  -ESC_E,                    /* D E */
  164      0,                       -ESC_G,                    /* F G */
  165      -ESC_H,                  0,                         /* H I */
  166      0,                       -ESC_K,                    /* J K */
  167      0,                       0,                         /* L M */
  168      -ESC_N,                  0,                         /* N O */
  169      -ESC_P,                  -ESC_Q,                    /* P Q */
  170      -ESC_R,                  -ESC_S,                    /* R S */
  171      0,                       0,                         /* T U */
  172      -ESC_V,                  -ESC_W,                    /* V W */
  173      -ESC_X,                  0,                         /* X Y */
  174      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,  /* Z [ */
  175      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET, /* \ ] */
  176      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,           /* ^ _ */
  177      CHAR_GRAVE_ACCENT,       ESC_a,                     /* ` a */
  178      -ESC_b,                  0,                         /* b c */
  179      -ESC_d,                  ESC_e,                     /* d e */
  180      ESC_f,                   0,                         /* f g */
  181      -ESC_h,                  0,                         /* h i */
  182      0,                       -ESC_k,                    /* j k */
  183      0,                       0,                         /* l m */
  184      ESC_n,                   0,                         /* n o */
  185      -ESC_p,                  0,                         /* p q */
  186      ESC_r,                   -ESC_s,                    /* r s */
  187      ESC_tee,                 0,                         /* t u */
  188      -ESC_v,                  -ESC_w,                    /* v w */
  189      0,                       0,                         /* x y */
  190      -ESC_z                                              /* z */
  191 };
  192 
  193 #else
  194 
  195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. The
      comment at the start of each line is the hexadecimal code of the line's first
      entry; there are eight entries per line. */
  196 
  197 static const short int escapes[] = {
  198 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
  199 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
  200 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
  201 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
  202 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
  203 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
  204 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
  205 /*  80 */     0, ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
  206 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
  207 /*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
  208 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
  209 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
  210 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
  211 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
  212 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
  213 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
  214 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
  215 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
  216 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
  217 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
  218 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
  219 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
  220 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
  221 };
  222 
  223 /* We also need a table of characters that may follow \c in an EBCDIC
  224 environment for characters 0-31. The string holds 32 characters, one for each
      of those values. NOTE(review): the array is not const-qualified although it
      looks like a read-only lookup table - confirm against its users before
      changing that. */
  225 
  226 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
  227 
  228 #endif
  229 
  230 
  231 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
  232 searched linearly. Put all the names into a single string, in order to reduce
  233 the number of relocations when a shared library is dynamically linked. The
  234 string is built from string macros so that it works in UTF-8 mode on EBCDIC
  235 platforms. The entries in verbs[] are in the same order as the names in
      verbnames, and the len field of each entry is the length of its name. */
  236 
  237 typedef struct verbitem {
  238   int   len;                 /* Length of verb name */
  239   int   op;                  /* Op when no arg, or -1 if arg mandatory */
  240   int   op_arg;              /* Op when arg present, or -1 if not allowed */
  241 } verbitem;
  242 
  243 static const char verbnames[] =
  244   "\0"                       /* Empty name is a shorthand for MARK */
  245   STRING_MARK0
  246   STRING_ACCEPT0
  247   STRING_COMMIT0
  248   STRING_F0
  249   STRING_FAIL0
  250   STRING_PRUNE0
  251   STRING_SKIP0
  252   STRING_THEN;
  253 
  254 static const verbitem verbs[] = {
  255   { 0, -1,        OP_MARK },          /* (empty name): argument mandatory */
  256   { 4, -1,        OP_MARK },          /* MARK: argument mandatory */
  257   { 6, OP_ACCEPT, -1 },               /* ACCEPT: no argument allowed */
  258   { 6, OP_COMMIT, -1 },               /* COMMIT: no argument allowed */
  259   { 1, OP_FAIL,   -1 },               /* F: no argument allowed */
  260   { 4, OP_FAIL,   -1 },               /* FAIL: no argument allowed */
  261   { 5, OP_PRUNE,  OP_PRUNE_ARG },     /* PRUNE: argument optional */
  262   { 4, OP_SKIP,   OP_SKIP_ARG  },     /* SKIP: argument optional */
  263   { 4, OP_THEN,   OP_THEN_ARG  }      /* THEN: argument optional */
  264 };
  265 
  266 static const int verbcount = sizeof(verbs)/sizeof(verbitem);  /* Number of entries in verbs[] */
  267 
  268 
  269 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
  270 another regex library. The two arrays below spell out \b(?=\w) and \b(?<=\w)
      respectively, one character per element, each terminated by a zero. */
  271 
  272 static const pcre_uchar sub_start_of_word[] = {
  273   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
  274   CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
  275 
  276 static const pcre_uchar sub_end_of_word[] = {
  277   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
  278   CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
  279   CHAR_RIGHT_PARENTHESIS, '\0' };
  280 
  281 
  282 /* Tables of names of POSIX character classes and their lengths. The names are
  283 now all in a single string, to reduce the number of relocations when a shared
  284 library is dynamically loaded. The list of lengths is terminated by a zero
  285 length entry. The first three must be alpha, lower, upper, as this is assumed
  286 for handling case independence. The indices for graph, print, and punct are
  287 needed, so identify them. The lengths are in the same order as the names. */
  288 
  289 static const char posix_names[] =
  290   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0    /* Indices 0-3 */
  291   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0    /* Indices 4-7 */
  292   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    /* Indices 8-11 */
  293   STRING_word0  STRING_xdigit;                               /* Indices 12-13 */
  294 
  295 static const pcre_uint8 posix_name_lengths[] = {
  296   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };  /* 14 lengths, then the terminating zero */
  297 
  298 #define PC_GRAPH  8    /* Index of graph in the lists above */
  299 #define PC_PRINT  9    /* Index of print in the lists above */
  300 #define PC_PUNCT 10    /* Index of punct in the lists above */
  301 
  302 
  303 /* Table of class bit maps for each POSIX class. Each class is formed from a
  304 base map, with an optional addition or removal of another map. Then, for some
  305 classes, there is some additional tweaking: for [:blank:] the vertical space
  306 characters are removed, and for [:alpha:] and [:alnum:] the underscore
  307 character is removed. The triples in the table consist of the base map offset,
  308 second map offset or -1 if no second map, and a non-negative value for map
  309 addition or a negative value for map subtraction (if there are two maps). The
  310 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
  311 remove vertical space characters, 2 => remove underscore. There is one triple
      per class, in the same order as the class names in posix_names. */
  312 
  313 static const int posix_class_maps[] = {
  314   cbit_word,  cbit_digit, -2,             /* alpha - word minus digit, without underscore */
  315   cbit_lower, -1,          0,             /* lower */
  316   cbit_upper, -1,          0,             /* upper */
  317   cbit_word,  -1,          2,             /* alnum - word without underscore */
  318   cbit_print, cbit_cntrl,  0,             /* ascii - print plus cntrl */
  319   cbit_space, -1,          1,             /* blank - a GNU extension */
  320   cbit_cntrl, -1,          0,             /* cntrl */
  321   cbit_digit, -1,          0,             /* digit */
  322   cbit_graph, -1,          0,             /* graph */
  323   cbit_print, -1,          0,             /* print */
  324   cbit_punct, -1,          0,             /* punct */
  325   cbit_space, -1,          0,             /* space */
  326   cbit_word,  -1,          0,             /* word - a Perl extension */
  327   cbit_xdigit,-1,          0              /* xdigit */
  328 };
  329 
  330 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
  331 Unicode property escapes. Each string below is spelled out one character per
      element and terminated by a zero; the comment after each opening brace shows
      the text that the array contains. */
  332 
  333 #ifdef SUPPORT_UCP
  334 static const pcre_uchar string_PNd[]  = {          /* \P{Nd} */
  335   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  336   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  337 static const pcre_uchar string_pNd[]  = {          /* \p{Nd} */
  338   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  339   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  340 static const pcre_uchar string_PXsp[] = {          /* \P{Xsp} */
  341   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  342   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  343 static const pcre_uchar string_pXsp[] = {          /* \p{Xsp} */
  344   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  345   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  346 static const pcre_uchar string_PXwd[] = {          /* \P{Xwd} */
  347   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  348   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  349 static const pcre_uchar string_pXwd[] = {          /* \p{Xwd} */
  350   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  351   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  352 
  353 static const pcre_uchar *substitutes[] = {
  354   string_PNd,           /* \D */
  355   string_pNd,           /* \d */
  356   string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
  357   string_pXsp,          /* \s */   /* space and POSIX space are the same. */
  358   string_PXwd,          /* \W */
  359   string_pXwd           /* \w */
  360 };
  361 
  362 /* The POSIX class substitutes must be in the order of the POSIX class names,
  363 defined above, and there are both positive and negative cases. NULL means no
  364 general substitute of a Unicode property escape (\p or \P). However, for some
  365 POSIX classes (e.g. graph, print, punct) a special property code is compiled
  366 directly. In the table, the 14 positive cases come first and are followed by
      the 14 negated cases in the same order. */
  367 
  368 static const pcre_uchar string_pL[] =   {          /* \p{L} */
  369   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  370   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  371 static const pcre_uchar string_pLl[] =  {          /* \p{Ll} */
  372   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  373   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  374 static const pcre_uchar string_pLu[] =  {          /* \p{Lu} */
  375   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  376   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  377 static const pcre_uchar string_pXan[] = {          /* \p{Xan} */
  378   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  379   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  380 static const pcre_uchar string_h[] =    {          /* \h */
  381   CHAR_BACKSLASH, CHAR_h, '\0' };
  382 static const pcre_uchar string_pXps[] = {          /* \p{Xps} */
  383   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  384   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  385 static const pcre_uchar string_PL[] =   {          /* \P{L} */
  386   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  387   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  388 static const pcre_uchar string_PLl[] =  {          /* \P{Ll} */
  389   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  390   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  391 static const pcre_uchar string_PLu[] =  {          /* \P{Lu} */
  392   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  393   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  394 static const pcre_uchar string_PXan[] = {          /* \P{Xan} */
  395   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  396   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  397 static const pcre_uchar string_H[] =    {          /* \H */
  398   CHAR_BACKSLASH, CHAR_H, '\0' };
  399 static const pcre_uchar string_PXps[] = {          /* \P{Xps} */
  400   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  401   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  402 
  403 static const pcre_uchar *posix_substitutes[] = {
  404   string_pL,            /* alpha */
  405   string_pLl,           /* lower */
  406   string_pLu,           /* upper */
  407   string_pXan,          /* alnum */
  408   NULL,                 /* ascii */
  409   string_h,             /* blank */
  410   NULL,                 /* cntrl */
  411   string_pNd,           /* digit */
  412   NULL,                 /* graph */
  413   NULL,                 /* print */
  414   NULL,                 /* punct */
  415   string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
  416   string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
  417   NULL,                 /* xdigit */
  418   /* Negated cases */
  419   string_PL,            /* ^alpha */
  420   string_PLl,           /* ^lower */
  421   string_PLu,           /* ^upper */
  422   string_PXan,          /* ^alnum */
  423   NULL,                 /* ^ascii */
  424   string_H,             /* ^blank */
  425   NULL,                 /* ^cntrl */
  426   string_PNd,           /* ^digit */
  427   NULL,                 /* ^graph */
  428   NULL,                 /* ^print */
  429   NULL,                 /* ^punct */
  430   string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
  431   string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
  432   NULL                  /* ^xdigit */
  433 };
  434 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))  /* Number of entries in the table */
  435 #endif
  436 
  437 #define STRING(a)  # a          /* Turns the argument, exactly as written, into a string literal */
  438 #define XSTRING(s) STRING(s)    /* Macro-expands the argument first, then turns the result into a string */
  439 
  440 /* The texts of compile-time error messages. These are "char *" because they
  441 are passed to the outside world. Do not ever re-use any error number, because
  442 they are documented. Always add a new error instead. Messages marked DEAD below
  443 are no longer used. This used to be a table of strings, but in order to reduce
  444 the number of relocations needed when a shared library is loaded dynamically,
  445 it is now one long string. We cannot use a table of offsets, because the
  446 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
  447 simply count through to the one we want - this isn't a performance issue
  448 because these strings are used only when there is a compilation error.
  449 
  450 Each substring ends with \0 to insert a null character. This includes the final
  451 substring, so that the whole string ends with \0\0, which can be detected when
  452 counting through.

      The messages are in error number order, starting at zero, so inserting or
      removing a message in the middle would renumber all those that follow it. The
      numeric comments mark every fifth message. Message 68 has two alternative
      texts, selected by whether or not EBCDIC is defined. */
  453 
  454 static const char error_texts[] =
      /* 0 */
  455   "no error\0"
  456   "\\ at end of pattern\0"
  457   "\\c at end of pattern\0"
  458   "unrecognized character follows \\\0"
  459   "numbers out of order in {} quantifier\0"
  460   /* 5 */
  461   "number too big in {} quantifier\0"
  462   "missing terminating ] for character class\0"
  463   "invalid escape sequence in character class\0"
  464   "range out of order in character class\0"
  465   "nothing to repeat\0"
  466   /* 10 */
  467   "internal error: invalid forward reference offset\0"
  468   "internal error: unexpected repeat\0"
  469   "unrecognized character after (? or (?-\0"
  470   "POSIX named classes are supported only within a class\0"
  471   "missing )\0"
  472   /* 15 */
  473   "reference to non-existent subpattern\0"
  474   "erroffset passed as NULL\0"
  475   "unknown option bit(s) set\0"
  476   "missing ) after comment\0"
  477   "parentheses nested too deeply\0"  /** DEAD **/
  478   /* 20 */
  479   "regular expression is too large\0"
  480   "failed to get memory\0"
  481   "unmatched parentheses\0"
  482   "internal error: code overflow\0"
  483   "unrecognized character after (?<\0"
  484   /* 25 */
  485   "lookbehind assertion is not fixed length\0"
  486   "malformed number or name after (?(\0"
  487   "conditional group contains more than two branches\0"
  488   "assertion expected after (?( or (?(?C)\0"
  489   "(?R or (?[+-]digits must be followed by )\0"
  490   /* 30 */
  491   "unknown POSIX class name\0"
  492   "POSIX collating elements are not supported\0"
  493   "this version of PCRE is compiled without UTF support\0"
  494   "spare error\0"  /** DEAD **/
  495   "character value in \\x{} or \\o{} is too large\0"
  496   /* 35 */
  497   "invalid condition (?(0)\0"
  498   "\\C not allowed in lookbehind assertion\0"
  499   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  500   "number after (?C is > 255\0"
  501   "closing ) for (?C expected\0"
  502   /* 40 */
  503   "recursive call could loop indefinitely\0"
  504   "unrecognized character after (?P\0"
  505   "syntax error in subpattern name (missing terminator)\0"
  506   "two named subpatterns have the same name\0"
  507   "invalid UTF-8 string\0"
  508   /* 45 */
  509   "support for \\P, \\p, and \\X has not been compiled\0"
  510   "malformed \\P or \\p sequence\0"
  511   "unknown property name after \\P or \\p\0"
  512   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  513   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  514   /* 50 */
  515   "repeated subpattern is too long\0"    /** DEAD **/
  516   "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  517   "internal error: overran compiling workspace\0"
  518   "internal error: previously-checked referenced subpattern not found\0"
  519   "DEFINE group contains more than one branch\0"
  520   /* 55 */
  521   "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  522   "inconsistent NEWLINE options\0"
  523   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  524   "a numbered reference must not be zero\0"
  525   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  526   /* 60 */
  527   "(*VERB) not recognized or malformed\0"
  528   "number is too big\0"
  529   "subpattern name expected\0"
  530   "digit expected after (?+\0"
  531   "] is an invalid data character in JavaScript compatibility mode\0"
  532   /* 65 */
  533   "different names for subpatterns of the same number are not allowed\0"
  534   "(*MARK) must have an argument\0"
  535   "this version of PCRE is not compiled with Unicode property support\0"
  536 #ifndef EBCDIC
  537   "\\c must be followed by an ASCII character\0"
  538 #else
  539   "\\c must be followed by a letter or one of [\\]^_?\0"
  540 #endif
  541   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  542   /* 70 */
  543   "internal error: unknown opcode in find_fixedlength()\0"
  544   "\\N is not supported in a class\0"
  545   "too many forward references\0"
  546   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  547   "invalid UTF-16 string\0"
  548   /* 75 */
  549   "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
  550   "character value in \\u.... sequence is too large\0"
  551   "invalid UTF-32 string\0"
  552   "setting UTF is disabled by the application\0"
  553   "non-hex character in \\x{} (closing brace missing?)\0"
  554   /* 80 */
  555   "non-octal character in \\o{} (closing brace missing?)\0"
  556   "missing opening brace after \\o\0"
  557   "parentheses are too deeply nested\0"
  558   "invalid range in character class\0"
  559   "group name must start with a non-digit\0"
  560   /* 85 */
  561   "parentheses are too deeply nested (stack check)\0"
  562   "digits missing in \\x{} or \\o{}\0"
  563   "regular expression is too complicated\0"
  564   ;
  565 
  566 /* Table to identify digits and hex digits. This is used when compiling
  567 patterns. Note that the tables in chartables are dependent on the locale, and
  568 may mark arbitrary characters as digits - but the PCRE compiling code expects
  569 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
  570 a private table here. It costs 256 bytes, but it is a lot faster than doing
  571 character value tests (at least in some simple cases I timed), and in some
  572 applications one wants PCRE to compile efficiently as well as match
  573 efficiently.
  574 
  575 For convenience, we use the same bit definitions as in chartables:
  576 
  577   0x04   decimal digit
  578   0x08   hexadecimal digit
  579 
  580 Then we can use ctype_digit and ctype_xdigit in the code. */
  581 
  582 /* Using a simple comparison for decimal numbers rather than a memory read
  583 is much faster, and the resulting code is simpler (the compiler turns it
  584 into a subtraction and unsigned comparison). Note that the macro evaluates its
      argument twice, so the argument must not have side effects. */
  585 
  586 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
  587 
  588 #ifndef EBCDIC
  589 
  590 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
  591 UTF-8 mode. */
  592 
  593 static const pcre_uint8 digitab[] =
  594   {
  595   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  596   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  597   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  598   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  599   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  600   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  601   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  602   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  603   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  604   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  605   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  606   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  607   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  608   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  609   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  610   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  611   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  612   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  613   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  614   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  615   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  616   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  617   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  618   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  619   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  620   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  621   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  622   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  623   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  624   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  625   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  626   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
  627 
  628 #else
  629 
  630 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
  631 
  632 static const pcre_uint8 digitab[] =
  633   {
  634   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  635   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  636   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  637   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  638   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  639   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  640   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  641   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  642   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  643   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  644   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  645   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  646   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  647   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  648   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  649   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  650   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  651   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  652   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  653   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  654   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  655   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  656   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  657   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  658   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  659   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  660   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  661   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  662   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  663   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  664   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  665   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
  666 
  667 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
  668   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  669   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  670   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  671   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  672   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  673   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  674   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  675   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  676   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  677   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  678   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  679   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  680   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  681   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  682   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  683   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  684   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  685   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  686   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  687   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  688   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  689   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  690   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  691   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  692   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  693   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  694   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  695   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  696   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  697   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  698   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  699   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
  700 #endif
  701 
  702 
  703 /* This table is used to check whether auto-possessification is possible
  704 between adjacent character-type opcodes. The left-hand (repeated) opcode is
  705 used to select the row, and the right-hand opcode is used to select the column.
  706 A value of 1 means that auto-possessification is OK. For example, the second
  707 value in the first row means that \D+\d can be turned into \D++\d.
  708 
  709 The Unicode property types (\P and \p) have to be present to fill out the table
  710 because of what their opcode values are, but the table values should always be
  711 zero because property types are handled separately in the code. The last four
  712 columns apply to items that cannot be repeated, so there is no need to have
  713 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
  714 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
  715 
  716 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
  717 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
  718 
  719 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
  720 /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
  721   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
  722   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
  723   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
  724   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
  725   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
  726   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
  727   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
  728   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
  729   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
  730   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
  731   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
  732   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
  733   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
  734   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
  735   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
  736   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
  737   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
  738 };
  739 
  740 
  741 /* This table is used to check whether auto-possessification is possible
  742 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
  743 left-hand (repeated) opcode is used to select the row, and the right-hand
  744 opcode is used to select the column. The values are as follows:
  745 
  746   0   Always return FALSE (never auto-possessify)
  747   1   Character groups are distinct (possessify if both are OP_PROP)
  748   2   Check character categories in the same group (general or particular)
  749   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
  750 
  751   4   Check left general category vs right particular category
  752   5   Check right general category vs left particular category
  753 
  754   6   Left alphanum vs right general category
  755   7   Left space vs right general category
  756   8   Left word vs right general category
  757 
  758   9   Right alphanum vs left general category
  759  10   Right space vs left general category
  760  11   Right word vs left general category
  761 
  762  12   Left alphanum vs right particular category
  763  13   Left space vs right particular category
  764  14   Left word vs right particular category
  765 
  766  15   Right alphanum vs left particular category
  767  16   Right space vs left particular category
  768  17   Right word vs left particular category
  769 */
  770 
  771 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
  772 /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
  773   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
  774   { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
  775   { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
  776   { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
  777   { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
  778   { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
  779   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
  780   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
  781   { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
  782   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
  783   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
  784 };
  785 
  786 /* This table is used to check whether auto-possessification is possible
  787 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
  788 specifies a general category and the other specifies a particular category. The
  789 row is selected by the general category and the column by the particular
  790 category. The value is 1 if the particular category is not part of the general
  791 category. */
  792 
  793 static const pcre_uint8 catposstab[7][30] = {
  794 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
  795   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
  796   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
  797   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
  798   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
  799   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
  800   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
  801   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
  802 };
  803 
  804 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
  805 a general or particular category. The properties in each row are those
  806 that apply to the character set in question. Duplication means that a little
  807 unnecessary work is done when checking, but this keeps things much simpler
  808 because they can all use the same code. For more details see the comment where
  809 this table is used.
  810 
  811 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
  812 "space", but from Perl 5.18 it's included, so both categories are treated the
  813 same here. */
  814 
  815 static const pcre_uint8 posspropstab[3][4] = {
  816   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
  817   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
  818   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
  819 };
  820 
  821 /* This table is used when converting repeating opcodes into possessified
  822 versions as a result of an explicit possessive quantifier such as ++. A zero
  823 value means there is no possessified version - in those cases the item in
  824 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
  825 because all relevant opcodes are less than that. */
  826 
  827 static const pcre_uint8 opcode_possessify[] = {
  828   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  829   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
  830 
  831   0,                       /* NOTI */
  832   OP_POSSTAR, 0,           /* STAR, MINSTAR */
  833   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  834   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  835   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  836   0,                       /* EXACT */
  837   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
  838 
  839   OP_POSSTARI, 0,          /* STARI, MINSTARI */
  840   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  841   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  842   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  843   0,                       /* EXACTI */
  844   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
  845 
  846   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  847   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  848   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  849   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  850   0,                       /* NOTEXACT */
  851   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
  852 
  853   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  854   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  855   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  856   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  857   0,                       /* NOTEXACTI */
  858   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
  859 
  860   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  861   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  862   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  863   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  864   0,                       /* TYPEEXACT */
  865   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
  866 
  867   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  868   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  869   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  870   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  871   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
  872 
  873   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  874   0, 0,                    /* REF, REFI */
  875   0, 0,                    /* DNREF, DNREFI */
  876   0, 0                     /* RECURSE, CALLOUT */
  877 };
  878 
  879 
  880 
  881 /*************************************************
  882 *            Find an error text                  *
  883 *************************************************/
  884 
  885 /* The error texts are now all in one long string, to save on relocations. As
  886 some of the text is of unknown length, we can't use a table of offsets.
  887 Instead, just count through the strings. This is not a performance issue
  888 because it happens only when there has been a compilation error.
  889 
  890 Argument:   the error number
  891 Returns:    pointer to the error string
  892 */
  893 
  894 static const char *
  895 find_error_text(int n)
  896 {
  897 const char *s = error_texts;
  898 for (; n > 0; n--)
  899   {
  900   while (*s++ != CHAR_NULL) {};
  901   if (*s == CHAR_NULL) return "Error text not found (please report)";
  902   }
  903 return s;
  904 }
  905 
  906 
  907 
  908 /*************************************************
  909 *           Expand the workspace                 *
  910 *************************************************/
  911 
  912 /* This function is called during the second compiling phase, if the number of
  913 forward references fills the existing workspace, which is originally a block on
  914 the stack. A larger block is obtained from malloc() unless the ultimate limit
  915 has been reached or the increase will be rather small.
  916 
  917 Argument: pointer to the compile data block
  918 Returns:  0 if all went well, else an error number
  919 */
  920 
  921 static int
  922 expand_workspace(compile_data *cd)
  923 {
  924 pcre_uchar *newspace;
  925 int newsize = cd->workspace_size * 2;
  926 
  927 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
  928 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
  929     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
  930  return ERR72;
  931 
  932 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
  933 if (newspace == NULL) return ERR21;
  934 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
  935 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
  936 if (cd->workspace_size > COMPILE_WORK_SIZE)
  937   (PUBL(free))((void *)cd->start_workspace);
  938 cd->start_workspace = newspace;
  939 cd->workspace_size = newsize;
  940 return 0;
  941 }
  942 
  943 
  944 
  945 /*************************************************
  946 *            Check for counted repeat            *
  947 *************************************************/
  948 
  949 /* This function is called when a '{' is encountered in a place where it might
  950 start a quantifier. It looks ahead to see if it really is a quantifier or not.
  951 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
  952 where the ddds are digits.
  953 
  954 Arguments:
  955   p         pointer to the first char after '{'
  956 
  957 Returns:    TRUE or FALSE
  958 */
  959 
  960 static BOOL
  961 is_counted_repeat(const pcre_uchar *p)
  962 {
  963 if (!IS_DIGIT(*p)) return FALSE;
  964 p++;
  965 while (IS_DIGIT(*p)) p++;
  966 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
  967 
  968 if (*p++ != CHAR_COMMA) return FALSE;
  969 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
  970 
  971 if (!IS_DIGIT(*p)) return FALSE;
  972 p++;
  973 while (IS_DIGIT(*p)) p++;
  974 
  975 return (*p == CHAR_RIGHT_CURLY_BRACKET);
  976 }
  977 
  978 
  979 
  980 /*************************************************
  981 *            Handle escapes                      *
  982 *************************************************/
  983 
  984 /* This function is called when a \ has been encountered. It either returns a
  985 positive value for a simple escape such as \n, or 0 for a data character which
  986 will be placed in chptr. A backreference to group n is returned as negative n.
  987 When UTF-8 is enabled, a positive value greater than 255 may be returned in
  988 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
  989 character of the escape sequence.
  990 
  991 Arguments:
  992   ptrptr         points to the pattern position pointer
  993   chptr          points to a returned data character
  994   errorcodeptr   points to the errorcode variable
  995   bracount       number of previous extracting brackets
  996   options        the options bits
  997   isclass        TRUE if inside a character class
  998 
  999 Returns:         zero => a data character
 1000                  positive => a special escape sequence
 1001                  negative => a back reference
 1002                  on error, errorcodeptr is set
 1003 */
 1004 
 1005 static int
 1006 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
 1007   int bracount, int options, BOOL isclass)
 1008 {
 1009 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
 1010 BOOL utf = (options & PCRE_UTF8) != 0;
 1011 const pcre_uchar *ptr = *ptrptr + 1;
 1012 pcre_uint32 c;
 1013 int escape = 0;
 1014 int i;
 1015 
 1016 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
 1017 ptr--;                            /* Set pointer back to the last byte */
 1018 
 1019 /* If backslash is at the end of the pattern, it's an error. */
 1020 
 1021 if (c == CHAR_NULL) *errorcodeptr = ERR1;
 1022 
 1023 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
 1024 in a table. A non-zero result is something that can be returned immediately.
 1025 Otherwise further processing may be required. */
 1026 
 1027 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 1028 /* Not alphanumeric */
 1029 else if (c < CHAR_0 || c > CHAR_z) {}
 1030 else if ((i = escapes[c - CHAR_0]) != 0)
 1031   { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
 1032 
 1033 #else           /* EBCDIC coding */
 1034 /* Not alphanumeric */
 1035 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
 1036 else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
 1037 #endif
 1038 
 1039 /* Escapes that need further processing, or are illegal. */
 1040 
 1041 else
 1042   {
 1043   const pcre_uchar *oldptr;
 1044   BOOL braced, negated, overflow;
 1045   int s;
 1046 
 1047   switch (c)
 1048     {
 1049     /* A number of Perl escapes are not handled by PCRE. We give an explicit
 1050     error. */
 1051 
 1052     case CHAR_l:
 1053     case CHAR_L:
 1054     *errorcodeptr = ERR37;
 1055     break;
 1056 
 1057     case CHAR_u:
 1058     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
 1059       {
 1060       /* In JavaScript, \u must be followed by four hexadecimal numbers.
 1061       Otherwise it is a lowercase u letter. */
 1062       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
 1063         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
 1064         && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
 1065         && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
 1066         {
 1067         c = 0;
 1068         for (i = 0; i < 4; ++i)
 1069           {
 1070           register pcre_uint32 cc = *(++ptr);
 1071 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 1072           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
 1073           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 1074 #else           /* EBCDIC coding */
 1075           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
 1076           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 1077 #endif
 1078           }
 1079 
 1080 #if defined COMPILE_PCRE8
 1081         if (c > (utf ? 0x10ffffU : 0xffU))
 1082 #elif defined COMPILE_PCRE16
 1083         if (c > (utf ? 0x10ffffU : 0xffffU))
 1084 #elif defined COMPILE_PCRE32
 1085         if (utf && c > 0x10ffffU)
 1086 #endif
 1087           {
 1088           *errorcodeptr = ERR76;
 1089           }
 1090         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
 1091         }
 1092       }
 1093     else
 1094       *errorcodeptr = ERR37;
 1095     break;
 1096 
 1097     case CHAR_U:
 1098     /* In JavaScript, \U is an uppercase U letter. */
 1099     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
 1100     break;
 1101 
 1102     /* In a character class, \g is just a literal "g". Outside a character
 1103     class, \g must be followed by one of a number of specific things:
 1104 
 1105     (1) A number, either plain or braced. If positive, it is an absolute
 1106     backreference. If negative, it is a relative backreference. This is a Perl
 1107     5.10 feature.
 1108 
 1109     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
 1110     is part of Perl's movement towards a unified syntax for back references. As
 1111     this is synonymous with \k{name}, we fudge it up by pretending it really
 1112     was \k.
 1113 
 1114     (3) For Oniguruma compatibility we also support \g followed by a name or a
 1115     number either in angle brackets or in single quotes. However, these are
 1116     (possibly recursive) subroutine calls, _not_ backreferences. Just return
 1117     the ESC_g code (cf \k). */
 1118 
 1119     case CHAR_g:
 1120     if (isclass) break;
 1121     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
 1122       {
 1123       escape = ESC_g;
 1124       break;
 1125       }
 1126 
 1127     /* Handle the Perl-compatible cases */
 1128 
 1129     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
 1130       {
 1131       const pcre_uchar *p;
 1132       for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
 1133         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
 1134       if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
 1135         {
 1136         escape = ESC_k;
 1137         break;
 1138         }
 1139       braced = TRUE;
 1140       ptr++;
 1141       }
 1142     else braced = FALSE;
 1143 
 1144     if (ptr[1] == CHAR_MINUS)
 1145       {
 1146       negated = TRUE;
 1147       ptr++;
 1148       }
 1149     else negated = FALSE;
 1150 
 1151     /* The integer range is limited by the machine's int representation. */
 1152     s = 0;
 1153     overflow = FALSE;
 1154     while (IS_DIGIT(ptr[1]))
 1155       {
 1156       if (s > INT_MAX / 10 - 1) /* Integer overflow */
 1157         {
 1158         overflow = TRUE;
 1159         break;
 1160         }
 1161       s = s * 10 + (int)(*(++ptr) - CHAR_0);
 1162       }
 1163     if (overflow) /* Integer overflow */
 1164       {
 1165       while (IS_DIGIT(ptr[1]))
 1166         ptr++;
 1167       *errorcodeptr = ERR61;
 1168       break;
 1169       }
 1170 
 1171     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
 1172       {
 1173       *errorcodeptr = ERR57;
 1174       break;
 1175       }
 1176 
 1177     if (s == 0)
 1178       {
 1179       *errorcodeptr = ERR58;
 1180       break;
 1181       }
 1182 
 1183     if (negated)
 1184       {
 1185       if (s > bracount)
 1186         {
 1187         *errorcodeptr = ERR15;
 1188         break;
 1189         }
 1190       s = bracount - (s - 1);
 1191       }
 1192 
 1193     escape = -s;
 1194     break;
 1195 
 1196     /* The handling of escape sequences consisting of a string of digits
 1197     starting with one that is not zero is not straightforward. Perl has changed
 1198     over the years. Nowadays \g{} for backreferences and \o{} for octal are
 1199     recommended to avoid the ambiguities in the old syntax.
 1200 
 1201     Outside a character class, the digits are read as a decimal number. If the
 1202     number is less than 8 (used to be 10), or if there are that many previous
 1203     extracting left brackets, then it is a back reference. Otherwise, up to
 1204     three octal digits are read to form an escaped byte. Thus \123 is likely to
 1205     be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
 1206     the octal value is greater than 377, the least significant 8 bits are
 1207     taken. \8 and \9 are treated as the literal characters 8 and 9.
 1208 
 1209     Inside a character class, \ followed by a digit is always either a literal
 1210     8 or 9 or an octal number. */
 1211 
 1212     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
 1213     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
 1214 
 1215     if (!isclass)
 1216       {
 1217       oldptr = ptr;
 1218       /* The integer range is limited by the machine's int representation. */
 1219       s = (int)(c -CHAR_0);
 1220       overflow = FALSE;
 1221       while (IS_DIGIT(ptr[1]))
 1222         {
 1223         if (s > INT_MAX / 10 - 1) /* Integer overflow */
 1224           {
 1225           overflow = TRUE;
 1226           break;
 1227           }
 1228         s = s * 10 + (int)(*(++ptr) - CHAR_0);
 1229         }
 1230       if (overflow) /* Integer overflow */
 1231         {
 1232         while (IS_DIGIT(ptr[1]))
 1233           ptr++;
 1234         *errorcodeptr = ERR61;
 1235         break;
 1236         }
 1237       if (s < 8 || s <= bracount)  /* Check for back reference */
 1238         {
 1239         escape = -s;
 1240         break;
 1241         }
 1242       ptr = oldptr;      /* Put the pointer back and fall through */
 1243       }
 1244 
 1245     /* Handle a digit following \ when the number is not a back reference. If
 1246     the first digit is 8 or 9, Perl used to generate a binary zero byte and
 1247     then treat the digit as a following literal. At least by Perl 5.18 this
 1248     changed so as not to insert the binary zero. */
 1249 
 1250     if ((c = *ptr) >= CHAR_8) break;
 1251 
 1252     /* Fall through with a digit less than 8 */
 1253 
 1254     /* \0 always starts an octal number, but we may drop through to here with a
 1255     larger first octal digit. The original code used just to take the least
 1256     significant 8 bits of octal numbers (I think this is what early Perls used
 1257     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
 1258     but no more than 3 octal digits. */
 1259 
 1260     case CHAR_0:
 1261     c -= CHAR_0;
 1262     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
 1263         c = c * 8 + *(++ptr) - CHAR_0;
 1264 #ifdef COMPILE_PCRE8
 1265     if (!utf && c > 0xff) *errorcodeptr = ERR51;
 1266 #endif
 1267     break;
 1268 
 1269     /* \o is a relatively new Perl feature, supporting a more general way of
 1270     specifying character codes in octal. The only supported form is \o{ddd}. */
 1271 
 1272     case CHAR_o:
 1273     if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
 1274     if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
 1275       {
 1276       ptr += 2;
 1277       c = 0;
 1278       overflow = FALSE;
 1279       while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
 1280         {
 1281         register pcre_uint32 cc = *ptr++;
 1282         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
 1283 #ifdef COMPILE_PCRE32
 1284         if (c >= 0x20000000l) { overflow = TRUE; break; }
 1285 #endif
 1286         c = (c << 3) + cc - CHAR_0 ;
 1287 #if defined COMPILE_PCRE8
 1288         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
 1289 #elif defined COMPILE_PCRE16
 1290         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
 1291 #elif defined COMPILE_PCRE32
 1292         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
 1293 #endif
 1294         }
 1295       if (overflow)
 1296         {
 1297         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
 1298         *errorcodeptr = ERR34;
 1299         }
 1300       else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
 1301         {
 1302         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
 1303         }
 1304       else *errorcodeptr = ERR80;
 1305       }
 1306     break;
 1307 
 1308     /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
 1309     numbers. Otherwise it is a lowercase x letter. */
 1310 
 1311     case CHAR_x:
 1312     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
 1313       {
 1314       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
 1315         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
 1316         {
 1317         c = 0;
 1318         for (i = 0; i < 2; ++i)
 1319           {
 1320           register pcre_uint32 cc = *(++ptr);
 1321 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 1322           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
 1323           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 1324 #else           /* EBCDIC coding */
 1325           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
 1326           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 1327 #endif
 1328           }
 1329         }
 1330       }    /* End JavaScript handling */
 1331 
 1332     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
 1333     greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
 1334     digits. If not, { used to be treated as a data character. However, Perl
 1335     seems to read hex digits up to the first non-such, and ignore the rest, so
 1336     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
 1337     now gives an error. */
 1338 
 1339     else
 1340       {
 1341       if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
 1342         {
 1343         ptr += 2;
 1344         if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
 1345           {
 1346           *errorcodeptr = ERR86;
 1347           break;
 1348           }
 1349         c = 0;
 1350         overflow = FALSE;
 1351         while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
 1352           {
 1353           register pcre_uint32 cc = *ptr++;
 1354           if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
 1355 
 1356 #ifdef COMPILE_PCRE32
 1357           if (c >= 0x10000000l) { overflow = TRUE; break; }
 1358 #endif
 1359 
 1360 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 1361           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
 1362           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 1363 #else           /* EBCDIC coding */
 1364           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
 1365           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 1366 #endif
 1367 
 1368 #if defined COMPILE_PCRE8
 1369           if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
 1370 #elif defined COMPILE_PCRE16
 1371           if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
 1372 #elif defined COMPILE_PCRE32
 1373           if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
 1374 #endif
 1375           }
 1376 
 1377         if (overflow)
 1378           {
 1379           while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
 1380           *errorcodeptr = ERR34;
 1381           }
 1382 
 1383         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
 1384           {
 1385           if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
 1386           }
 1387 
 1388         /* If the sequence of hex digits does not end with '}', give an error.
 1389         We used just to recognize this construct and fall through to the normal
 1390         \x handling, but nowadays Perl gives an error, which seems much more
 1391         sensible, so we do too. */
 1392 
 1393         else *errorcodeptr = ERR79;
 1394         }   /* End of \x{} processing */
 1395 
 1396       /* Read a single-byte hex-defined char (up to two hex digits after \x) */
 1397 
 1398       else
 1399         {
 1400         c = 0;
 1401         while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
 1402           {
 1403           pcre_uint32 cc;                          /* Some compilers don't like */
 1404           cc = *(++ptr);                           /* ++ in initializers */
 1405 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 1406           if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
 1407           c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 1408 #else           /* EBCDIC coding */
 1409           if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
 1410           c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 1411 #endif
 1412           }
 1413         }     /* End of \xdd handling */
 1414       }       /* End of Perl-style \x handling */
 1415     break;
 1416 
 1417     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
 1418     An error is given if the byte following \c is not an ASCII character. This
 1419     coding is ASCII-specific, but then the whole concept of \cx is
 1420     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
 1421 
 1422     case CHAR_c:
 1423     c = *(++ptr);
 1424     if (c == CHAR_NULL)
 1425       {
 1426       *errorcodeptr = ERR2;
 1427       break;
 1428       }
 1429 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
 1430     if (c > 127)  /* Excludes all non-ASCII in either mode */
 1431       {
 1432       *errorcodeptr = ERR68;
 1433       break;
 1434       }
 1435     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
 1436     c ^= 0x40;
 1437 #else             /* EBCDIC coding */
 1438     if (c >= CHAR_a && c <= CHAR_z) c += 64;
 1439     if (c == CHAR_QUESTION_MARK)
 1440       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
 1441     else
 1442       {
 1443       for (i = 0; i < 32; i++)
 1444         {
 1445         if (c == ebcdic_escape_c[i]) break;
 1446         }
 1447       if (i < 32) c = i; else *errorcodeptr = ERR68;
 1448       }
 1449 #endif
 1450     break;
 1451 
 1452     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 1453     other alphanumeric following \ is an error if PCRE_EXTRA was set;
 1454     otherwise, for Perl compatibility, it is a literal. This code looks a bit
 1455     odd, but there used to be some cases other than the default, and there may
 1456     be again in future, so I haven't "optimized" it. */
 1457 
 1458     default:
 1459     if ((options & PCRE_EXTRA) != 0) switch(c)
 1460       {
 1461       default:
 1462       *errorcodeptr = ERR3;
 1463       break;
 1464       }
 1465     break;
 1466     }
 1467   }
 1468 
 1469 /* Perl supports \N{name} for character names, as well as plain \N for "not
 1470 newline". PCRE does not support \N{name}. However, it does support
 1471 quantification such as \N{2,3}. */
 1472 
 1473 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
 1474      !is_counted_repeat(ptr+2))
 1475   *errorcodeptr = ERR37;
 1476 
 1477 /* If PCRE_UCP is set, we change the values for \d etc. */
 1478 
 1479 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
 1480   escape += (ESC_DU - ESC_D);
 1481 
 1482 /* Set the pointer to the final character before returning. */
 1483 
 1484 *ptrptr = ptr;
 1485 *chptr = c;
 1486 return escape;
 1487 }
 1488 
 1489 
 1490 
 1491 #ifdef SUPPORT_UCP
 1492 /*************************************************
 1493 *               Handle \P and \p                 *
 1494 *************************************************/
 1495 
 1496 /* This function is called after \P or \p has been encountered, provided that
 1497 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
 1498 pointing at the P or p. On exit, it is pointing at the final character of the
 1499 escape sequence.
 1500 
 1501 Argument:
 1502   ptrptr         points to the pattern position pointer
 1503   negptr         points to a boolean that is set TRUE for negation else FALSE
 1504   ptypeptr       points to an unsigned int that is set to the type value
 1505   pdataptr       points to an unsigned int that is set to the detailed property value
 1506   errorcodeptr   points to the error code variable
 1507 
 1508 Returns:         TRUE if the type value was found, or FALSE for an invalid type
 1509 */
 1510 
 1511 static BOOL
 1512 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
 1513   unsigned int *pdataptr, int *errorcodeptr)
 1514 {
 1515 pcre_uchar c;
 1516 int i, bot, top;
 1517 const pcre_uchar *ptr = *ptrptr;
 1518 pcre_uchar name[32];
 1519 
 1520 c = *(++ptr);
 1521 if (c == CHAR_NULL) goto ERROR_RETURN;
 1522 
 1523 *negptr = FALSE;
 1524 
 1525 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
 1526 negation. */
 1527 
 1528 if (c == CHAR_LEFT_CURLY_BRACKET)
 1529   {
 1530   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
 1531     {
 1532     *negptr = TRUE;
 1533     ptr++;
 1534     }
 1535   for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
 1536     {
 1537     c = *(++ptr);
 1538     if (c == CHAR_NULL) goto ERROR_RETURN;
 1539     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
 1540     name[i] = c;
 1541     }
 1542   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
 1543   name[i] = 0;
 1544   }
 1545 
 1546 /* Otherwise there is just one following character */
 1547 
 1548 else
 1549   {
 1550   name[0] = c;
 1551   name[1] = 0;
 1552   }
 1553 
 1554 *ptrptr = ptr;
 1555 
 1556 /* Search for a recognized property name using binary chop */
 1557 
 1558 bot = 0;
 1559 top = PRIV(utt_size);
 1560 
 1561 while (bot < top)
 1562   {
 1563   int r;
 1564   i = (bot + top) >> 1;
 1565   r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
 1566   if (r == 0)
 1567     {
 1568     *ptypeptr = PRIV(utt)[i].type;
 1569     *pdataptr = PRIV(utt)[i].value;
 1570     return TRUE;
 1571     }
 1572   if (r > 0) bot = i + 1; else top = i;
 1573   }
 1574 
 1575 *errorcodeptr = ERR47;
 1576 *ptrptr = ptr;
 1577 return FALSE;
 1578 
 1579 ERROR_RETURN:
 1580 *errorcodeptr = ERR46;
 1581 *ptrptr = ptr;
 1582 return FALSE;
 1583 }
 1584 #endif
 1585 
 1586 
 1587 
 1588 /*************************************************
 1589 *         Read repeat counts                     *
 1590 *************************************************/
 1591 
 1592 /* Read an item of the form {n,m} and return the values. This is called only
 1593 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
 1594 so the syntax is guaranteed to be correct, but we need to check the values.
 1595 
 1596 Arguments:
 1597   p              pointer to first char after '{'
 1598   minp           pointer to int for min
 1599   maxp           pointer to int for max
 1600                  returned as -1 if no max
 1601   errorcodeptr   points to error code variable
 1602 
 1603 Returns:         pointer to '}' on success;
 1604                  current ptr on error, with errorcodeptr set non-zero
 1605 */
 1606 
 1607 static const pcre_uchar *
 1608 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
 1609 {
 1610 int min = 0;
 1611 int max = -1;
 1612 
 1613 while (IS_DIGIT(*p))
 1614   {
 1615   min = min * 10 + (int)(*p++ - CHAR_0);
 1616   if (min > 65535)
 1617     {
 1618     *errorcodeptr = ERR5;
 1619     return p;
 1620     }
 1621   }
 1622 
 1623 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
 1624   {
 1625   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
 1626     {
 1627     max = 0;
 1628     while(IS_DIGIT(*p))
 1629       {
 1630       max = max * 10 + (int)(*p++ - CHAR_0);
 1631       if (max > 65535)
 1632         {
 1633         *errorcodeptr = ERR5;
 1634         return p;
 1635         }
 1636       }
 1637     if (max < min)
 1638       {
 1639       *errorcodeptr = ERR4;
 1640       return p;
 1641       }
 1642     }
 1643   }
 1644 
 1645 *minp = min;
 1646 *maxp = max;
 1647 return p;
 1648 }
 1649 
 1650 
 1651 
 1652 /*************************************************
 1653 *      Find first significant op code            *
 1654 *************************************************/
 1655 
 1656 /* This is called by several functions that scan a compiled expression looking
 1657 for a fixed first character, or an anchoring op code etc. It skips over things
 1658 that do not influence this. For some calls, it makes sense to skip negative
 1659 forward and all backward assertions, and also the \b assertion; for others it
 1660 does not.
 1661 
 1662 Arguments:
 1663   code         pointer to the start of the group
 1664   skipassert   TRUE if certain assertions are to be skipped
 1665 
 1666 Returns:       pointer to the first significant opcode
 1667 */
 1668 
 1669 static const pcre_uchar*
 1670 first_significant_code(const pcre_uchar *code, BOOL skipassert)
 1671 {
 1672 for (;;)
 1673   {
 1674   switch ((int)*code)
 1675     {
 1676     case OP_ASSERT_NOT:
 1677     case OP_ASSERTBACK:
 1678     case OP_ASSERTBACK_NOT:
 1679     if (!skipassert) return code;
 1680     do code += GET(code, 1); while (*code == OP_ALT);
 1681     code += PRIV(OP_lengths)[*code];
 1682     break;
 1683 
 1684     case OP_WORD_BOUNDARY:
 1685     case OP_NOT_WORD_BOUNDARY:
 1686     if (!skipassert) return code;
 1687     /* Fall through */
 1688 
 1689     case OP_CALLOUT:
 1690     case OP_CREF:
 1691     case OP_DNCREF:
 1692     case OP_RREF:
 1693     case OP_DNRREF:
 1694     case OP_DEF:
 1695     code += PRIV(OP_lengths)[*code];
 1696     break;
 1697 
 1698     default:
 1699     return code;
 1700     }
 1701   }
 1702 /* Control never reaches here */
 1703 }
 1704 
 1705 
 1706 
 1707 /*************************************************
 1708 *        Find the fixed length of a branch       *
 1709 *************************************************/
 1710 
 1711 /* Scan a branch and compute the fixed length of subject that will match it,
 1712 if the length is fixed. This is needed for dealing with backward assertions.
 1713 In UTF8 mode, the result is in characters rather than bytes. The branch is
 1714 temporarily terminated with OP_END when this function is called.
 1715 
 1716 This function is called when a backward assertion is encountered, so that if it
 1717 fails, the error message can point to the correct place in the pattern.
 1718 However, we cannot do this when the assertion contains subroutine calls,
 1719 because they can be forward references. We solve this by remembering this case
 1720 and doing the check at the end; a flag specifies which mode we are running in.
 1721 
 1722 Arguments:
 1723   code     points to the start of the pattern (the bracket)
 1724   utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
 1725   atend    TRUE if called when the pattern is complete
 1726   cd       the "compile data" structure
 1727   recurses    chain of recurse_check to catch mutual recursion
 1728 
 1729 Returns:   the fixed length,
 1730              or -1 if there is no fixed length,
 1731              or -2 if \C was encountered (in UTF-8 mode only)
 1732              or -3 if an OP_RECURSE item was encountered and atend is FALSE
 1733              or -4 if an unknown opcode was encountered (internal error)
 1734 */
 1735 
 1736 static int
 1737 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
 1738   recurse_check *recurses)
 1739 {
 1740 int length = -1;                    /* Length common to the branches ended so far; -1 = none yet */
 1741 recurse_check this_recurse;         /* Entry chained onto "recurses" while following OP_RECURSE */
 1742 register int branchlength = 0;      /* Characters counted so far in the current branch */
 1743 register pcre_uchar *cc = code + 1 + LINK_SIZE;   /* Step over the bracket opcode and its link */
 1744
 1745 /* Scan along the opcodes for this branch. If we get to the end of the
 1746 branch, check the length against that of the other branches. */
 1747
 1748 for (;;)
 1749   {
 1750   int d;                            /* Result of a nested find_fixedlength() call */
 1751   pcre_uchar *ce, *cs;              /* End and start of the group called by OP_RECURSE */
 1752   register pcre_uchar op = *cc;
 1753
 1754   switch (op)
 1755     {
 1756     /* Recurse into the group opcodes that can have a fixed length: OP_CBRA,
 1757     OP_BRA, OP_ONCE, OP_ONCE_NC and OP_COND. For OP_CBRA an extra IMM2_SIZE is
 1758     added so that the nested scan starts after the data that follows the link.
 1759     The other variants of these opcodes are treated as variable length below. */
 1760
 1761     case OP_CBRA:
 1762     case OP_BRA:
 1763     case OP_ONCE:
 1764     case OP_ONCE_NC:
 1765     case OP_COND:
 1766     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
 1767       recurses);
 1768     if (d < 0) return d;            /* Pass any failure code up unchanged */
 1769     branchlength += d;
 1770     do cc += GET(cc, 1); while (*cc == OP_ALT);   /* Follow the links past every alternative */
 1771     cc += 1 + LINK_SIZE;
 1772     break;
 1773
 1774     /* Reached end of a branch; if it's a ket it is the end of a nested call.
 1775     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
 1776     an ALT. If it is END it's the end of the outer call. All can be handled by
 1777     the same code. Note that we must not include the OP_KETRxxx opcodes here,
 1778     because they all imply an unlimited repeat. */
 1779
 1780     case OP_ALT:
 1781     case OP_KET:
 1782     case OP_END:
 1783     case OP_ACCEPT:
 1784     case OP_ASSERT_ACCEPT:
 1785     if (length < 0) length = branchlength;
 1786       else if (length != branchlength) return -1;
 1787     if (*cc != OP_ALT) return length;   /* Only OP_ALT is followed by another branch */
 1788     cc += 1 + LINK_SIZE;
 1789     branchlength = 0;
 1790     break;
 1791
 1792     /* A true recursion implies not fixed length, but a subroutine call may
 1793     be OK. If the subroutine is a forward reference, we can't deal with
 1794     it until the end of the pattern, so return -3. */
 1795
 1796     case OP_RECURSE:
 1797     if (!atend) return -3;
 1798     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
 1799     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
 1800     if (cc > cs && cc < ce) return -1;                    /* Recursion */
 1801     else   /* Check for mutual recursion */
 1802       {
 1803       recurse_check *r = recurses;
 1804       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
 1805       if (r != NULL) return -1;   /* Mutual recursion */
 1806       }
 1807     this_recurse.prev = recurses;   /* Record this call so that nested calls can detect a cycle */
 1808     this_recurse.group = cs;
 1809     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
 1810     if (d < 0) return d;
 1811     branchlength += d;
 1812     cc += 1 + LINK_SIZE;
 1813     break;
 1814
 1815     /* Skip over assertive subpatterns */
 1816
 1817     case OP_ASSERT:
 1818     case OP_ASSERT_NOT:
 1819     case OP_ASSERTBACK:
 1820     case OP_ASSERTBACK_NOT:
 1821     do cc += GET(cc, 1); while (*cc == OP_ALT);
 1822     cc += 1 + LINK_SIZE;
 1823     break;
 1824
 1825     /* Skip over things that don't match chars */
 1826
 1827     case OP_MARK:
 1828     case OP_PRUNE_ARG:
 1829     case OP_SKIP_ARG:
 1830     case OP_THEN_ARG:
 1831     cc += cc[1] + PRIV(OP_lengths)[*cc];   /* Table length plus the argument length held in cc[1] */
 1832     break;
 1833
 1834     case OP_CALLOUT:
 1835     case OP_CIRC:
 1836     case OP_CIRCM:
 1837     case OP_CLOSE:
 1838     case OP_COMMIT:
 1839     case OP_CREF:
 1840     case OP_DEF:
 1841     case OP_DNCREF:
 1842     case OP_DNRREF:
 1843     case OP_DOLL:
 1844     case OP_DOLLM:
 1845     case OP_EOD:
 1846     case OP_EODN:
 1847     case OP_FAIL:
 1848     case OP_NOT_WORD_BOUNDARY:
 1849     case OP_PRUNE:
 1850     case OP_REVERSE:
 1851     case OP_RREF:
 1852     case OP_SET_SOM:
 1853     case OP_SKIP:
 1854     case OP_SOD:
 1855     case OP_SOM:
 1856     case OP_THEN:
 1857     case OP_WORD_BOUNDARY:
 1858     cc += PRIV(OP_lengths)[*cc];
 1859     break;
 1860
 1861     /* Handle literal characters */
 1862
 1863     case OP_CHAR:
 1864     case OP_CHARI:
 1865     case OP_NOT:
 1866     case OP_NOTI:
 1867     branchlength++;
 1868     cc += 2;
 1869 #ifdef SUPPORT_UTF
 1870     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 1871 #endif
 1872     break;
 1873
 1874     /* Handle exact repetitions. The count is already in characters, but we
 1875     need to skip over a multibyte character in UTF8 mode.  */
 1876
 1877     case OP_EXACT:
 1878     case OP_EXACTI:
 1879     case OP_NOTEXACT:
 1880     case OP_NOTEXACTI:
 1881     branchlength += (int)GET2(cc,1);
 1882     cc += 2 + IMM2_SIZE;
 1883 #ifdef SUPPORT_UTF
 1884     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 1885 #endif
 1886     break;
 1887
 1888     case OP_TYPEEXACT:
 1889     branchlength += GET2(cc,1);
 1890     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
 1891       cc += 2;                      /* OP_PROP and OP_NOTPROP carry two extra items of data */
 1892     cc += 1 + IMM2_SIZE + 1;
 1893     break;
 1894
 1895     /* Handle single-char matchers */
 1896
 1897     case OP_PROP:
 1898     case OP_NOTPROP:
 1899     cc += 2;                        /* Skip the two data items, then count as one character */
 1900     /* Fall through */
 1901
 1902     case OP_HSPACE:
 1903     case OP_VSPACE:
 1904     case OP_NOT_HSPACE:
 1905     case OP_NOT_VSPACE:
 1906     case OP_NOT_DIGIT:
 1907     case OP_DIGIT:
 1908     case OP_NOT_WHITESPACE:
 1909     case OP_WHITESPACE:
 1910     case OP_NOT_WORDCHAR:
 1911     case OP_WORDCHAR:
 1912     case OP_ANY:
 1913     case OP_ALLANY:
 1914     branchlength++;
 1915     cc++;
 1916     break;
 1917
 1918     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
 1919     otherwise \C is coded as OP_ALLANY. */
 1920
 1921     case OP_ANYBYTE:
 1922     return -2;
 1923
 1924     /* Check a class for variable quantification */
 1925
 1926     case OP_CLASS:
 1927     case OP_NCLASS:
 1928 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
 1929     case OP_XCLASS:
 1930     /* The original code caused an unsigned overflow in 64 bit systems,
 1931     so now we use a conditional statement. */
 1932     if (op == OP_XCLASS)
 1933       cc += GET(cc, 1);
 1934     else
 1935       cc += PRIV(OP_lengths)[OP_CLASS];
 1936 #else
 1937     cc += PRIV(OP_lengths)[OP_CLASS];
 1938 #endif
 1939
 1940     switch (*cc)                    /* Inspect the opcode that follows the class */
 1941       {
 1942       case OP_CRSTAR:
 1943       case OP_CRMINSTAR:
 1944       case OP_CRPLUS:
 1945       case OP_CRMINPLUS:
 1946       case OP_CRQUERY:
 1947       case OP_CRMINQUERY:
 1948       case OP_CRPOSSTAR:
 1949       case OP_CRPOSPLUS:
 1950       case OP_CRPOSQUERY:
 1951       return -1;
 1952
 1953       case OP_CRRANGE:
 1954       case OP_CRMINRANGE:
 1955       case OP_CRPOSRANGE:
 1956       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;   /* The two stored counts differ */
 1957       branchlength += (int)GET2(cc,1);
 1958       cc += 1 + 2 * IMM2_SIZE;
 1959       break;
 1960
 1961       default:
 1962       branchlength++;               /* No class repeat follows: count one character; cc is not advanced */
 1963       }
 1964     break;
 1965
 1966     /* Anything else is variable length */
 1967
 1968     case OP_ANYNL:
 1969     case OP_BRAMINZERO:
 1970     case OP_BRAPOS:
 1971     case OP_BRAPOSZERO:
 1972     case OP_BRAZERO:
 1973     case OP_CBRAPOS:
 1974     case OP_EXTUNI:
 1975     case OP_KETRMAX:
 1976     case OP_KETRMIN:
 1977     case OP_KETRPOS:
 1978     case OP_MINPLUS:
 1979     case OP_MINPLUSI:
 1980     case OP_MINQUERY:
 1981     case OP_MINQUERYI:
 1982     case OP_MINSTAR:
 1983     case OP_MINSTARI:
 1984     case OP_MINUPTO:
 1985     case OP_MINUPTOI:
 1986     case OP_NOTMINPLUS:
 1987     case OP_NOTMINPLUSI:
 1988     case OP_NOTMINQUERY:
 1989     case OP_NOTMINQUERYI:
 1990     case OP_NOTMINSTAR:
 1991     case OP_NOTMINSTARI:
 1992     case OP_NOTMINUPTO:
 1993     case OP_NOTMINUPTOI:
 1994     case OP_NOTPLUS:
 1995     case OP_NOTPLUSI:
 1996     case OP_NOTPOSPLUS:
 1997     case OP_NOTPOSPLUSI:
 1998     case OP_NOTPOSQUERY:
 1999     case OP_NOTPOSQUERYI:
 2000     case OP_NOTPOSSTAR:
 2001     case OP_NOTPOSSTARI:
 2002     case OP_NOTPOSUPTO:
 2003     case OP_NOTPOSUPTOI:
 2004     case OP_NOTQUERY:
 2005     case OP_NOTQUERYI:
 2006     case OP_NOTSTAR:
 2007     case OP_NOTSTARI:
 2008     case OP_NOTUPTO:
 2009     case OP_NOTUPTOI:
 2010     case OP_PLUS:
 2011     case OP_PLUSI:
 2012     case OP_POSPLUS:
 2013     case OP_POSPLUSI:
 2014     case OP_POSQUERY:
 2015     case OP_POSQUERYI:
 2016     case OP_POSSTAR:
 2017     case OP_POSSTARI:
 2018     case OP_POSUPTO:
 2019     case OP_POSUPTOI:
 2020     case OP_QUERY:
 2021     case OP_QUERYI:
 2022     case OP_REF:
 2023     case OP_REFI:
 2024     case OP_DNREF:
 2025     case OP_DNREFI:
 2026     case OP_SBRA:
 2027     case OP_SBRAPOS:
 2028     case OP_SCBRA:
 2029     case OP_SCBRAPOS:
 2030     case OP_SCOND:
 2031     case OP_SKIPZERO:
 2032     case OP_STAR:
 2033     case OP_STARI:
 2034     case OP_TYPEMINPLUS:
 2035     case OP_TYPEMINQUERY:
 2036     case OP_TYPEMINSTAR:
 2037     case OP_TYPEMINUPTO:
 2038     case OP_TYPEPLUS:
 2039     case OP_TYPEPOSPLUS:
 2040     case OP_TYPEPOSQUERY:
 2041     case OP_TYPEPOSSTAR:
 2042     case OP_TYPEPOSUPTO:
 2043     case OP_TYPEQUERY:
 2044     case OP_TYPESTAR:
 2045     case OP_TYPEUPTO:
 2046     case OP_UPTO:
 2047     case OP_UPTOI:
 2048     return -1;
 2049
 2050     /* Catch unrecognized opcodes so that when new ones are added they
 2051     are not forgotten, as has happened in the past. */
 2052
 2053     default:
 2054     return -4;
 2055     }
 2056   }
 2057 /* Control never gets here */
 2058 }
 2059 
 2060 
 2061 
 2062 /*************************************************
 2063 *    Scan compiled regex for specific bracket    *
 2064 *************************************************/
 2065 
 2066 /* This little function scans through a compiled pattern until it finds a
 2067 capturing bracket with the given number, or, if the number is negative, an
 2068 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
 2069 so that it can be called from pcre_study() when finding the minimum matching
 2070 length.
 2071 
 2072 Arguments:
 2073   code        points to start of expression
 2074   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
 2075   number      the required bracket number or negative to find a lookbehind
 2076 
 2077 Returns:      pointer to the opcode for the bracket, or NULL if not found
 2078 */
 2079 
 2080 const pcre_uchar *
 2081 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
 2082 {
 2083 for (;;)
 2084   {
 2085   register pcre_uchar c = *code;
 2086 
 2087   if (c == OP_END) return NULL;
 2088 
 2089   /* XCLASS is used for classes that cannot be represented just by a bit
 2090   map. This includes negated single high-valued characters. The length in
 2091   the table is zero; the actual length is stored in the compiled code. */
 2092 
 2093   if (c == OP_XCLASS) code += GET(code, 1);
 2094 
 2095   /* Handle recursion */
 2096 
 2097   else if (c == OP_REVERSE)
 2098     {
 2099     if (number < 0) return (pcre_uchar *)code;
 2100     code += PRIV(OP_lengths)[c];
 2101     }
 2102 
 2103   /* Handle capturing bracket */
 2104 
 2105   else if (c == OP_CBRA || c == OP_SCBRA ||
 2106            c == OP_CBRAPOS || c == OP_SCBRAPOS)
 2107     {
 2108     int n = (int)GET2(code, 1+LINK_SIZE);
 2109     if (n == number) return (pcre_uchar *)code;
 2110     code += PRIV(OP_lengths)[c];
 2111     }
 2112 
 2113   /* Otherwise, we can get the item's length from the table, except that for
 2114   repeated character types, we have to test for \p and \P, which have an extra
 2115   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
 2116   must add in its length. */
 2117 
 2118   else
 2119     {
 2120     switch(c)
 2121       {
 2122       case OP_TYPESTAR:
 2123       case OP_TYPEMINSTAR:
 2124       case OP_TYPEPLUS:
 2125       case OP_TYPEMINPLUS:
 2126       case OP_TYPEQUERY:
 2127       case OP_TYPEMINQUERY:
 2128       case OP_TYPEPOSSTAR:
 2129       case OP_TYPEPOSPLUS:
 2130       case OP_TYPEPOSQUERY:
 2131       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
 2132       break;
 2133 
 2134       case OP_TYPEUPTO:
 2135       case OP_TYPEMINUPTO:
 2136       case OP_TYPEEXACT:
 2137       case OP_TYPEPOSUPTO:
 2138       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
 2139         code += 2;
 2140       break;
 2141 
 2142       case OP_MARK:
 2143       case OP_PRUNE_ARG:
 2144       case OP_SKIP_ARG:
 2145       case OP_THEN_ARG:
 2146       code += code[1];
 2147       break;
 2148       }
 2149 
 2150     /* Add in the fixed length from the table */
 2151 
 2152     code += PRIV(OP_lengths)[c];
 2153 
 2154   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
 2155   a multi-byte character. The length in the table is a minimum, so we have to
 2156   arrange to skip the extra bytes. */
 2157 
 2158 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 2159     if (utf) switch(c)
 2160       {
 2161       case OP_CHAR:
 2162       case OP_CHARI:
 2163       case OP_NOT:
 2164       case OP_NOTI:
 2165       case OP_EXACT:
 2166       case OP_EXACTI:
 2167       case OP_NOTEXACT:
 2168       case OP_NOTEXACTI:
 2169       case OP_UPTO:
 2170       case OP_UPTOI:
 2171       case OP_NOTUPTO:
 2172       case OP_NOTUPTOI:
 2173       case OP_MINUPTO:
 2174       case OP_MINUPTOI:
 2175       case OP_NOTMINUPTO:
 2176       case OP_NOTMINUPTOI:
 2177       case OP_POSUPTO:
 2178       case OP_POSUPTOI:
 2179       case OP_NOTPOSUPTO:
 2180       case OP_NOTPOSUPTOI:
 2181       case OP_STAR:
 2182       case OP_STARI:
 2183       case OP_NOTSTAR:
 2184       case OP_NOTSTARI:
 2185       case OP_MINSTAR:
 2186       case OP_MINSTARI:
 2187       case OP_NOTMINSTAR:
 2188       case OP_NOTMINSTARI:
 2189       case OP_POSSTAR:
 2190       case OP_POSSTARI:
 2191       case OP_NOTPOSSTAR:
 2192       case OP_NOTPOSSTARI:
 2193       case OP_PLUS:
 2194       case OP_PLUSI:
 2195       case OP_NOTPLUS:
 2196       case OP_NOTPLUSI:
 2197       case OP_MINPLUS:
 2198       case OP_MINPLUSI:
 2199       case OP_NOTMINPLUS:
 2200       case OP_NOTMINPLUSI:
 2201       case OP_POSPLUS:
 2202       case OP_POSPLUSI:
 2203       case OP_NOTPOSPLUS:
 2204       case OP_NOTPOSPLUSI:
 2205       case OP_QUERY:
 2206       case OP_QUERYI:
 2207       case OP_NOTQUERY:
 2208       case OP_NOTQUERYI:
 2209       case OP_MINQUERY:
 2210       case OP_MINQUERYI:
 2211       case OP_NOTMINQUERY:
 2212       case OP_NOTMINQUERYI:
 2213       case OP_POSQUERY:
 2214       case OP_POSQUERYI:
 2215       case OP_NOTPOSQUERY:
 2216       case OP_NOTPOSQUERYI:
 2217       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
 2218       break;
 2219       }
 2220 #else
 2221     (void)(utf);  /* Keep compiler happy by referencing function argument */
 2222 #endif
 2223     }
 2224   }
 2225 }
 2226 
 2227 
 2228 
 2229 /*************************************************
 2230 *   Scan compiled regex for recursion reference  *
 2231 *************************************************/
 2232 
 2233 /* This little function scans through a compiled pattern until it finds an
 2234 instance of OP_RECURSE.
 2235 
 2236 Arguments:
 2237   code        points to start of expression
 2238   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
 2239 
 2240 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
 2241 */
 2242 
 2243 static const pcre_uchar *
 2244 find_recurse(const pcre_uchar *code, BOOL utf)
 2245 {
 2246 for (;;)
 2247   {
 2248   register pcre_uchar c = *code;
 2249   if (c == OP_END) return NULL;
 2250   if (c == OP_RECURSE) return code;
 2251 
 2252   /* XCLASS is used for classes that cannot be represented just by a bit
 2253   map. This includes negated single high-valued characters. The length in
 2254   the table is zero; the actual length is stored in the compiled code. */
 2255 
 2256   if (c == OP_XCLASS) code += GET(code, 1);
 2257 
 2258   /* Otherwise, we can get the item's length from the table, except that for
 2259   repeated character types, we have to test for \p and \P, which have an extra
 2260   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
 2261   must add in its length. */
 2262 
 2263   else
 2264     {
 2265     switch(c)
 2266       {
 2267       case OP_TYPESTAR:
 2268       case OP_TYPEMINSTAR:
 2269       case OP_TYPEPLUS:
 2270       case OP_TYPEMINPLUS:
 2271       case OP_TYPEQUERY:
 2272       case OP_TYPEMINQUERY:
 2273       case OP_TYPEPOSSTAR:
 2274       case OP_TYPEPOSPLUS:
 2275       case OP_TYPEPOSQUERY:
 2276       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
 2277       break;
 2278 
 2279       case OP_TYPEPOSUPTO:
 2280       case OP_TYPEUPTO:
 2281       case OP_TYPEMINUPTO:
 2282       case OP_TYPEEXACT:
 2283       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
 2284         code += 2;
 2285       break;
 2286 
 2287       case OP_MARK:
 2288       case OP_PRUNE_ARG:
 2289       case OP_SKIP_ARG:
 2290       case OP_THEN_ARG:
 2291       code += code[1];
 2292       break;
 2293       }
 2294 
 2295     /* Add in the fixed length from the table */
 2296 
 2297     code += PRIV(OP_lengths)[c];
 2298 
 2299     /* In UTF-8 mode, opcodes that are followed by a character may be followed
 2300     by a multi-byte character. The length in the table is a minimum, so we have
 2301     to arrange to skip the extra bytes. */
 2302 
 2303 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 2304     if (utf) switch(c)
 2305       {
 2306       case OP_CHAR:
 2307       case OP_CHARI:
 2308       case OP_NOT:
 2309       case OP_NOTI:
 2310       case OP_EXACT:
 2311       case OP_EXACTI:
 2312       case OP_NOTEXACT:
 2313       case OP_NOTEXACTI:
 2314       case OP_UPTO:
 2315       case OP_UPTOI:
 2316       case OP_NOTUPTO:
 2317       case OP_NOTUPTOI:
 2318       case OP_MINUPTO:
 2319       case OP_MINUPTOI:
 2320       case OP_NOTMINUPTO:
 2321       case OP_NOTMINUPTOI:
 2322       case OP_POSUPTO:
 2323       case OP_POSUPTOI:
 2324       case OP_NOTPOSUPTO:
 2325       case OP_NOTPOSUPTOI:
 2326       case OP_STAR:
 2327       case OP_STARI:
 2328       case OP_NOTSTAR:
 2329       case OP_NOTSTARI:
 2330       case OP_MINSTAR:
 2331       case OP_MINSTARI:
 2332       case OP_NOTMINSTAR:
 2333       case OP_NOTMINSTARI:
 2334       case OP_POSSTAR:
 2335       case OP_POSSTARI:
 2336       case OP_NOTPOSSTAR:
 2337       case OP_NOTPOSSTARI:
 2338       case OP_PLUS:
 2339       case OP_PLUSI:
 2340       case OP_NOTPLUS:
 2341       case OP_NOTPLUSI:
 2342       case OP_MINPLUS:
 2343       case OP_MINPLUSI:
 2344       case OP_NOTMINPLUS:
 2345       case OP_NOTMINPLUSI:
 2346       case OP_POSPLUS:
 2347       case OP_POSPLUSI:
 2348       case OP_NOTPOSPLUS:
 2349       case OP_NOTPOSPLUSI:
 2350       case OP_QUERY:
 2351       case OP_QUERYI:
 2352       case OP_NOTQUERY:
 2353       case OP_NOTQUERYI:
 2354       case OP_MINQUERY:
 2355       case OP_MINQUERYI:
 2356       case OP_NOTMINQUERY:
 2357       case OP_NOTMINQUERYI:
 2358       case OP_POSQUERY:
 2359       case OP_POSQUERYI:
 2360       case OP_NOTPOSQUERY:
 2361       case OP_NOTPOSQUERYI:
 2362       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
 2363       break;
 2364       }
 2365 #else
 2366     (void)(utf);  /* Keep compiler happy by referencing function argument */
 2367 #endif
 2368     }
 2369   }
 2370 }
 2371 
 2372 
 2373 
 2374 /*************************************************
 2375 *    Scan compiled branch for non-emptiness      *
 2376 *************************************************/
 2377 
 2378 /* This function scans through a branch of a compiled pattern to see whether it
 2379 can match the empty string or not. It is called from could_be_empty()
 2380 below and from compile_branch() when checking for an unlimited repeat of a
 2381 group that can match nothing. Note that first_significant_code() skips over
 2382 backward and negative forward assertions when its final argument is TRUE. If we
 2383 hit an unclosed bracket, we return "empty" - this means we've struck an inner
 2384 bracket whose current branch will already have been scanned.
 2385 
      The scan ends with FALSE as soon as it meets an item, or a nested group or
      subroutine call, that cannot match an empty string. In all other cases the
      result is TRUE; this includes reaching the end of the branch (a KET or an
      ALT) and running up to endcode.

 2386 Arguments:
 2387   code        points to start of search
 2388   endcode     points to where to stop
 2389   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
 2390   cd          contains pointers to tables etc.
 2391   recurses    chain of recurse_check to catch mutual recursion
 2392 
 2393 Returns:      TRUE if what is matched could be empty
 2394 */
 2395 
 2396 static BOOL
 2397 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
 2398   BOOL utf, compile_data *cd, recurse_check *recurses)
 2399 {
 2400 register pcre_uchar c;
 2401 recurse_check this_recurse;
 2402 
      /* Scanning starts at the item after the opcode that "code" points to. On
      each pass c holds the opcode whose table length the loop increment adds to
      code. That increment runs after "continue" as well, which is why the
      group-skipping code below sets c to the opcode that code has been moved to
      before continuing. */

 2403 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
 2404      code < endcode;
 2405      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
 2406   {
 2407   const pcre_uchar *ccode;
 2408 
 2409   c = *code;
 2410 
 2411   /* Skip over forward assertions; the other assertions are skipped by
 2412   first_significant_code() with a TRUE final argument. */
 2413 
 2414   if (c == OP_ASSERT)
 2415     {
 2416     do code += GET(code, 1); while (*code == OP_ALT);
 2417     c = *code;
 2418     continue;
 2419     }
 2420 
 2421   /* For a recursion/subroutine call, if its end has been reached, which
 2422   implies a backward reference subroutine call, we can scan it. If it's a
 2423   forward reference subroutine call, we can't. To detect forward reference
 2424   we have to scan up the list that is kept in the workspace. This function is
 2425   called only when doing the real compile, not during the pre-compile that
 2426   measures the size of the compiled pattern. */
 2427 
 2428   if (c == OP_RECURSE)
 2429     {
 2430     const pcre_uchar *scode = cd->start_code + GET(code, 1);
 2431     const pcre_uchar *endgroup = scode;
 2432     BOOL empty_branch;
 2433 
 2434     /* Test for forward reference or uncompleted reference. This is disabled
 2435     when called to scan a completed pattern by setting cd->start_workspace to
 2436     NULL. */
 2437 
 2438     if (cd->start_workspace != NULL)
 2439       {
 2440       const pcre_uchar *tcode;
            /* A workspace entry equal to the offset of the field that follows this
            RECURSE opcode identifies a forward reference; the answer is then
            "could be empty". */
 2441       for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
 2442         if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
 2443       if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
 2444       }
 2445 
 2446     /* If the reference is to a completed group, we need to detect whether this
 2447     is a recursive call, as otherwise there will be an infinite loop. If it is
 2448     a recursion, just skip over it. Simple recursions are easily detected. For
 2449     mutual recursions we keep a chain on the stack. */
 2450 
          /* Follow the branch links to find where the referenced group ends. */
 2451     do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
 2452     if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
 2453     else
 2454       {
 2455       recurse_check *r = recurses;
 2456       for (r = recurses; r != NULL; r = r->prev)
 2457         if (r->group == scode) break;
 2458       if (r != NULL) continue;   /* Mutual recursion */
 2459       }
 2460 
 2461     /* Completed reference; scan the referenced group, remembering it on the
 2462     stack chain to detect mutual recursions. */
 2463 
 2464     empty_branch = FALSE;
 2465     this_recurse.prev = recurses;
 2466     this_recurse.group = scode;
 2467 
 2468     do
 2469       {
 2470       if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
 2471         {
 2472         empty_branch = TRUE;
 2473         break;
 2474         }
 2475       scode += GET(scode, 1);
 2476       }
 2477     while (*scode == OP_ALT);
 2478 
 2479     if (!empty_branch) return FALSE;  /* All branches are non-empty */
 2480     continue;   /* c is still OP_RECURSE, so this item is stepped over */
 2481     }
 2482 
 2483   /* Groups with zero repeats can of course be empty; skip them. */
 2484 
 2485   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
 2486       c == OP_BRAPOSZERO)
 2487     {
 2488     code += PRIV(OP_lengths)[c];   /* Move on to the group itself */
 2489     do code += GET(code, 1); while (*code == OP_ALT);
 2490     c = *code;
 2491     continue;
 2492     }
 2493 
 2494   /* A nested group that is already marked as "could be empty" can just be
 2495   skipped. */
 2496 
 2497   if (c == OP_SBRA  || c == OP_SBRAPOS ||
 2498       c == OP_SCBRA || c == OP_SCBRAPOS)
 2499     {
 2500     do code += GET(code, 1); while (*code == OP_ALT);
 2501     c = *code;
 2502     continue;
 2503     }
 2504 
 2505   /* For other groups, scan the branches. */
 2506 
 2507   if (c == OP_BRA  || c == OP_BRAPOS ||
 2508       c == OP_CBRA || c == OP_CBRAPOS ||
 2509       c == OP_ONCE || c == OP_ONCE_NC ||
 2510       c == OP_COND || c == OP_SCOND)
 2511     {
 2512     BOOL empty_branch;
 2513     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
 2514 
 2515     /* If a conditional group has only one branch, there is a second, implied,
 2516     empty branch, so just skip over the conditional, because it could be empty.
 2517     Otherwise, scan the individual branches of the group. */
 2518 
 2519     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
 2520       code += GET(code, 1);
 2521     else
 2522       {
 2523       empty_branch = FALSE;
 2524       do
 2525         {
              /* Once one branch is known to be possibly empty, the remaining
              branches are only stepped over and not scanned. */
 2526         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
 2527           recurses)) empty_branch = TRUE;
 2528         code += GET(code, 1);
 2529         }
 2530       while (*code == OP_ALT);
 2531       if (!empty_branch) return FALSE;   /* All branches are non-empty */
 2532       }
 2533 
 2534     c = *code;
 2535     continue;
 2536     }
 2537 
 2538   /* Handle the other opcodes */
 2539 
 2540   switch (c)
 2541     {
 2542     /* Check for quantifiers after a class. XCLASS is used for classes that
 2543     cannot be represented just by a bit map. This includes negated single
 2544     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
 2545     actual length is stored in the compiled code, so we must update "code"
 2546     here. */
 2547 
 2548 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 2549     case OP_XCLASS:
 2550     ccode = code += GET(code, 1);
 2551     goto CHECK_CLASS_REPEAT;
 2552 #endif
 2553 
 2554     case OP_CLASS:
 2555     case OP_NCLASS:
 2556     ccode = code + PRIV(OP_lengths)[OP_CLASS];   /* Item after the class */
 2557 
 2558 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 2559     CHECK_CLASS_REPEAT:
 2560 #endif
 2561 
 2562     switch (*ccode)
 2563       {
 2564       case OP_CRSTAR:            /* These could be empty; continue */
 2565       case OP_CRMINSTAR:
 2566       case OP_CRQUERY:
 2567       case OP_CRMINQUERY:
 2568       case OP_CRPOSSTAR:
 2569       case OP_CRPOSQUERY:
 2570       break;
 2571 
 2572       default:                   /* Non-repeat => class must match */
 2573       case OP_CRPLUS:            /* These repeats aren't empty */
 2574       case OP_CRMINPLUS:
 2575       case OP_CRPOSPLUS:
 2576       return FALSE;
 2577 
 2578       case OP_CRRANGE:
 2579       case OP_CRMINRANGE:
 2580       case OP_CRPOSRANGE:
 2581       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
 2582       break;
 2583       }
 2584     break;
 2585 
 2586     /* Opcodes that must match a character */
 2587 
 2588     case OP_ANY:
 2589     case OP_ALLANY:
 2590     case OP_ANYBYTE:
 2591 
 2592     case OP_PROP:
 2593     case OP_NOTPROP:
 2594     case OP_ANYNL:
 2595 
 2596     case OP_NOT_HSPACE:
 2597     case OP_HSPACE:
 2598     case OP_NOT_VSPACE:
 2599     case OP_VSPACE:
 2600     case OP_EXTUNI:
 2601 
 2602     case OP_NOT_DIGIT:
 2603     case OP_DIGIT:
 2604     case OP_NOT_WHITESPACE:
 2605     case OP_WHITESPACE:
 2606     case OP_NOT_WORDCHAR:
 2607     case OP_WORDCHAR:
 2608 
 2609     case OP_CHAR:
 2610     case OP_CHARI:
 2611     case OP_NOT:
 2612     case OP_NOTI:
 2613 
 2614     case OP_PLUS:
 2615     case OP_PLUSI:
 2616     case OP_MINPLUS:
 2617     case OP_MINPLUSI:
 2618 
 2619     case OP_NOTPLUS:
 2620     case OP_NOTPLUSI:
 2621     case OP_NOTMINPLUS:
 2622     case OP_NOTMINPLUSI:
 2623 
 2624     case OP_POSPLUS:
 2625     case OP_POSPLUSI:
 2626     case OP_NOTPOSPLUS:
 2627     case OP_NOTPOSPLUSI:
 2628 
 2629     case OP_EXACT:
 2630     case OP_EXACTI:
 2631     case OP_NOTEXACT:
 2632     case OP_NOTEXACTI:
 2633 
 2634     case OP_TYPEPLUS:
 2635     case OP_TYPEMINPLUS:
 2636     case OP_TYPEPOSPLUS:
 2637     case OP_TYPEEXACT:
 2638 
 2639     return FALSE;
 2640 
 2641     /* These are going to continue, as they may be empty, but we have to
 2642     fudge the length for the \p and \P cases. The two extra code units added
          here are over and above the table length that the loop increment adds. */
 2643 
 2644     case OP_TYPESTAR:
 2645     case OP_TYPEMINSTAR:
 2646     case OP_TYPEPOSSTAR:
 2647     case OP_TYPEQUERY:
 2648     case OP_TYPEMINQUERY:
 2649     case OP_TYPEPOSQUERY:
 2650     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
 2651     break;
 2652 
 2653     /* Same for these */
 2654 
 2655     case OP_TYPEUPTO:
 2656     case OP_TYPEMINUPTO:
 2657     case OP_TYPEPOSUPTO:
 2658     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
 2659       code += 2;
 2660     break;
 2661 
 2662     /* End of branch */
 2663 
 2664     case OP_KET:
 2665     case OP_KETRMAX:
 2666     case OP_KETRMIN:
 2667     case OP_KETRPOS:
 2668     case OP_ALT:
 2669     return TRUE;
 2670 
 2671     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
 2672     MINUPTO, and POSUPTO and their caseless and negative versions may be
 2673     followed by a multibyte character. */
 2674 
 2675 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 2676     case OP_STAR:
 2677     case OP_STARI:
 2678     case OP_NOTSTAR:
 2679     case OP_NOTSTARI:
 2680 
 2681     case OP_MINSTAR:
 2682     case OP_MINSTARI:
 2683     case OP_NOTMINSTAR:
 2684     case OP_NOTMINSTARI:
 2685 
 2686     case OP_POSSTAR:
 2687     case OP_POSSTARI:
 2688     case OP_NOTPOSSTAR:
 2689     case OP_NOTPOSSTARI:
 2690 
 2691     case OP_QUERY:
 2692     case OP_QUERYI:
 2693     case OP_NOTQUERY:
 2694     case OP_NOTQUERYI:
 2695 
 2696     case OP_MINQUERY:
 2697     case OP_MINQUERYI:
 2698     case OP_NOTMINQUERY:
 2699     case OP_NOTMINQUERYI:
 2700 
 2701     case OP_POSQUERY:
 2702     case OP_POSQUERYI:
 2703     case OP_NOTPOSQUERY:
 2704     case OP_NOTPOSQUERYI:
 2705 
 2706     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
 2707     break;
 2708 
 2709     case OP_UPTO:
 2710     case OP_UPTOI:
 2711     case OP_NOTUPTO:
 2712     case OP_NOTUPTOI:
 2713 
 2714     case OP_MINUPTO:
 2715     case OP_MINUPTOI:
 2716     case OP_NOTMINUPTO:
 2717     case OP_NOTMINUPTOI:
 2718 
 2719     case OP_POSUPTO:
 2720     case OP_POSUPTOI:
 2721     case OP_NOTPOSUPTO:
 2722     case OP_NOTPOSUPTOI:
 2723 
 2724     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
 2725     break;
 2726 #endif
 2727 
 2728     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
 2729     string. The amount to skip is taken from code[1]. */
 2730 
 2731     case OP_MARK:
 2732     case OP_PRUNE_ARG:
 2733     case OP_SKIP_ARG:
 2734     case OP_THEN_ARG:
 2735     code += code[1];
 2736     break;
 2737 
 2738     /* None of the remaining opcodes are required to match a character. */
 2739 
 2740     default:
 2741     break;
 2742     }
 2743   }
 2744 
 2745 return TRUE;
 2746 }
 2747 
 2748 
 2749 
 2750 /*************************************************
 2751 *    Scan compiled regex for non-emptiness       *
 2752 *************************************************/
 2753 
 2754 /* Used when checking for left recursive calls. Starting from the branch at the
 2755 head of the chain, each branch is tested with could_be_empty_branch(), moving
 2756 outwards along the chain. The walk stops at the first branch that starts
 2757 before "code" (that is, outside the group being recursed into) or at the end
 2758 of the chain. This function is called only during the real compile, not
 2759 during the pre-compile.
 2760 
 2761 Arguments:
 2762   code        points to start of the recursion
 2763   endcode     points to where to stop (current RECURSE item)
 2764   bcptr       points to the chain of current (unclosed) branch starts
 2765   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
 2766   cd          pointers to tables etc
 2767 
 2768 Returns:      TRUE if every branch examined could be empty, otherwise FALSE
 2769 */
 2770 
 2771 static BOOL
 2772 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
 2773   branch_chain *bcptr, BOOL utf, compile_data *cd)
 2774 {
 2775 branch_chain *bc;
 2776 
 2777 for (bc = bcptr; bc != NULL && bc->current_branch >= code; bc = bc->outer)
 2778   if (!could_be_empty_branch(bc->current_branch, endcode, utf, cd, NULL))
 2779     return FALSE;   /* One non-empty branch settles it */
 2780 
 2781 return TRUE;
 2782 }
 2783 
 2784 
 2785 
 2786 /*************************************************
 2787 *        Base opcode of repeated opcodes         *
 2788 *************************************************/
 2789 
 2790 /* Finds the first (STAR) opcode of the repeat family that an opcode belongs
 2791 to. Opcodes above OP_TYPEPOSUPTO are not repeats and are returned unchanged.
      Opcodes below OP_STAR are not tested for and give OP_STAR, so callers have to
      exclude them beforehand.
 2792 
 2793 Arguments:  c opcode
 2794 Returns:    base opcode for the type
 2795 */
 2796 
 2797 static pcre_uchar
 2798 get_repeat_base(pcre_uchar c)
 2799 {
 2800 if (c > OP_TYPEPOSUPTO) return c;   /* Beyond the repeat opcodes */
 2801 if (c >= OP_TYPESTAR) return OP_TYPESTAR;
 2802 if (c >= OP_NOTSTARI) return OP_NOTSTARI;
 2803 if (c >= OP_NOTSTAR) return OP_NOTSTAR;
 2804 if (c >= OP_STARI) return OP_STARI;
 2805 return OP_STAR;
 2806 }
 2807 
 2808 
 2809 
 2810 #ifdef SUPPORT_UCP
 2811 /*************************************************
 2812 *        Check a character and a property        *
 2813 *************************************************/
 2814 
 2815 /* Called when a property item is adjacent to a fixed character, to find out
 2816 whether the item is unable to match the character (so possessifying is safe).
 2817 
 2818 Arguments:
 2819   c            the character
 2820   ptype        the property type
 2821   pdata        the data for the type
 2822   negated      TRUE if it's a negated property (\P or \p{^)
 2823 
 2824 Returns:       TRUE if auto-possessifying is OK; FALSE for an unhandled ptype
 2825 */
 2826 
 2827 static BOOL
 2828 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
 2829   BOOL negated)
 2830 {
 2831 const ucd_record *prop = GET_UCD(c);
 2832 const pcre_uint32 *p;
 2833 BOOL has_prop;   /* Whether c has the (un-negated) property */
 2834 
 2835 switch(ptype)
 2836   {
 2837   case PT_LAMP:
 2838   has_prop = (prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
 2839               prop->chartype == ucp_Lt);
 2840   break;
 2841 
 2842   case PT_GC:
 2843   has_prop = (pdata == PRIV(ucp_gentype)[prop->chartype]);
 2844   break;
 2845 
 2846   case PT_PC:
 2847   has_prop = (pdata == prop->chartype);
 2848   break;
 2849 
 2850   case PT_SC:
 2851   has_prop = (pdata == prop->script);
 2852   break;
 2853 
 2854   case PT_ALNUM:    /* This and the remaining cases are specials */
 2855   has_prop = (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
 2856               PRIV(ucp_gentype)[prop->chartype] == ucp_N);
 2857   break;
 2858 
 2859   case PT_SPACE:    /* Perl space; has included VT since PCRE 8.34, so it */
 2860   case PT_PXSPACE:  /* is now identical to POSIX space */
 2861   switch(c)
 2862     {
 2863     HSPACE_CASES:
 2864     VSPACE_CASES:
 2865     return negated;
 2866     default:
 2867     has_prop = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z);
 2868     break;
 2869     }
 2870   break;
 2871 
 2872   case PT_WORD:
 2873   has_prop = (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
 2874               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
 2875               c == CHAR_UNDERSCORE);
 2876   break;
 2877 
 2878   case PT_CLIST:    /* NOTE(review): pdata is not used here - confirm */
 2879   p = PRIV(ucd_caseless_sets) + prop->caseset;  /* Set recorded for c */
 2880   while (c >= *p)   /* Stop at the first entry greater than c */
 2881     if (c == *p++) return negated;
 2882   return !negated;
 2883 
 2884   default:
 2885   return FALSE;
 2886   }
 2887 
 2888 return has_prop == negated;
 2889 }
 2890 #endif  /* SUPPORT_UCP */
 2891 
 2892 
 2893 
 2894 /*************************************************
 2895 *        Fill the character property list        *
 2896 *************************************************/
 2897 
 2898 /* Checks whether the code points to an opcode that can take part in auto-
 2899 possessification, and if so, fills a list with its properties.
 2900 
 2901 Arguments:
 2902   code        points to start of expression
 2903   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
 2904   fcc         points to case-flipping table
 2905   list        points to output list
 2906               list[0] will be filled with the opcode
                      (a repeat is replaced by the opcode of the item it
                      repeats, a caseless character by its caseful opcode,
                      and a caseless-set property by OP_CHAR or OP_NOT)
 2907               list[1] will be non-zero if this opcode
 2908                 can match an empty character string
 2909               list[2..7] depends on the opcode
                      characters: a list of characters ending with NOTACHAR
                      property:   the type in list[2], the data in list[3]
                      class:      list[2] is the distance from the class
                                  data to the returned pointer
 2910 
 2911 Returns:      points to the start of the next opcode if *code is accepted
 2912               NULL if *code is not accepted
 2913 */
 2914 
 2915 static const pcre_uchar *
 2916 get_chr_property_list(const pcre_uchar *code, BOOL utf,
 2917   const pcre_uint8 *fcc, pcre_uint32 *list)
 2918 {
 2919 pcre_uchar c = *code;
 2920 pcre_uchar base;
 2921 const pcre_uchar *end;
 2922 pcre_uint32 chr;
 2923 
 2924 #ifdef SUPPORT_UCP
 2925 pcre_uint32 *clist_dest;
 2926 const pcre_uint32 *clist_src;
 2927 #else
 2928 (void)utf;  /* Suppress "unused parameter" compiler warning */
 2929 #endif
 2930 
 2931 list[0] = c;
 2932 list[1] = FALSE;
 2933 code++;
 2934 
      /* A repeat of a single character or character type: note whether it can
      match nothing, then carry on as if it were the item being repeated. */

 2935 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
 2936   {
 2937   base = get_repeat_base(c);
 2938   c -= (base - OP_STAR);   /* Same kind of repeat in the OP_STAR family */
 2939 
        /* Skip the IMM2_SIZE value that follows these opcodes. */
 2940   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
 2941     code += IMM2_SIZE;
 2942 
 2943   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
 2944 
 2945   switch(base)
 2946     {
 2947     case OP_STAR:
 2948     list[0] = OP_CHAR;
 2949     break;
 2950 
 2951     case OP_STARI:
 2952     list[0] = OP_CHARI;
 2953     break;
 2954 
 2955     case OP_NOTSTAR:
 2956     list[0] = OP_NOT;
 2957     break;
 2958 
 2959     case OP_NOTSTARI:
 2960     list[0] = OP_NOTI;
 2961     break;
 2962 
 2963     case OP_TYPESTAR:
 2964     list[0] = *code;       /* The repeated type is the next code unit */
 2965     code++;
 2966     break;
 2967     }
 2968   c = list[0];
 2969   }
 2970 
 2971 switch(c)
 2972   {
 2973   case OP_NOT_DIGIT:
 2974   case OP_DIGIT:
 2975   case OP_NOT_WHITESPACE:
 2976   case OP_WHITESPACE:
 2977   case OP_NOT_WORDCHAR:
 2978   case OP_WORDCHAR:
 2979   case OP_ANY:
 2980   case OP_ALLANY:
 2981   case OP_ANYNL:
 2982   case OP_NOT_HSPACE:
 2983   case OP_HSPACE:
 2984   case OP_NOT_VSPACE:
 2985   case OP_VSPACE:
 2986   case OP_EXTUNI:
 2987   case OP_EODN:
 2988   case OP_EOD:
 2989   case OP_DOLL:
 2990   case OP_DOLLM:
 2991   return code;
 2992 
 2993   case OP_CHAR:
 2994   case OP_NOT:
 2995   GETCHARINCTEST(chr, code);
 2996   list[2] = chr;
 2997   list[3] = NOTACHAR;
 2998   return code;
 2999 
 3000   case OP_CHARI:
 3001   case OP_NOTI:
 3002   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
 3003   GETCHARINCTEST(chr, code);
 3004   list[2] = chr;
 3005 
 3006 #ifdef SUPPORT_UCP
 3007   if (chr < 128 || (chr < 256 && !utf))
 3008     list[3] = fcc[chr];
 3009   else
 3010     list[3] = UCD_OTHERCASE(chr);
 3011 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
 3012   list[3] = (chr < 256) ? fcc[chr] : chr;
 3013 #else
 3014   list[3] = fcc[chr];
 3015 #endif
 3016 
 3017   /* The othercase might be the same value. */
 3018 
 3019   if (chr == list[3])
 3020     list[3] = NOTACHAR;
 3021   else
 3022     list[4] = NOTACHAR;
 3023   return code;
 3024 
 3025 #ifdef SUPPORT_UCP
 3026   case OP_PROP:
 3027   case OP_NOTPROP:
 3028   if (code[0] != PT_CLIST)
 3029     {
 3030     list[2] = code[0];
 3031     list[3] = code[1];
 3032     return code + 2;
 3033     }
 3034 
 3035   /* Convert only if we have enough space. */
 3036 
 3037   clist_src = PRIV(ucd_caseless_sets) + code[1];
 3038   clist_dest = list + 2;
 3039   code += 2;
 3040 
 3041   do {
 3042      if (clist_dest >= list + 8)
 3043        {
 3044        /* Early return if there is not enough space. This should never
 3045        happen, since all clists are shorter than 5 characters now. The item
             is then reported as an ordinary property. Because code has already
             been moved past the property type and data, they are read from
             negative offsets. */
 3046        list[2] = code[-2];
 3047        list[3] = code[-1];
 3048        return code;
 3049        }
 3050      *clist_dest++ = *clist_src;
 3051      }
 3052   while(*clist_src++ != NOTACHAR);
 3053 
 3054   /* All characters are stored. The terminating NOTACHAR
 3055   is copied from the clist itself. */
 3056 
 3057   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
 3058   return code;
 3059 #endif
 3060 
 3061   case OP_NCLASS:
 3062   case OP_CLASS:
 3063 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 3064   case OP_XCLASS:
        /* code is already one past the opcode, hence the -1 to reach the item
        that follows the class. */
 3065   if (c == OP_XCLASS)
 3066     end = code + GET(code, 0) - 1;
 3067   else
 3068 #endif
 3069     end = code + 32 / sizeof(pcre_uchar);
 3070 
        /* A repeat that follows the class is treated as part of the item. */
 3071   switch(*end)
 3072     {
 3073     case OP_CRSTAR:
 3074     case OP_CRMINSTAR:
 3075     case OP_CRQUERY:
 3076     case OP_CRMINQUERY:
 3077     case OP_CRPOSSTAR:
 3078     case OP_CRPOSQUERY:
 3079     list[1] = TRUE;
 3080     end++;
 3081     break;
 3082 
 3083     case OP_CRPLUS:
 3084     case OP_CRMINPLUS:
 3085     case OP_CRPOSPLUS:
 3086     end++;
 3087     break;
 3088 
 3089     case OP_CRRANGE:
 3090     case OP_CRMINRANGE:
 3091     case OP_CRPOSRANGE:
 3092     list[1] = (GET2(end, 1) == 0);
 3093     end += 1 + 2 * IMM2_SIZE;
 3094     break;
 3095     }
 3096   list[2] = (pcre_uint32)(end - code);
 3097   return end;
 3098   }
 3099 return NULL;    /* Opcode not accepted */
 3100 }
 3101 
 3102 
 3103 
 3104 /*************************************************
 3105 *    Scan further character sets for match       *
 3106 *************************************************/
 3107 
 3108 /* Checks whether the base and the current opcode have a common character, in
 3109 which case the base cannot be possessified.
 3110 
 3111 Arguments:
 3112   code        points to the byte code
 3113   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
 3114   cd          static compile data
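 3115   base_list   the data list of the base opcode
        base_end    pointer from which base_list[2] is subtracted to find the
                      class data of the base opcode
        rec_limit   points to a count of the calls still allowed; FALSE is
                      returned when it has reached zero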
 3116 
 3117 Returns:      TRUE if the auto-possessification is possible
 3118 */
 3119 
 3120 static BOOL
 3121 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
 3122   const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
 3123 {
 3124 pcre_uchar c;
 3125 pcre_uint32 list[8];
 3126 const pcre_uint32 *chr_ptr;
 3127 const pcre_uint32 *ochr_ptr;
 3128 const pcre_uint32 *list_ptr;
 3129 const pcre_uchar *next_code;
 3130 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 3131 const pcre_uchar *xclass_flags;
 3132 #endif
 3133 const pcre_uint8 *class_bitset;
 3134 const pcre_uint8 *set1, *set2, *set_end;
 3135 pcre_uint32 chr;
 3136 BOOL accepted, invert_bits;
 3137 BOOL entered_a_group = FALSE;
 3138 
 3139 if (*rec_limit == 0) return FALSE;
 3140 --(*rec_limit);
 3141 
 3142 /* Note: base_list[1] records whether the current opcode has a greedy
 3143 quantifier (represented by a non-zero value). This is different from
 3144 other character type lists, which store here whether the character iterator
 3145 can match an empty string (also represented by a non-zero value). */
 3146 
 3147 for(;;)
 3148   {
 3149   /* All operations move the code pointer forward.
 3150   Therefore infinite recursions are not possible. */
 3151 
 3152   c = *code;
 3153 
 3154   /* Skip over callouts */
 3155 
 3156   if (c == OP_CALLOUT)
 3157     {
 3158     code += PRIV(OP_lengths)[c];
 3159     continue;
 3160     }
 3161 
 3162   if (c == OP_ALT)
 3163     {
 3164     do code += GET(code, 1); while (*code == OP_ALT);
 3165     c = *code;
 3166     }
 3167 
 3168   switch(c)
 3169     {
 3170     case OP_END:
 3171     case OP_KETRPOS:
 3172     /* TRUE only in greedy case. The non-greedy case could be replaced by
 3173     an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
 3174     uses more memory, which we cannot get at this stage.) */
 3175 
 3176     return base_list[1] != 0;
 3177 
 3178     case OP_KET:
 3179     /* If the bracket is capturing, and referenced by an OP_RECURSE, or
 3180     it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
 3181     cannot be converted to a possessive form. */
 3182 
 3183     if (base_list[1] == 0) return FALSE;
 3184 
 3185     switch(*(code - GET(code, 1)))
 3186       {
 3187       case OP_ASSERT:
 3188       case OP_ASSERT_NOT:
 3189       case OP_ASSERTBACK:
 3190       case OP_ASSERTBACK_NOT:
 3191       case OP_ONCE:
 3192       case OP_ONCE_NC:
 3193       /* Atomic sub-patterns and assertions can always auto-possessify their
 3194       last iterator. However, if the group was entered as a result of checking
 3195       a previous iterator, this is not possible. */
 3196 
 3197       return !entered_a_group;
 3198       }
 3199 
 3200     code += PRIV(OP_lengths)[c];
 3201     continue;
 3202 
 3203     case OP_ONCE:
 3204     case OP_ONCE_NC:
 3205     case OP_BRA:
 3206     case OP_CBRA:
 3207     next_code = code + GET(code, 1);
 3208     code += PRIV(OP_lengths)[c];
 3209 
 3210     while (*next_code == OP_ALT)
 3211       {
 3212       if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
 3213         return FALSE;
 3214       code = next_code + 1 + LINK_SIZE;
 3215       next_code += GET(next_code, 1);
 3216       }
 3217 
 3218     entered_a_group = TRUE;
 3219     continue;
 3220 
 3221     case OP_BRAZERO:
 3222     case OP_BRAMINZERO:
 3223 
 3224     next_code = code + 1;
 3225     if (*next_code != OP_BRA && *next_code != OP_CBRA
 3226         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
 3227 
 3228     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
 3229 
 3230     /* The bracket content will be checked by the
 3231     OP_BRA/OP_CBRA case above. */
 3232     next_code += 1 + LINK_SIZE;
 3233     if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
 3234       return FALSE;
 3235 
 3236     code += PRIV(OP_lengths)[c];
 3237     continue;
 3238 
 3239     default:
 3240     break;
 3241     }
 3242 
 3243   /* Check for a supported opcode, and load its properties. */
 3244 
 3245   code = get_chr_property_list(code, utf, cd->fcc, list);
 3246   if (code == NULL) return FALSE;    /* Unsupported */
 3247 
 3248   /* If either opcode is a small character list, set pointers for comparing
 3249   characters from that list with another list, or with a property. */
 3250 
 3251   if (base_list[0] == OP_CHAR)
 3252     {
 3253     chr_ptr = base_list + 2;
 3254     list_ptr = list;
 3255     }
 3256   else if (list[0] == OP_CHAR)
 3257     {
 3258     chr_ptr = list + 2;
 3259     list_ptr = base_list;
 3260     }
 3261 
 3262   /* Character bitsets can also be compared to certain opcodes. */
 3263 
 3264   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
 3265 #ifdef COMPILE_PCRE8
 3266       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
 3267       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
 3268 #endif
 3269       )
 3270     {
 3271 #ifdef COMPILE_PCRE8
 3272     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
 3273 #else
 3274     if (base_list[0] == OP_CLASS)
 3275 #endif
 3276       {
 3277       set1 = (pcre_uint8 *)(base_end - base_list[2]);
 3278       list_ptr = list;
 3279       }
 3280     else
 3281       {
 3282       set1 = (pcre_uint8 *)(code - list[2]);
 3283       list_ptr = base_list;
 3284       }
 3285 
 3286     invert_bits = FALSE;
 3287     switch(list_ptr[0])
 3288       {
 3289       case OP_CLASS:
 3290       case OP_NCLASS:
 3291       set2 = (pcre_uint8 *)
 3292         ((list_ptr == list ? code : base_end) - list_ptr[2]);
 3293       break;
 3294 
 3295 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 3296       case OP_XCLASS:
 3297       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
 3298       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
 3299       if ((*xclass_flags & XCL_MAP) == 0)
 3300         {
 3301         /* No bits are set for characters < 256. */
 3302         if (list[1] == 0) return TRUE;
 3303         /* Might be an empty repeat. */
 3304         continue;
 3305         }
 3306       set2 = (pcre_uint8 *)(xclass_flags + 1);
 3307       break;
 3308 #endif
 3309 
 3310       case OP_NOT_DIGIT:
 3311       invert_bits = TRUE;
 3312       /* Fall through */
 3313       case OP_DIGIT:
 3314       set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
 3315       break;
 3316 
 3317       case OP_NOT_WHITESPACE:
 3318       invert_bits = TRUE;
 3319       /* Fall through */
 3320       case OP_WHITESPACE:
 3321       set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
 3322       break;
 3323 
 3324       case OP_NOT_WORDCHAR:
 3325       invert_bits = TRUE;
 3326       /* Fall through */
 3327       case OP_WORDCHAR:
 3328       set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
 3329       break;
 3330 
 3331       default:
 3332       return FALSE;
 3333       }
 3334 
 3335     /* Because the sets are unaligned, we need
 3336     to perform byte comparison here. */
 3337     set_end = set1 + 32;
 3338     if (invert_bits)
 3339       {
 3340       do
 3341         {
 3342         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
 3343         }
 3344       while (set1 < set_end);
 3345       }
 3346     else
 3347       {
 3348       do
 3349         {
 3350         if ((*set1++ & *set2++) != 0) return FALSE;
 3351         }
 3352       while (set1 < set_end);
 3353       }
 3354 
 3355     if (list[1] == 0) return TRUE;
 3356     /* Might be an empty repeat. */
 3357     continue;
 3358     }
 3359 
 3360   /* Some property combinations also acceptable. Unicode property opcodes are
 3361   processed specially; the rest can be handled with a lookup table. */
 3362 
 3363   else
 3364     {
 3365     pcre_uint32 leftop, rightop;
 3366 
 3367     leftop = base_list[0];
 3368     rightop = list[0];
 3369 
 3370 #ifdef SUPPORT_UCP
 3371     accepted = FALSE; /* Always set in non-unicode case. */
 3372     if (leftop == OP_PROP || leftop == OP_NOTPROP)
 3373       {
 3374       if (rightop == OP_EOD)
 3375         accepted = TRUE;
 3376       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
 3377         {
 3378         int n;
 3379         const pcre_uint8 *p;
 3380         BOOL same = leftop == rightop;
 3381         BOOL lisprop = leftop == OP_PROP;
 3382         BOOL risprop = rightop == OP_PROP;
 3383         BOOL bothprop = lisprop && risprop;
 3384 
 3385         /* There's a table that specifies how each combination is to be
 3386         processed:
 3387           0   Always return FALSE (never auto-possessify)
 3388           1   Character groups are distinct (possessify if both are OP_PROP)
 3389           2   Check character categories in the same group (general or particular)
 3390           3   Return TRUE if the two opcodes are not the same
 3391           ... see comments below
 3392         */
 3393 
 3394         n = propposstab[base_list[2]][list[2]];
 3395         switch(n)
 3396           {
 3397           case 0: break;
 3398           case 1: accepted = bothprop; break;
 3399           case 2: accepted = (base_list[3] == list[3]) != same; break;
 3400           case 3: accepted = !same; break;
 3401 
 3402           case 4:  /* Left general category, right particular category */
 3403           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
 3404           break;
 3405 
 3406           case 5:  /* Right general category, left particular category */
 3407           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
 3408           break;
 3409 
 3410           /* This code is logically tricky. Think hard before fiddling with it.
 3411           The posspropstab table has four entries per row. Each row relates to
 3412           one of PCRE's special properties such as ALNUM or SPACE or WORD.
 3413           Only WORD actually needs all four entries, but using repeats for the
 3414           others means they can all use the same code below.
 3415 
 3416           The first two entries in each row are Unicode general categories, and
 3417           apply always, because all the characters they include are part of the
 3418           PCRE character set. The third and fourth entries are a general and a
 3419           particular category, respectively, that include one or more relevant
 3420           characters. One or the other is used, depending on whether the check
 3421           is for a general or a particular category. However, in both cases the
 3422           category contains more characters than the specials that are defined
 3423           for the property being tested against. Therefore, it cannot be used
 3424           in a NOTPROP case.
 3425 
 3426           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
 3427           Underscore is covered by ucp_P or ucp_Po. */
 3428 
 3429           case 6:  /* Left alphanum vs right general category */
 3430           case 7:  /* Left space vs right general category */
 3431           case 8:  /* Left word vs right general category */
 3432           p = posspropstab[n-6];
 3433           accepted = risprop && lisprop ==
 3434             (list[3] != p[0] &&
 3435              list[3] != p[1] &&
 3436             (list[3] != p[2] || !lisprop));
 3437           break;
 3438 
 3439           case 9:   /* Right alphanum vs left general category */
 3440           case 10:  /* Right space vs left general category */
 3441           case 11:  /* Right word vs left general category */
 3442           p = posspropstab[n-9];
 3443           accepted = lisprop && risprop ==
 3444             (base_list[3] != p[0] &&
 3445              base_list[3] != p[1] &&
 3446             (base_list[3] != p[2] || !risprop));
 3447           break;
 3448 
 3449           case 12:  /* Left alphanum vs right particular category */
 3450           case 13:  /* Left space vs right particular category */
 3451           case 14:  /* Left word vs right particular category */
 3452           p = posspropstab[n-12];
 3453           accepted = risprop && lisprop ==
 3454             (catposstab[p[0]][list[3]] &&
 3455              catposstab[p[1]][list[3]] &&
 3456             (list[3] != p[3] || !lisprop));
 3457           break;
 3458 
 3459           case 15:  /* Right alphanum vs left particular category */
 3460           case 16:  /* Right space vs left particular category */
 3461           case 17:  /* Right word vs left particular category */
 3462           p = posspropstab[n-15];
 3463           accepted = lisprop && risprop ==
 3464             (catposstab[p[0]][base_list[3]] &&
 3465              catposstab[p[1]][base_list[3]] &&
 3466             (base_list[3] != p[3] || !risprop));
 3467           break;
 3468           }
 3469         }
 3470       }
 3471 
 3472     else
 3473 #endif  /* SUPPORT_UCP */
 3474 
 3475     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
 3476            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
 3477            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
 3478 
 3479     if (!accepted) return FALSE;
 3480 
 3481     if (list[1] == 0) return TRUE;
 3482     /* Might be an empty repeat. */
 3483     continue;
 3484     }
 3485 
 3486   /* Control reaches here only if one of the items is a small character list.
 3487   All characters are checked against the other side. */
 3488 
 3489   do
 3490     {
 3491     chr = *chr_ptr;
 3492 
 3493     switch(list_ptr[0])
 3494       {
 3495       case OP_CHAR:
 3496       ochr_ptr = list_ptr + 2;
 3497       do
 3498         {
 3499         if (chr == *ochr_ptr) return FALSE;
 3500         ochr_ptr++;
 3501         }
 3502       while(*ochr_ptr != NOTACHAR);
 3503       break;
 3504 
 3505       case OP_NOT:
 3506       ochr_ptr = list_ptr + 2;
 3507       do
 3508         {
 3509         if (chr == *ochr_ptr)
 3510           break;
 3511         ochr_ptr++;
 3512         }
 3513       while(*ochr_ptr != NOTACHAR);
 3514       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
 3515       break;
 3516 
 3517       /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
 3518       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
 3519 
 3520       case OP_DIGIT:
 3521       if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
 3522       break;
 3523 
 3524       case OP_NOT_DIGIT:
 3525       if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
 3526       break;
 3527 
 3528       case OP_WHITESPACE:
 3529       if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
 3530       break;
 3531 
 3532       case OP_NOT_WHITESPACE:
 3533       if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
 3534       break;
 3535 
 3536       case OP_WORDCHAR:
 3537       if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
 3538       break;
 3539 
 3540       case OP_NOT_WORDCHAR:
 3541       if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
 3542       break;
 3543 
 3544       case OP_HSPACE:
 3545       switch(chr)
 3546         {
 3547         HSPACE_CASES: return FALSE;
 3548         default: break;
 3549         }
 3550       break;
 3551 
 3552       case OP_NOT_HSPACE:
 3553       switch(chr)
 3554         {
 3555         HSPACE_CASES: break;
 3556         default: return FALSE;
 3557         }
 3558       break;
 3559 
 3560       case OP_ANYNL:
 3561       case OP_VSPACE:
 3562       switch(chr)
 3563         {
 3564         VSPACE_CASES: return FALSE;
 3565         default: break;
 3566         }
 3567       break;
 3568 
 3569       case OP_NOT_VSPACE:
 3570       switch(chr)
 3571         {
 3572         VSPACE_CASES: break;
 3573         default: return FALSE;
 3574         }
 3575       break;
 3576 
 3577       case OP_DOLL:
 3578       case OP_EODN:
 3579       switch (chr)
 3580         {
 3581         case CHAR_CR:
 3582         case CHAR_LF:
 3583         case CHAR_VT:
 3584         case CHAR_FF:
 3585         case CHAR_NEL:
 3586 #ifndef EBCDIC
 3587         case 0x2028:
 3588         case 0x2029:
 3589 #endif  /* Not EBCDIC */
 3590         return FALSE;
 3591         }
 3592       break;
 3593 
 3594       case OP_EOD:    /* Can always possessify before \z */
 3595       break;
 3596 
 3597 #ifdef SUPPORT_UCP
 3598       case OP_PROP:
 3599       case OP_NOTPROP:
 3600       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
 3601             list_ptr[0] == OP_NOTPROP))
 3602         return FALSE;
 3603       break;
 3604 #endif
 3605 
 3606       case OP_NCLASS:
 3607       if (chr > 255) return FALSE;
 3608       /* Fall through */
 3609 
 3610       case OP_CLASS:
 3611       if (chr > 255) break;
 3612       class_bitset = (pcre_uint8 *)
 3613         ((list_ptr == list ? code : base_end) - list_ptr[2]);
 3614       if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
 3615       break;
 3616 
 3617 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 3618       case OP_XCLASS:
 3619       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
 3620           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
 3621       break;
 3622 #endif
 3623 
 3624       default:
 3625       return FALSE;
 3626       }
 3627 
 3628     chr_ptr++;
 3629     }
 3630   while(*chr_ptr != NOTACHAR);
 3631 
 3632   /* At least one character must be matched from this opcode. */
 3633 
 3634   if (list[1] == 0) return TRUE;
 3635   }
 3636 
 3637 /* Control never reaches here. There used to be a fail-save return FALSE; here,
 3638 but some compilers complain about an unreachable statement. */
 3639 
 3640 }
 3641 
 3642 
 3643 
 3644 /*************************************************
 3645 *    Scan compiled regex for auto-possession     *
 3646 *************************************************/
 3647 
 3648 /* Replaces single character iterations with their possessive alternatives
 3649 if appropriate. This function modifies the compiled opcode!
 3650 
 3651 Arguments:
 3652   code        points to start of the byte code
 3653   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
 3654   cd          static compile data
 3655 
 3656 Returns:      nothing
 3657 */
 3658 
 3659 static void
 3660 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
 3661 {
 3662 register pcre_uchar c;
 3663 const pcre_uchar *end;
 3664 pcre_uchar *repeat_opcode;
 3665 pcre_uint32 list[8];
 3666 int rec_limit;
 3667 
 3668 for (;;)
 3669   {
 3670   c = *code;
 3671 
 3672   /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
 3673   it may compile without complaining, but may get into a loop here if the code
 3674   pointer points to a bad value. This is, of course a documentated possibility,
 3675   when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
 3676   just give up on this optimization. */
 3677 
 3678   if (c >= OP_TABLE_LENGTH) return;
 3679 
 3680   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
 3681     {
 3682     c -= get_repeat_base(c) - OP_STAR;
 3683     end = (c <= OP_MINUPTO) ?
 3684       get_chr_property_list(code, utf, cd->fcc, list) : NULL;
 3685     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
 3686 
 3687     rec_limit = 1000;
 3688     if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
 3689       {
 3690       switch(c)
 3691         {
 3692         case OP_STAR:
 3693         *code += OP_POSSTAR - OP_STAR;
 3694         break;
 3695 
 3696         case OP_MINSTAR:
 3697         *code += OP_POSSTAR - OP_MINSTAR;
 3698         break;
 3699 
 3700         case OP_PLUS:
 3701         *code += OP_POSPLUS - OP_PLUS;
 3702         break;
 3703 
 3704         case OP_MINPLUS:
 3705         *code += OP_POSPLUS - OP_MINPLUS;
 3706         break;
 3707 
 3708         case OP_QUERY:
 3709         *code += OP_POSQUERY - OP_QUERY;
 3710         break;
 3711 
 3712         case OP_MINQUERY:
 3713         *code += OP_POSQUERY - OP_MINQUERY;
 3714         break;
 3715 
 3716         case OP_UPTO:
 3717         *code += OP_POSUPTO - OP_UPTO;
 3718         break;
 3719 
 3720         case OP_MINUPTO:
 3721         *code += OP_POSUPTO - OP_MINUPTO;
 3722         break;
 3723         }
 3724       }
 3725     c = *code;
 3726     }
 3727   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
 3728     {
 3729 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 3730     if (c == OP_XCLASS)
 3731       repeat_opcode = code + GET(code, 1);
 3732     else
 3733 #endif
 3734       repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
 3735 
 3736     c = *repeat_opcode;
 3737     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
 3738       {
 3739       /* end must not be NULL. */
 3740       end = get_chr_property_list(code, utf, cd->fcc, list);
 3741 
 3742       list[1] = (c & 1) == 0;
 3743 
 3744       rec_limit = 1000;
 3745       if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
 3746         {
 3747         switch (c)
 3748           {
 3749           case OP_CRSTAR:
 3750           case OP_CRMINSTAR:
 3751           *repeat_opcode = OP_CRPOSSTAR;
 3752           break;
 3753 
 3754           case OP_CRPLUS:
 3755           case OP_CRMINPLUS:
 3756           *repeat_opcode = OP_CRPOSPLUS;
 3757           break;
 3758 
 3759           case OP_CRQUERY:
 3760           case OP_CRMINQUERY:
 3761           *repeat_opcode = OP_CRPOSQUERY;
 3762           break;
 3763 
 3764           case OP_CRRANGE:
 3765           case OP_CRMINRANGE:
 3766           *repeat_opcode = OP_CRPOSRANGE;
 3767           break;
 3768           }
 3769         }
 3770       }
 3771     c = *code;
 3772     }
 3773 
 3774   switch(c)
 3775     {
 3776     case OP_END:
 3777     return;
 3778 
 3779     case OP_TYPESTAR:
 3780     case OP_TYPEMINSTAR:
 3781     case OP_TYPEPLUS:
 3782     case OP_TYPEMINPLUS:
 3783     case OP_TYPEQUERY:
 3784     case OP_TYPEMINQUERY:
 3785     case OP_TYPEPOSSTAR:
 3786     case OP_TYPEPOSPLUS:
 3787     case OP_TYPEPOSQUERY:
 3788     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
 3789     break;
 3790 
 3791     case OP_TYPEUPTO:
 3792     case OP_TYPEMINUPTO:
 3793     case OP_TYPEEXACT:
 3794     case OP_TYPEPOSUPTO:
 3795     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
 3796       code += 2;
 3797     break;
 3798 
 3799 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 3800     case OP_XCLASS:
 3801     code += GET(code, 1);
 3802     break;
 3803 #endif
 3804 
 3805     case OP_MARK:
 3806     case OP_PRUNE_ARG:
 3807     case OP_SKIP_ARG:
 3808     case OP_THEN_ARG:
 3809     code += code[1];
 3810     break;
 3811     }
 3812 
 3813   /* Add in the fixed length from the table */
 3814 
 3815   code += PRIV(OP_lengths)[c];
 3816 
 3817   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
 3818   a multi-byte character. The length in the table is a minimum, so we have to
 3819   arrange to skip the extra bytes. */
 3820 
 3821 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 3822   if (utf) switch(c)
 3823     {
 3824     case OP_CHAR:
 3825     case OP_CHARI:
 3826     case OP_NOT:
 3827     case OP_NOTI:
 3828     case OP_STAR:
 3829     case OP_MINSTAR:
 3830     case OP_PLUS:
 3831     case OP_MINPLUS:
 3832     case OP_QUERY:
 3833     case OP_MINQUERY:
 3834     case OP_UPTO:
 3835     case OP_MINUPTO:
 3836     case OP_EXACT:
 3837     case OP_POSSTAR:
 3838     case OP_POSPLUS:
 3839     case OP_POSQUERY:
 3840     case OP_POSUPTO:
 3841     case OP_STARI:
 3842     case OP_MINSTARI:
 3843     case OP_PLUSI:
 3844     case OP_MINPLUSI:
 3845     case OP_QUERYI:
 3846     case OP_MINQUERYI:
 3847     case OP_UPTOI:
 3848     case OP_MINUPTOI:
 3849     case OP_EXACTI:
 3850     case OP_POSSTARI:
 3851     case OP_POSPLUSI:
 3852     case OP_POSQUERYI:
 3853     case OP_POSUPTOI:
 3854     case OP_NOTSTAR:
 3855     case OP_NOTMINSTAR:
 3856     case OP_NOTPLUS:
 3857     case OP_NOTMINPLUS:
 3858     case OP_NOTQUERY:
 3859     case OP_NOTMINQUERY:
 3860     case OP_NOTUPTO:
 3861     case OP_NOTMINUPTO:
 3862     case OP_NOTEXACT:
 3863     case OP_NOTPOSSTAR:
 3864     case OP_NOTPOSPLUS:
 3865     case OP_NOTPOSQUERY:
 3866     case OP_NOTPOSUPTO:
 3867     case OP_NOTSTARI:
 3868     case OP_NOTMINSTARI:
 3869     case OP_NOTPLUSI:
 3870     case OP_NOTMINPLUSI:
 3871     case OP_NOTQUERYI:
 3872     case OP_NOTMINQUERYI:
 3873     case OP_NOTUPTOI:
 3874     case OP_NOTMINUPTOI:
 3875     case OP_NOTEXACTI:
 3876     case OP_NOTPOSSTARI:
 3877     case OP_NOTPOSPLUSI:
 3878     case OP_NOTPOSQUERYI:
 3879     case OP_NOTPOSUPTOI:
 3880     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
 3881     break;
 3882     }
 3883 #else
 3884   (void)(utf);  /* Keep compiler happy by referencing function argument */
 3885 #endif
 3886   }
 3887 }
 3888 
 3889 
 3890 
 3891 /*************************************************
 3892 *           Check for POSIX class syntax         *
 3893 *************************************************/
 3894 
 3895 /* This function is called when the sequence "[:" or "[." or "[=" is
 3896 encountered in a character class. It checks whether this is followed by a
 3897 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
 3898 reach an unescaped ']' without the special preceding character, return FALSE.
 3899 
 3900 Originally, this function only recognized a sequence of letters between the
 3901 terminators, but it seems that Perl recognizes any sequence of characters,
 3902 though of course unknown POSIX names are subsequently rejected. Perl gives an
 3903 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
 3904 didn't consider this to be a POSIX class. Likewise for [:1234:].
 3905 
 3906 The problem in trying to be exactly like Perl is in the handling of escapes. We
 3907 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
 3908 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
 3909 below handles the special cases \\ and \], but does not try to do any other
 3910 escape processing. This makes it different from Perl for cases such as
 3911 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
 3912 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
 3913 when Perl does, I think.
 3914 
 3915 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
 3916 It seems that the appearance of a nested POSIX class supersedes an apparent
 3917 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
 3918 a digit.
 3919 
 3920 In Perl, unescaped square brackets may also appear as part of class names. For
 3921 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
 3922 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
 3923 seem right at all. PCRE does not allow closing square brackets in POSIX class
 3924 names.
 3925 
 3926 Arguments:
 3927   ptr      pointer to the initial [
 3928   endptr   where to return the end pointer
 3929 
 3930 Returns:   TRUE or FALSE
 3931 */
 3932 
 3933 static BOOL
 3934 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
 3935 {
 3936 pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
 3937 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
 3938 for (++ptr; *ptr != CHAR_NULL; ptr++)
 3939   {
 3940   if (*ptr == CHAR_BACKSLASH &&
 3941       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
 3942        ptr[1] == CHAR_BACKSLASH))
 3943     ptr++;
 3944   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
 3945             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
 3946   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
 3947     {
 3948     *endptr = ptr;
 3949     return TRUE;
 3950     }
 3951   }
 3952 return FALSE;
 3953 }
 3954 
 3955 
 3956 
 3957 
 3958 /*************************************************
 3959 *          Check POSIX class name                *
 3960 *************************************************/
 3961 
 3962 /* This function is called to check the name given in a POSIX-style class entry
 3963 such as [:alnum:].
 3964 
 3965 Arguments:
 3966   ptr        points to the first letter
 3967   len        the length of the name
 3968 
 3969 Returns:     a value representing the name, or -1 if unknown
 3970 */
 3971 
 3972 static int
 3973 check_posix_name(const pcre_uchar *ptr, int len)
 3974 {
 3975 const char *pn = posix_names;
 3976 register int yield = 0;
 3977 while (posix_name_lengths[yield] != 0)
 3978   {
 3979   if (len == posix_name_lengths[yield] &&
 3980     STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
 3981   pn += posix_name_lengths[yield] + 1;
 3982   yield++;
 3983   }
 3984 return -1;
 3985 }
 3986 
 3987 
 3988 /*************************************************
 3989 *    Adjust OP_RECURSE items in repeated group   *
 3990 *************************************************/
 3991 
 3992 /* OP_RECURSE items contain an offset from the start of the regex to the group
 3993 that is referenced. This means that groups can be replicated for fixed
 3994 repetition simply by copying (because the recursion is allowed to refer to
 3995 earlier groups that are outside the current group). However, when a group is
 3996 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
 3997 inserted before it, after it has been compiled. This means that any OP_RECURSE
 3998 items within it that refer to the group itself or any contained groups have to
 3999 have their offsets adjusted. That one of the jobs of this function. Before it
 4000 is called, the partially compiled regex must be temporarily terminated with
 4001 OP_END.
 4002 
 4003 This function has been extended to cope with forward references for recursions
 4004 and subroutine calls. It must check the list of such references for the
 4005 group we are dealing with. If it finds that one of the recursions in the
 4006 current group is on this list, it does not adjust the value in the reference
 4007 (which is a group number). After the group has been scanned, all the offsets in
 4008 the forward reference list for the group are adjusted.
 4009 
 4010 Arguments:
 4011   group      points to the start of the group
 4012   adjust     the amount by which the group is to be moved
 4013   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
 4014   cd         contains pointers to tables etc.
 4015   save_hwm_offset   the hwm forward reference offset at the start of the group
 4016 
 4017 Returns:     nothing
 4018 */
 4019 
 4020 static void
 4021 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
 4022   size_t save_hwm_offset)
 4023 {
 4024 int offset;
 4025 pcre_uchar *hc;
 4026 pcre_uchar *ptr = group;
 4027 
 4028 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
 4029   {
 4030   for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
 4031        hc += LINK_SIZE)
 4032     {
 4033     offset = (int)GET(hc, 0);
 4034     if (cd->start_code + offset == ptr + 1) break;
 4035     }
 4036 
 4037   /* If we have not found this recursion on the forward reference list, adjust
 4038   the recursion's offset if it's after the start of this group. */
 4039 
 4040   if (hc >= cd->hwm)
 4041     {
 4042     offset = (int)GET(ptr, 1);
 4043     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
 4044     }
 4045 
 4046   ptr += 1 + LINK_SIZE;
 4047   }
 4048 
 4049 /* Now adjust all forward reference offsets for the group. */
 4050 
 4051 for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
 4052      hc += LINK_SIZE)
 4053   {
 4054   offset = (int)GET(hc, 0);
 4055   PUT(hc, 0, offset + adjust);
 4056   }
 4057 }
 4058 
 4059 
 4060 
 4061 /*************************************************
 4062 *        Insert an automatic callout point       *
 4063 *************************************************/
 4064 
 4065 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
 4066 callout points before each pattern item.
 4067 
 4068 Arguments:
 4069   code           current code pointer
 4070   ptr            current pattern pointer
 4071   cd             pointers to tables etc
 4072 
 4073 Returns:         new code pointer
 4074 */
 4075 
 4076 static pcre_uchar *
 4077 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
 4078 {
 4079 *code++ = OP_CALLOUT;
 4080 *code++ = 255;
 4081 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
 4082 PUT(code, LINK_SIZE, 0);                       /* Default length */
 4083 return code + 2 * LINK_SIZE;
 4084 }
 4085 
 4086 
 4087 
 4088 /*************************************************
 4089 *         Complete a callout item                *
 4090 *************************************************/
 4091 
 4092 /* A callout item contains the length of the next item in the pattern, which
 4093 we can't fill in till after we have reached the relevant point. This is used
 4094 for both automatic and manual callouts.
 4095 
 4096 Arguments:
 4097   previous_callout   points to previous callout item
 4098   ptr                current pattern pointer
 4099   cd                 pointers to tables etc
 4100 
 4101 Returns:             nothing
 4102 */
 4103 
 4104 static void
 4105 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
 4106 {
 4107 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
 4108 PUT(previous_callout, 2 + LINK_SIZE, length);
 4109 }
 4110 
 4111 
 4112 
 4113 #ifdef SUPPORT_UCP
 4114 /*************************************************
 4115 *           Get othercase range                  *
 4116 *************************************************/
 4117 
 4118 /* This function is passed the start and end of a class range, in UTF-8 mode
 4119 with UCP support. It searches up the characters, looking for ranges of
 4120 characters in the "other" case. Each call returns the next one, updating the
 4121 start address. A character with multiple other cases is returned on its own
 4122 with a special return value.
 4123 
 4124 Arguments:
 4125   cptr        points to starting character value; updated
 4126   d           end value
 4127   ocptr       where to put start of othercase range
 4128   odptr       where to put end of othercase range
 4129 
 4130 Yield:        -1 when no more
 4131                0 when a range is returned
 4132               >0 the CASESET offset for char with multiple other cases
 4133                 in this case, ocptr contains the original
 4134 */
 4135 
 4136 static int
 4137 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
 4138   pcre_uint32 *odptr)
 4139 {
 4140 pcre_uint32 c, othercase, next;
 4141 unsigned int co;
 4142 
 4143 /* Find the first character that has an other case. If it has multiple other
 4144 cases, return its case offset value. */
 4145 
 4146 for (c = *cptr; c <= d; c++)
 4147   {
 4148   if ((co = UCD_CASESET(c)) != 0)
 4149     {
 4150     *ocptr = c++;   /* Character that has the set */
 4151     *cptr = c;      /* Rest of input range */
 4152     return (int)co;
 4153     }
 4154   if ((othercase = UCD_OTHERCASE(c)) != c) break;
 4155   }
 4156 
 4157 if (c > d) return -1;  /* Reached end of range */
 4158 
 4159 /* Found a character that has a single other case. Search for the end of the
 4160 range, which is either the end of the input range, or a character that has zero
 4161 or more than one other cases. */
 4162 
 4163 *ocptr = othercase;
 4164 next = othercase + 1;
 4165 
 4166 for (++c; c <= d; c++)
 4167   {
 4168   if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
 4169   next++;
 4170   }
 4171 
 4172 *odptr = next - 1;     /* End of othercase range */
 4173 *cptr = c;             /* Rest of input range */
 4174 return 0;
 4175 }
 4176 #endif  /* SUPPORT_UCP */
 4177 
 4178 
 4179 
 4180 /*************************************************
 4181 *        Add a character or range to a class     *
 4182 *************************************************/
 4183 
 4184 /* This function packages up the logic of adding a character or range of
 4185 characters to a class. The character values in the arguments will be within the
 4186 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
 4187 mutually recursive with the function immediately below.
 4188 
 4189 Arguments:
 4190   classbits     the bit map for characters < 256
 4191   uchardptr     points to the pointer for extra data
 4192   options       the options word
 4193   cd            contains pointers to tables etc.
 4194   start         start of range character
 4195   end           end of range character
 4196 
 4197 Returns:        the number of < 256 characters added
 4198                 the pointer to extra data is updated
 4199 */
 4200 
 4201 static int
 4202 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
 4203   compile_data *cd, pcre_uint32 start, pcre_uint32 end)
 4204 {
 4205 pcre_uint32 c;
 4206 pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
 4207 int n8 = 0;
 4208 
 4209 /* If caseless matching is required, scan the range and process alternate
 4210 cases. In Unicode, there are 8-bit characters that have alternate cases that
 4211 are greater than 255 and vice-versa. Sometimes we can just extend the original
 4212 range. */
 4213 
 4214 if ((options & PCRE_CASELESS) != 0)
 4215   {
 4216 #ifdef SUPPORT_UCP
 4217   if ((options & PCRE_UTF8) != 0)
 4218     {
 4219     int rc;
 4220     pcre_uint32 oc, od;
 4221 
 4222     options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
 4223     c = start;
 4224 
 4225     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
 4226       {
 4227       /* Handle a single character that has more than one other case. */
 4228 
 4229       if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
 4230         PRIV(ucd_caseless_sets) + rc, oc);
 4231 
 4232       /* Do nothing if the other case range is within the original range. */
 4233 
 4234       else if (oc >= start && od <= end) continue;
 4235 
 4236       /* Extend the original range if there is overlap, noting that if oc < c, we
 4237       can't have od > end because a subrange is always shorter than the basic
 4238       range. Otherwise, use a recursive call to add the additional range. */
 4239 
 4240       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
 4241       else if (od > end && oc <= end + 1)
 4242         {
 4243         end = od;       /* Extend upwards */
 4244         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
 4245         }
 4246       else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
 4247       }
 4248     }
 4249   else
 4250 #endif  /* SUPPORT_UCP */
 4251 
 4252   /* Not UTF-mode, or no UCP */
 4253 
 4254   for (c = start; c <= classbits_end; c++)
 4255     {
 4256     SETBIT(classbits, cd->fcc[c]);
 4257     n8++;
 4258     }
 4259   }
 4260 
 4261 /* Now handle the original range. Adjust the final value according to the bit
 4262 length - this means that the same lists of (e.g.) horizontal spaces can be used
 4263 in all cases. */
 4264 
 4265 #if defined COMPILE_PCRE8
 4266 #ifdef SUPPORT_UTF
 4267   if ((options & PCRE_UTF8) == 0)
 4268 #endif
 4269   if (end > 0xff) end = 0xff;
 4270 
 4271 #elif defined COMPILE_PCRE16
 4272 #ifdef SUPPORT_UTF
 4273   if ((options & PCRE_UTF16) == 0)
 4274 #endif
 4275   if (end > 0xffff) end = 0xffff;
 4276 
 4277 #endif /* COMPILE_PCRE[8|16] */
 4278 
 4279 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
 4280 
 4281 for (c = start; c <= classbits_end; c++)
 4282   {
 4283   /* Regardless of start, c will always be <= 255. */
 4284   SETBIT(classbits, c);
 4285   n8++;
 4286   }
 4287 
 4288 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 4289 if (start <= 0xff) start = 0xff + 1;
 4290 
 4291 if (end >= start)
 4292   {
 4293   pcre_uchar *uchardata = *uchardptr;
 4294 #ifdef SUPPORT_UTF
 4295   if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
 4296     {
 4297     if (start < end)
 4298       {
 4299       *uchardata++ = XCL_RANGE;
 4300       uchardata += PRIV(ord2utf)(start, uchardata);
 4301       uchardata += PRIV(ord2utf)(end, uchardata);
 4302       }
 4303     else if (start == end)
 4304       {
 4305       *uchardata++ = XCL_SINGLE;
 4306       uchardata += PRIV(ord2utf)(start, uchardata);
 4307       }
 4308     }
 4309   else
 4310 #endif  /* SUPPORT_UTF */
 4311 
 4312   /* Without UTF support, character values are constrained by the bit length,
 4313   and can only be > 256 for 16-bit and 32-bit libraries. */
 4314 
 4315 #ifdef COMPILE_PCRE8
 4316     {}
 4317 #else
 4318   if (start < end)
 4319     {
 4320     *uchardata++ = XCL_RANGE;
 4321     *uchardata++ = start;
 4322     *uchardata++ = end;
 4323     }
 4324   else if (start == end)
 4325     {
 4326     *uchardata++ = XCL_SINGLE;
 4327     *uchardata++ = start;
 4328     }
 4329 #endif
 4330 
 4331   *uchardptr = uchardata;   /* Updata extra data pointer */
 4332   }
 4333 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
 4334 
 4335 return n8;    /* Number of 8-bit characters */
 4336 }
 4337 
 4338 
 4339 
 4340 
 4341 /*************************************************
 4342 *        Add a list of characters to a class     *
 4343 *************************************************/
 4344 
 4345 /* This function is used for adding a list of case-equivalent characters to a
 4346 class, and also for adding a list of horizontal or vertical whitespace. If the
 4347 list is in order (which it should be), ranges of characters are detected and
 4348 handled appropriately. This function is mutually recursive with the function
 4349 above.
 4350 
 4351 Arguments:
 4352   classbits     the bit map for characters < 256
 4353   uchardptr     points to the pointer for extra data
 4354   options       the options word
 4355   cd            contains pointers to tables etc.
 4356   p             points to row of 32-bit values, terminated by NOTACHAR
 4357   except        character to omit; this is used when adding lists of
 4358                   case-equivalent characters to avoid including the one we
 4359                   already know about
 4360 
 4361 Returns:        the number of < 256 characters added
 4362                 the pointer to extra data is updated
 4363 */
 4364 
 4365 static int
 4366 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
 4367   compile_data *cd, const pcre_uint32 *p, unsigned int except)
 4368 {
 4369 int n8 = 0;
 4370 while (p[0] < NOTACHAR)
 4371   {
 4372   int n = 0;
 4373   if (p[0] != except)
 4374     {
 4375     while(p[n+1] == p[0] + n + 1) n++;
 4376     n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
 4377     }
 4378   p += n + 1;
 4379   }
 4380 return n8;
 4381 }
 4382 
 4383 
 4384 
 4385 /*************************************************
 4386 *    Add characters not in a list to a class     *
 4387 *************************************************/
 4388 
 4389 /* This function is used for adding the complement of a list of horizontal or
 4390 vertical whitespace to a class. The list must be in order.
 4391 
 4392 Arguments:
 4393   classbits     the bit map for characters < 256
 4394   uchardptr     points to the pointer for extra data
 4395   options       the options word
 4396   cd            contains pointers to tables etc.
 4397   p             points to row of 32-bit values, terminated by NOTACHAR
 4398 
 4399 Returns:        the number of < 256 characters added
 4400                 the pointer to extra data is updated
 4401 */
 4402 
 4403 static int
 4404 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
 4405   int options, compile_data *cd, const pcre_uint32 *p)
 4406 {
 4407 BOOL utf = (options & PCRE_UTF8) != 0;
 4408 int n8 = 0;
 4409 if (p[0] > 0)
 4410   n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
 4411 while (p[0] < NOTACHAR)
 4412   {
 4413   while (p[1] == p[0] + 1) p++;
 4414   n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
 4415     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
 4416   p++;
 4417   }
 4418 return n8;
 4419 }
 4420 
 4421 
 4422 
 4423 /*************************************************
 4424 *           Compile one branch                   *
 4425 *************************************************/
 4426 
 4427 /* Scan the pattern, compiling it into a vector. If the options are
 4428 changed during the branch, the pointer is used to change the external options
 4429 bits. This function is used during the pre-compile phase when we are trying
 4430 to find out the amount of memory needed, as well as during the real compile
 4431 phase. The value of lengthptr distinguishes the two phases.
 4432 
 4433 Arguments:
 4434   optionsptr        pointer to the option bits
 4435   codeptr           points to the pointer to the current code point
 4436   ptrptr            points to the current pattern pointer
 4437   errorcodeptr      points to error code variable
 4438   firstcharptr      place to put the first required character
 4439   firstcharflagsptr place to put the first character flags, or a negative number
 4440   reqcharptr        place to put the last required character
 4441   reqcharflagsptr   place to put the last required character flags, or a negative number
 4442   bcptr             points to current branch chain
 4443   cond_depth        conditional nesting depth
 4444   cd                contains pointers to tables etc.
 4445   lengthptr         NULL during the real compile phase
 4446                     points to length accumulator during pre-compile phase
 4447 
 4448 Returns:            TRUE on success
 4449                     FALSE, with *errorcodeptr set non-zero on error
 4450 */
 4451 
 4452 static BOOL
 4453 compile_branch(int *optionsptr, pcre_uchar **codeptr,
 4454   const pcre_uchar **ptrptr, int *errorcodeptr,
 4455   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
 4456   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
 4457   branch_chain *bcptr, int cond_depth,
 4458   compile_data *cd, int *lengthptr)
 4459 {
 4460 int repeat_type, op_type;
 4461 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
 4462 int bravalue = 0;
 4463 int greedy_default, greedy_non_default;
 4464 pcre_uint32 firstchar, reqchar;
 4465 pcre_int32 firstcharflags, reqcharflags;
 4466 pcre_uint32 zeroreqchar, zerofirstchar;
 4467 pcre_int32 zeroreqcharflags, zerofirstcharflags;
 4468 pcre_int32 req_caseopt, reqvary, tempreqvary;
 4469 int options = *optionsptr;               /* May change dynamically */
 4470 int after_manual_callout = 0;
 4471 int length_prevgroup = 0;
 4472 register pcre_uint32 c;
 4473 int escape;
 4474 register pcre_uchar *code = *codeptr;
 4475 pcre_uchar *last_code = code;
 4476 pcre_uchar *orig_code = code;
 4477 pcre_uchar *tempcode;
 4478 BOOL inescq = FALSE;
 4479 BOOL groupsetfirstchar = FALSE;
 4480 const pcre_uchar *ptr = *ptrptr;
 4481 const pcre_uchar *tempptr;
 4482 const pcre_uchar *nestptr = NULL;
 4483 pcre_uchar *previous = NULL;
 4484 pcre_uchar *previous_callout = NULL;
 4485 size_t item_hwm_offset = 0;
 4486 pcre_uint8 classbits[32];
 4487 
 4488 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
 4489 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
 4490 dynamically as we process the pattern. */
 4491 
 4492 #ifdef SUPPORT_UTF
 4493 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
 4494 BOOL utf = (options & PCRE_UTF8) != 0;
 4495 #ifndef COMPILE_PCRE32
 4496 pcre_uchar utf_chars[6];
 4497 #endif
 4498 #else
 4499 BOOL utf = FALSE;
 4500 #endif
 4501 
 4502 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
 4503 class_uchardata always so that it can be passed to add_to_class() always,
 4504 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
 4505 alternative calls for the different cases. */
 4506 
 4507 pcre_uchar *class_uchardata;
 4508 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 4509 BOOL xclass;
 4510 pcre_uchar *class_uchardata_base;
 4511 #endif
 4512 
 4513 #ifdef PCRE_DEBUG
 4514 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
 4515 #endif
 4516 
 4517 /* Set up the default and non-default settings for greediness */
 4518 
 4519 greedy_default = ((options & PCRE_UNGREEDY) != 0);
 4520 greedy_non_default = greedy_default ^ 1;
 4521 
 4522 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
 4523 matching encountered yet". It gets changed to REQ_NONE if we hit something that
 4524 matches a non-fixed char first char; reqchar just remains unset if we never
 4525 find one.
 4526 
 4527 When we hit a repeat whose minimum is zero, we may have to adjust these values
 4528 to take the zero repeat into account. This is implemented by setting them to
 4529 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
 4530 item types that can be repeated set these backoff variables appropriately. */
 4531 
 4532 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
 4533 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
 4534 
 4535 /* The variable req_caseopt contains either the REQ_CASELESS value
 4536 or zero, according to the current setting of the caseless flag. The
 4537 REQ_CASELESS leaves the lower 28 bits empty. It is added into the
 4538 firstchar or reqchar variables to record the case status of the
 4539 value. This is used only for ASCII characters. */
 4540 
 4541 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
 4542 
 4543 /* Switch on next character until the end of the branch */
 4544 
 4545 for (;; ptr++)
 4546   {
 4547   BOOL negate_class;
 4548   BOOL should_flip_negation;
 4549   BOOL possessive_quantifier;
 4550   BOOL is_quantifier;
 4551   BOOL is_recurse;
 4552   BOOL reset_bracount;
 4553   int class_has_8bitchar;
 4554   int class_one_char;
 4555 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 4556   BOOL xclass_has_prop;
 4557 #endif
 4558   int newoptions;
 4559   int recno;
 4560   int refsign;
 4561   int skipbytes;
 4562   pcre_uint32 subreqchar, subfirstchar;
 4563   pcre_int32 subreqcharflags, subfirstcharflags;
 4564   int terminator;
 4565   unsigned int mclength;
 4566   unsigned int tempbracount;
 4567   pcre_uint32 ec;
 4568   pcre_uchar mcbuffer[8];
 4569 
 4570   /* Come here to restart the loop without advancing the pointer. */
 4571 
 4572   REDO_LOOP:
 4573 
 4574   /* Get next character in the pattern */
 4575 
 4576   c = *ptr;
 4577 
 4578   /* If we are at the end of a nested substitution, revert to the outer level
 4579   string. Nesting only happens one level deep. */
 4580 
 4581   if (c == CHAR_NULL && nestptr != NULL)
 4582     {
 4583     ptr = nestptr;
 4584     nestptr = NULL;
 4585     c = *ptr;
 4586     }
 4587 
 4588   /* If we are in the pre-compile phase, accumulate the length used for the
 4589   previous cycle of this loop. */
 4590 
 4591   if (lengthptr != NULL)
 4592     {
 4593 #ifdef PCRE_DEBUG
 4594     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
 4595 #endif
 4596     if (code > cd->start_workspace + cd->workspace_size -
 4597         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
 4598       {
 4599       *errorcodeptr = (code >= cd->start_workspace + cd->workspace_size)?
 4600         ERR52 : ERR87;
 4601       goto FAILED;
 4602       }
 4603 
 4604     /* There is at least one situation where code goes backwards: this is the
 4605     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
 4606     the class is simply eliminated. However, it is created first, so we have to
 4607     allow memory for it. Therefore, don't ever reduce the length at this point.
 4608     */
 4609 
 4610     if (code < last_code) code = last_code;
 4611 
 4612     /* Paranoid check for integer overflow */
 4613 
 4614     if (OFLOW_MAX - *lengthptr < code - last_code)
 4615       {
 4616       *errorcodeptr = ERR20;
 4617       goto FAILED;
 4618       }
 4619 
 4620     *lengthptr += (int)(code - last_code);
 4621     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
 4622       (int)(code - last_code), c, c));
 4623 
 4624     /* If "previous" is set and it is not at the start of the work space, move
 4625     it back to there, in order to avoid filling up the work space. Otherwise,
 4626     if "previous" is NULL, reset the current code pointer to the start. */
 4627 
 4628     if (previous != NULL)
 4629       {
 4630       if (previous > orig_code)
 4631         {
 4632         memmove(orig_code, previous, IN_UCHARS(code - previous));
 4633         code -= previous - orig_code;
 4634         previous = orig_code;
 4635         }
 4636       }
 4637     else code = orig_code;
 4638 
 4639     /* Remember where this code item starts so we can pick up the length
 4640     next time round. */
 4641 
 4642     last_code = code;
 4643     }
 4644 
 4645   /* In the real compile phase, just check the workspace used by the forward
 4646   reference list. */
 4647 
 4648   else if (cd->hwm > cd->start_workspace + cd->workspace_size)
 4649     {
 4650     *errorcodeptr = ERR52;
 4651     goto FAILED;
 4652     }
 4653 
 4654   /* If in \Q...\E, check for the end; if not, we have a literal. Otherwise an
 4655   isolated \E is ignored. */
 4656 
 4657   if (c != CHAR_NULL)
 4658     {
 4659     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
 4660       {
 4661       inescq = FALSE;
 4662       ptr++;
 4663       continue;
 4664       }
 4665     else if (inescq)
 4666       {
 4667       if (previous_callout != NULL)
 4668         {
 4669         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
 4670           complete_callout(previous_callout, ptr, cd);
 4671         previous_callout = NULL;
 4672         }
 4673       if ((options & PCRE_AUTO_CALLOUT) != 0)
 4674         {
 4675         previous_callout = code;
 4676         code = auto_callout(code, ptr, cd);
 4677         }
 4678       goto NORMAL_CHAR;
 4679       }
 4680 
 4681     /* Check for the start of a \Q...\E sequence. We must do this here rather
 4682     than later in case it is immediately followed by \E, which turns it into a
 4683     "do nothing" sequence. */
 4684 
 4685     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
 4686       {
 4687       inescq = TRUE;
 4688       ptr++;
 4689       continue;
 4690       }
 4691     }
 4692 
 4693   /* In extended mode, skip white space and comments. */
 4694 
 4695   if ((options & PCRE_EXTENDED) != 0)
 4696     {
 4697     const pcre_uchar *wscptr = ptr;
 4698     while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
 4699     if (c == CHAR_NUMBER_SIGN)
 4700       {
 4701       ptr++;
 4702       while (*ptr != CHAR_NULL)
 4703         {
 4704         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
 4705           {                          /* IS_NEWLINE sets cd->nllen. */
 4706           ptr += cd->nllen;
 4707           break;
 4708           }
 4709         ptr++;
 4710 #ifdef SUPPORT_UTF
 4711         if (utf) FORWARDCHAR(ptr);
 4712 #endif
 4713         }
 4714       }
 4715 
 4716     /* If we skipped any characters, restart the loop. Otherwise, we didn't see
 4717     a comment. */
 4718 
 4719     if (ptr > wscptr) goto REDO_LOOP;
 4720     }
 4721 
 4722   /* Skip over (?# comments. We need to do this here because we want to know if
 4723   the next thing is a quantifier, and these comments may come between an item
 4724   and its quantifier. */
 4725 
 4726   if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
 4727       ptr[2] == CHAR_NUMBER_SIGN)
 4728     {
 4729     ptr += 3;
 4730     while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
 4731     if (*ptr == CHAR_NULL)
 4732       {
 4733       *errorcodeptr = ERR18;
 4734       goto FAILED;
 4735       }
 4736     continue;
 4737     }
 4738 
 4739   /* See if the next thing is a quantifier. */
 4740 
 4741   is_quantifier =
 4742     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
 4743     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
 4744 
 4745   /* Fill in length of a previous callout, except when the next thing is a
 4746   quantifier or when processing a property substitution string in UCP mode. */
 4747 
 4748   if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
 4749        after_manual_callout-- <= 0)
 4750     {
 4751     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
 4752       complete_callout(previous_callout, ptr, cd);
 4753     previous_callout = NULL;
 4754     }
 4755 
 4756   /* Create auto callout, except for quantifiers, or while processing property
 4757   strings that are substituted for \w etc in UCP mode. */
 4758 
 4759   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
 4760     {
 4761     previous_callout = code;
 4762     code = auto_callout(code, ptr, cd);
 4763     }
 4764 
 4765   /* Process the next pattern item. */
 4766 
 4767   switch(c)
 4768     {
 4769     /* ===================================================================*/
 4770     case CHAR_NULL:                /* The branch terminates at string end */
 4771     case CHAR_VERTICAL_LINE:       /* or | or ) */
 4772     case CHAR_RIGHT_PARENTHESIS:
 4773     *firstcharptr = firstchar;
 4774     *firstcharflagsptr = firstcharflags;
 4775     *reqcharptr = reqchar;
 4776     *reqcharflagsptr = reqcharflags;
 4777     *codeptr = code;
 4778     *ptrptr = ptr;
 4779     if (lengthptr != NULL)
 4780       {
 4781       if (OFLOW_MAX - *lengthptr < code - last_code)
 4782         {
 4783         *errorcodeptr = ERR20;
 4784         goto FAILED;
 4785         }
 4786       *lengthptr += (int)(code - last_code);   /* To include callout length */
 4787       DPRINTF((">> end branch\n"));
 4788       }
 4789     return TRUE;
 4790 
 4791 
 4792     /* ===================================================================*/
 4793     /* Handle single-character metacharacters. In multiline mode, ^ disables
 4794     the setting of any following char as a first character. */
 4795 
 4796     case CHAR_CIRCUMFLEX_ACCENT:
 4797     previous = NULL;
 4798     if ((options & PCRE_MULTILINE) != 0)
 4799       {
 4800       if (firstcharflags == REQ_UNSET)
 4801         zerofirstcharflags = firstcharflags = REQ_NONE;
 4802       *code++ = OP_CIRCM;
 4803       }
 4804     else *code++ = OP_CIRC;
 4805     break;
 4806 
 4807     case CHAR_DOLLAR_SIGN:
 4808     previous = NULL;
 4809     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
 4810     break;
 4811 
 4812     /* There can never be a first char if '.' is first, whatever happens about
 4813     repeats. The value of reqchar doesn't change either. */
 4814 
 4815     case CHAR_DOT:
 4816     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
 4817     zerofirstchar = firstchar;
 4818     zerofirstcharflags = firstcharflags;
 4819     zeroreqchar = reqchar;
 4820     zeroreqcharflags = reqcharflags;
 4821     previous = code;
 4822     item_hwm_offset = cd->hwm - cd->start_workspace;
 4823     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
 4824     break;
 4825 
 4826 
 4827     /* ===================================================================*/
 4828     /* Character classes. If the included characters are all < 256, we build a
 4829     32-byte bitmap of the permitted characters, except in the special case
 4830     where there is only one such character. For negated classes, we build the
 4831     map as usual, then invert it at the end. However, we use a different opcode
 4832     so that data characters > 255 can be handled correctly.
 4833 
 4834     If the class contains characters outside the 0-255 range, a different
 4835     opcode is compiled. It may optionally have a bit map for characters < 256,
 4836     but those above are explicitly listed afterwards. A flag byte tells
 4837     whether the bitmap is present, and whether this is a negated class or not.
 4838 
 4839     In JavaScript compatibility mode, an isolated ']' causes an error. In
 4840     default (Perl) mode, it is treated as a data character. */
 4841 
 4842     case CHAR_RIGHT_SQUARE_BRACKET:
 4843     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
 4844       {
 4845       *errorcodeptr = ERR64;
 4846       goto FAILED;
 4847       }
 4848     goto NORMAL_CHAR;
 4849 
 4850     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
 4851     used for "start of word" and "end of word". As these are otherwise illegal
 4852     sequences, we don't break anything by recognizing them. They are replaced
 4853     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
 4854     erroneous and are handled by the normal code below. */
 4855 
 4856     case CHAR_LEFT_SQUARE_BRACKET:
 4857     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
 4858       {
 4859       nestptr = ptr + 7;
 4860       ptr = sub_start_of_word;
 4861       goto REDO_LOOP;
 4862       }
 4863 
 4864     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
 4865       {
 4866       nestptr = ptr + 7;
 4867       ptr = sub_end_of_word;
 4868       goto REDO_LOOP;
 4869       }
 4870 
 4871     /* Handle a real character class. */
 4872 
 4873     previous = code;
 4874     item_hwm_offset = cd->hwm - cd->start_workspace;
 4875 
 4876     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
 4877     they are encountered at the top level, so we'll do that too. */
 4878 
 4879     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
 4880          ptr[1] == CHAR_EQUALS_SIGN) &&
 4881         check_posix_syntax(ptr, &tempptr))
 4882       {
 4883       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
 4884       goto FAILED;
 4885       }
 4886 
 4887     /* If the first character is '^', set the negation flag and skip it. Also,
 4888     if the first few characters (either before or after ^) are \Q\E or \E we
 4889     skip them too. This makes for compatibility with Perl. */
 4890 
 4891     negate_class = FALSE;
 4892     for (;;)
 4893       {
 4894       c = *(++ptr);
 4895       if (c == CHAR_BACKSLASH)
 4896         {
 4897         if (ptr[1] == CHAR_E)
 4898           ptr++;
 4899         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
 4900           ptr += 3;
 4901         else
 4902           break;
 4903         }
 4904       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
 4905         negate_class = TRUE;
 4906       else break;
 4907       }
 4908 
 4909     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
 4910     an initial ']' is taken as a data character -- the code below handles
 4911     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
 4912     [^] must match any character, so generate OP_ALLANY. */
 4913 
 4914     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
 4915         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
 4916       {
 4917       *code++ = negate_class? OP_ALLANY : OP_FAIL;
 4918       if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
 4919       zerofirstchar = firstchar;
 4920       zerofirstcharflags = firstcharflags;
 4921       break;
 4922       }
 4923 
 4924     /* If a class contains a negative special such as \S, we need to flip the
 4925     negation flag at the end, so that support for characters > 255 works
 4926     correctly (they are all included in the class). */
 4927 
 4928     should_flip_negation = FALSE;
 4929 
 4930     /* Extended class (xclass) will be used when characters > 255
 4931     might match. */
 4932 
 4933 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 4934     xclass = FALSE;
 4935     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
 4936     class_uchardata_base = class_uchardata;   /* Save the start */
 4937 #endif
 4938 
 4939     /* For optimization purposes, we track some properties of the class:
 4940     class_has_8bitchar will be non-zero if the class contains at least one <
 4941     256 character; class_one_char will be 1 if the class contains just one
 4942     character; xclass_has_prop will be TRUE if unicode property checks
 4943     are present in the class. */
 4944 
 4945     class_has_8bitchar = 0;
 4946     class_one_char = 0;
 4947 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 4948     xclass_has_prop = FALSE;
 4949 #endif
 4950 
 4951     /* Initialize the 32-char bit map to all zeros. We build the map in a
 4952     temporary bit of memory, in case the class contains fewer than two
 4953     8-bit characters because in that case the compiled code doesn't use the bit
 4954     map. */
 4955 
 4956     memset(classbits, 0, 32 * sizeof(pcre_uint8));
 4957 
 4958     /* Process characters until ] is reached. By writing this as a "do" it
 4959     means that an initial ] is taken as a data character. At the start of the
 4960     loop, c contains the first byte of the character. */
 4961 
 4962     if (c != CHAR_NULL) do
 4963       {
 4964       const pcre_uchar *oldptr;
 4965 
 4966 #ifdef SUPPORT_UTF
 4967       if (utf && HAS_EXTRALEN(c))
 4968         {                           /* Braces are required because the */
 4969         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
 4970         }
 4971 #endif
 4972 
 4973 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 4974       /* In the pre-compile phase, accumulate the length of any extra
 4975       data and reset the pointer. This is so that very large classes that
 4976       contain a zillion > 255 characters no longer overwrite the work space
 4977       (which is on the stack). We have to remember that there was XCLASS data,
 4978       however. */
 4979 
 4980       if (class_uchardata > class_uchardata_base) xclass = TRUE;
 4981 
 4982       if (lengthptr != NULL && class_uchardata > class_uchardata_base)
 4983         {
 4984         *lengthptr += (int)(class_uchardata - class_uchardata_base);
 4985         class_uchardata = class_uchardata_base;
 4986         }
 4987 #endif
 4988 
 4989       /* Inside \Q...\E everything is literal except \E */
 4990 
 4991       if (inescq)
 4992         {
 4993         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
 4994           {
 4995           inescq = FALSE;                   /* Reset literal state */
 4996           ptr++;                            /* Skip the 'E' */
 4997           continue;                         /* Carry on with next */
 4998           }
 4999         goto CHECK_RANGE;                   /* Could be range if \E follows */
 5000         }
 5001 
 5002       /* Handle POSIX class names. Perl allows a negation extension of the
 5003       form [:^name:]. A square bracket that doesn't match the syntax is
 5004       treated as a literal. We also recognize the POSIX constructions
 5005       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
 5006       5.6 and 5.8 do. */
 5007 
 5008       if (c == CHAR_LEFT_SQUARE_BRACKET &&
 5009           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
 5010            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
 5011         {
 5012         BOOL local_negate = FALSE;
 5013         int posix_class, taboffset, tabopt;
 5014         register const pcre_uint8 *cbits = cd->cbits;
 5015         pcre_uint8 pbits[32];
 5016 
 5017         if (ptr[1] != CHAR_COLON)
 5018           {
 5019           *errorcodeptr = ERR31;
 5020           goto FAILED;
 5021           }
 5022 
 5023         ptr += 2;
 5024         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
 5025           {
 5026           local_negate = TRUE;
 5027           should_flip_negation = TRUE;  /* Note negative special */
 5028           ptr++;
 5029           }
 5030 
 5031         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
 5032         if (posix_class < 0)
 5033           {
 5034           *errorcodeptr = ERR30;
 5035           goto FAILED;
 5036           }
 5037 
 5038         /* If matching is caseless, upper and lower are converted to
 5039         alpha. This relies on the fact that the class table starts with
 5040         alpha, lower, upper as the first 3 entries. */
 5041 
 5042         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
 5043           posix_class = 0;
 5044 
 5045         /* When PCRE_UCP is set, some of the POSIX classes are converted to
 5046         different escape sequences that use Unicode properties \p or \P. Others
 5047         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
 5048         directly. */
 5049 
 5050 #ifdef SUPPORT_UCP
 5051         if ((options & PCRE_UCP) != 0)
 5052           {
 5053           unsigned int ptype = 0;
 5054           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
 5055 
 5056           /* The posix_substitutes table specifies which POSIX classes can be
 5057           converted to \p or \P items. */
 5058 
 5059           if (posix_substitutes[pc] != NULL)
 5060             {
 5061             nestptr = tempptr + 1;
 5062             ptr = posix_substitutes[pc] - 1;
 5063             continue;
 5064             }
 5065 
 5066           /* There are three other classes that generate special property calls
 5067           that are recognized only in an XCLASS. */
 5068 
 5069           else switch(posix_class)
 5070             {
 5071             case PC_GRAPH:
 5072             ptype = PT_PXGRAPH;
 5073             /* Fall through */
 5074             case PC_PRINT:
 5075             if (ptype == 0) ptype = PT_PXPRINT;
 5076             /* Fall through */
 5077             case PC_PUNCT:
 5078             if (ptype == 0) ptype = PT_PXPUNCT;
 5079             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
 5080             *class_uchardata++ = ptype;
 5081             *class_uchardata++ = 0;
 5082             xclass_has_prop = TRUE;
 5083             ptr = tempptr + 1;
 5084             continue;
 5085 
 5086             /* For the other POSIX classes (ascii, cntrl, xdigit) we are going
 5087             to fall through to the non-UCP case and build a bit map for
 5088             characters with code points less than 256. If we are in a negated
 5089             POSIX class, characters with code points greater than 255 must
 5090             either all match or all not match. In the special case where we
 5091             have not yet generated any xclass data, and this is the final item
 5092             in the overall class, we need do nothing: later on, the opcode
 5093             OP_NCLASS will be used to indicate that characters greater than 255
 5094             are acceptable. If we have already seen an xclass item or one may
 5095             follow (we have to assume that it might if this is not the end of
 5096             the class), explicitly list all wide codepoints, which will then
 5097             either not match or match, depending on whether the class is or is
 5098             not negated. */
 5099 
 5100             default:
 5101             if (local_negate &&
 5102                 (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
 5103               {
 5104               *class_uchardata++ = XCL_RANGE;
 5105               class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
 5106               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
 5107               }
 5108             break;
 5109             }
 5110           }
 5111 #endif
 5112         /* In the non-UCP case, or when UCP makes no difference, we build the
 5113         bit map for the POSIX class in a chunk of local store because we may be
 5114         adding and subtracting from it, and we don't want to subtract bits that
 5115         may be in the main map already. At the end we or the result into the
 5116         bit map that is being built. */
 5117 
 5118         posix_class *= 3;
 5119 
 5120         /* Copy in the first table (always present) */
 5121 
 5122         memcpy(pbits, cbits + posix_class_maps[posix_class],
 5123           32 * sizeof(pcre_uint8));
 5124 
 5125         /* If there is a second table, add or remove it as required. */
 5126 
 5127         taboffset = posix_class_maps[posix_class + 1];
 5128         tabopt = posix_class_maps[posix_class + 2];
 5129 
 5130         if (taboffset >= 0)
 5131           {
 5132           if (tabopt >= 0)
 5133             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
 5134           else
 5135             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
 5136           }
 5137 
 5138         /* Now see if we need to remove any special characters. An option
 5139         value of 1 removes vertical space and 2 removes underscore. */
 5140 
 5141         if (tabopt < 0) tabopt = -tabopt;
 5142         if (tabopt == 1) pbits[1] &= ~0x3c;
 5143           else if (tabopt == 2) pbits[11] &= 0x7f;
 5144 
 5145         /* Add the POSIX table or its complement into the main table that is
 5146         being built and we are done. */
 5147 
 5148         if (local_negate)
 5149           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
 5150         else
 5151           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
 5152 
 5153         ptr = tempptr + 1;
 5154         /* Every class contains at least one < 256 character. */
 5155         class_has_8bitchar = 1;
 5156         /* Every class contains at least two characters. */
 5157         class_one_char = 2;
 5158         continue;    /* End of POSIX syntax handling */
 5159         }
 5160 
 5161       /* Backslash may introduce a single character, or it may introduce one
 5162       of the specials, which just set a flag. The sequence \b is a special
 5163       case. Inside a class (and only there) it is treated as backspace. We
 5164       assume that other escapes have more than one character in them, so
 5165       speculatively set both class_has_8bitchar and class_one_char bigger
 5166       than one. Unrecognized escapes fall through and are either treated
 5167       as literal characters (by default), or are faulted if
 5168       PCRE_EXTRA is set. */
 5169 
 5170       if (c == CHAR_BACKSLASH)
 5171         {
 5172         escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
 5173           TRUE);
 5174         if (*errorcodeptr != 0) goto FAILED;
 5175         if (escape == 0) c = ec;
 5176         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
 5177         else if (escape == ESC_N)          /* \N is not supported in a class */
 5178           {
 5179           *errorcodeptr = ERR71;
 5180           goto FAILED;
 5181           }
 5182         else if (escape == ESC_Q)            /* Handle start of quoted string */
 5183           {
 5184           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
 5185             {
 5186             ptr += 2; /* avoid empty string */
 5187             }
 5188           else inescq = TRUE;
 5189           continue;
 5190           }
 5191         else if (escape == ESC_E) continue;  /* Ignore orphan \E */
 5192 
 5193         else
 5194           {
 5195           register const pcre_uint8 *cbits = cd->cbits;
 5196           /* Every class contains at least two < 256 characters. */
 5197           class_has_8bitchar++;
 5198           /* Every class contains at least two characters. */
 5199           class_one_char += 2;
 5200 
 5201           switch (escape)
 5202             {
 5203 #ifdef SUPPORT_UCP
 5204             case ESC_du:     /* These are the values given for \d etc */
 5205             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
 5206             case ESC_wu:     /* escape sequence with an appropriate \p */
 5207             case ESC_WU:     /* or \P to test Unicode properties instead */
 5208             case ESC_su:     /* of the default ASCII testing. */
 5209             case ESC_SU:
 5210             nestptr = ptr;
 5211             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
 5212             class_has_8bitchar--;                /* Undo! */
 5213             continue;
 5214 #endif
 5215             case ESC_d:
 5216             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
 5217             continue;
 5218 
 5219             case ESC_D:
 5220             should_flip_negation = TRUE;
 5221             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
 5222             continue;
 5223 
 5224             case ESC_w:
 5225             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
 5226             continue;
 5227 
 5228             case ESC_W:
 5229             should_flip_negation = TRUE;
 5230             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
 5231             continue;
 5232 
 5233             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
 5234             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
 5235             previously set by something earlier in the character class.
 5236             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
 5237             we could just adjust the appropriate bit. From PCRE 8.34 we no
 5238             longer treat \s and \S specially. */
 5239 
 5240             case ESC_s:
 5241             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
 5242             continue;
 5243 
 5244             case ESC_S:
 5245             should_flip_negation = TRUE;
 5246             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
 5247             continue;
 5248 
 5249             /* The rest apply in both UCP and non-UCP cases. */
 5250 
 5251             case ESC_h:
 5252             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
 5253               PRIV(hspace_list), NOTACHAR);
 5254             continue;
 5255 
 5256             case ESC_H:
 5257             (void)add_not_list_to_class(classbits, &class_uchardata, options,
 5258               cd, PRIV(hspace_list));
 5259             continue;
 5260 
 5261             case ESC_v:
 5262             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
 5263               PRIV(vspace_list), NOTACHAR);
 5264             continue;
 5265 
 5266             case ESC_V:
 5267             (void)add_not_list_to_class(classbits, &class_uchardata, options,
 5268               cd, PRIV(vspace_list));
 5269             continue;
 5270 
 5271             case ESC_p:
 5272             case ESC_P:
 5273 #ifdef SUPPORT_UCP
 5274               {
 5275               BOOL negated;
 5276               unsigned int ptype = 0, pdata = 0;
 5277               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
 5278                 goto FAILED;
 5279               *class_uchardata++ = ((escape == ESC_p) != negated)?
 5280                 XCL_PROP : XCL_NOTPROP;
 5281               *class_uchardata++ = ptype;
 5282               *class_uchardata++ = pdata;
 5283               xclass_has_prop = TRUE;
 5284               class_has_8bitchar--;                /* Undo! */
 5285               continue;
 5286               }
 5287 #else
 5288             *errorcodeptr = ERR45;
 5289             goto FAILED;
 5290 #endif
 5291             /* Unrecognized escapes are faulted if PCRE is running in its
 5292             strict mode. By default, for compatibility with Perl, they are
 5293             treated as literals. */
 5294 
 5295             default:
 5296             if ((options & PCRE_EXTRA) != 0)
 5297               {
 5298               *errorcodeptr = ERR7;
 5299               goto FAILED;
 5300               }
 5301             class_has_8bitchar--;    /* Undo the speculative increase. */
 5302             class_one_char -= 2;     /* Undo the speculative increase. */
 5303             c = *ptr;                /* Get the final character and fall through */
 5304             break;
 5305             }
 5306           }
 5307 
 5308         /* Fall through if the escape just defined a single character (c >= 0).
 5309         This may be greater than 256. */
 5310 
 5311         escape = 0;
 5312 
 5313         }   /* End of backslash handling */
 5314 
 5315       /* A character may be followed by '-' to form a range. However, Perl does
 5316       not permit ']' to be the end of the range. A '-' character at the end is
 5317       treated as a literal. Perl ignores orphaned \E sequences entirely. The
 5318       code for handling \Q and \E is messy. */
 5319 
 5320       CHECK_RANGE:
 5321       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
 5322         {
 5323         inescq = FALSE;
 5324         ptr += 2;
 5325         }
 5326       oldptr = ptr;
 5327 
 5328       /* Remember if \r or \n were explicitly used */
 5329 
 5330       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
 5331 
 5332       /* Check for range */
 5333 
 5334       if (!inescq && ptr[1] == CHAR_MINUS)
 5335         {
 5336         pcre_uint32 d;
 5337         ptr += 2;
 5338         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
 5339 
 5340         /* If we hit \Q (not followed by \E) at this point, go into escaped
 5341         mode. */
 5342 
 5343         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
 5344           {
 5345           ptr += 2;
 5346           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
 5347             { ptr += 2; continue; }
 5348           inescq = TRUE;
 5349           break;
 5350           }
 5351 
 5352         /* Minus (hyphen) at the end of a class is treated as a literal, so put
 5353         back the pointer and jump to handle the character that preceded it. */
 5354 
 5355         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
 5356           {
 5357           ptr = oldptr;
 5358           goto CLASS_SINGLE_CHARACTER;
 5359           }
 5360 
 5361         /* Otherwise, we have a potential range; pick up the next character */
 5362 
 5363 #ifdef SUPPORT_UTF
 5364         if (utf)
 5365           {                           /* Braces are required because the */
 5366           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
 5367           }
 5368         else
 5369 #endif
 5370         d = *ptr;  /* Not UTF-8 mode */
 5371 
 5372         /* The second part of a range can be a single-character escape
 5373         sequence, but not any of the other escapes. Perl treats a hyphen as a
 5374         literal in such circumstances. However, in Perl's warning mode, a
 5375         warning is given, so PCRE now faults it as it is almost certainly a
 5376         mistake on the user's part. */
 5377 
 5378         if (!inescq)
 5379           {
 5380           if (d == CHAR_BACKSLASH)
 5381             {
 5382             int descape;
 5383             descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
 5384             if (*errorcodeptr != 0) goto FAILED;
 5385 
 5386             /* 0 means a character was put into d; \b is backspace; any other
 5387             special causes an error. */
 5388 
 5389             if (descape != 0)
 5390               {
 5391               if (descape == ESC_b) d = CHAR_BS; else
 5392                 {
 5393                 *errorcodeptr = ERR83;
 5394                 goto FAILED;
 5395                 }
 5396               }
 5397             }
 5398 
 5399           /* A hyphen followed by a POSIX class is treated in the same way. */
 5400 
 5401           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
 5402                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
 5403                     ptr[1] == CHAR_EQUALS_SIGN) &&
 5404                    check_posix_syntax(ptr, &tempptr))
 5405             {
 5406             *errorcodeptr = ERR83;
 5407             goto FAILED;
 5408             }
 5409           }
 5410 
 5411         /* Check that the two values are in the correct order. Optimize
 5412         one-character ranges. */
 5413 
 5414         if (d < c)
 5415           {
 5416           *errorcodeptr = ERR8;
 5417           goto FAILED;
 5418           }
 5419         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
 5420 
 5421         /* We have found a character range, so single character optimizations
 5422         cannot be done anymore. Any value greater than 1 indicates that there
 5423         is more than one character. */
 5424 
 5425         class_one_char = 2;
 5426 
 5427         /* Remember an explicit \r or \n, and add the range to the class. */
 5428 
 5429         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
 5430 
 5431         class_has_8bitchar +=
 5432           add_to_class(classbits, &class_uchardata, options, cd, c, d);
 5433 
 5434         continue;   /* Go get the next char in the class */
 5435         }
 5436 
 5437       /* Handle a single character - we can get here for a normal non-escape
 5438       char, or after \ that introduces a single character or for an apparent
 5439       range that isn't. Only the value 1 matters for class_one_char, so don't
 5440       increase it if it is already 2 or more ... just in case there's a class
 5441       with a zillion characters in it. */
 5442 
 5443       CLASS_SINGLE_CHARACTER:
 5444       if (class_one_char < 2) class_one_char++;
 5445 
 5446       /* If xclass_has_prop is false and class_one_char is 1, we have the first
 5447       single character in the class, and there have been no prior ranges, or
 5448       XCLASS items generated by escapes. If this is the final character in the
 5449       class, we can optimize by turning the item into a 1-character OP_CHAR[I]
 5450       if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
 5451       can cause firstchar to be set. Otherwise, there can be no first char if
 5452       this item is first, whatever repeat count may follow. In the case of
 5453       reqchar, save the previous value for reinstating. */
 5454 
 5455       if (!inescq &&
 5456 #ifdef SUPPORT_UCP
 5457           !xclass_has_prop &&
 5458 #endif
 5459           class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
 5460         {
 5461         ptr++;
 5462         zeroreqchar = reqchar;
 5463         zeroreqcharflags = reqcharflags;
 5464 
 5465         if (negate_class)
 5466           {
 5467 #ifdef SUPPORT_UCP
 5468           int d;
 5469 #endif
 5470           if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
 5471           zerofirstchar = firstchar;
 5472           zerofirstcharflags = firstcharflags;
 5473 
 5474           /* For caseless UTF-8 mode when UCP support is available, check
 5475           whether this character has more than one other case. If so, generate
 5476           a special OP_NOTPROP item instead of OP_NOTI. */
 5477 
 5478 #ifdef SUPPORT_UCP
 5479           if (utf && (options & PCRE_CASELESS) != 0 &&
 5480               (d = UCD_CASESET(c)) != 0)
 5481             {
 5482             *code++ = OP_NOTPROP;
 5483             *code++ = PT_CLIST;
 5484             *code++ = d;
 5485             }
 5486           else
 5487 #endif
 5488           /* Char has only one other case, or UCP not available */
 5489 
 5490             {
 5491             *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
 5492 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 5493             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
 5494               code += PRIV(ord2utf)(c, code);
 5495             else
 5496 #endif
 5497               *code++ = c;
 5498             }
 5499 
 5500           /* We are finished with this character class */
 5501 
 5502           goto END_CLASS;
 5503           }
 5504 
 5505         /* For a single, positive character, get the value into mcbuffer, and
 5506         then we can handle this with the normal one-character code. */
 5507 
 5508 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 5509         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
 5510           mclength = PRIV(ord2utf)(c, mcbuffer);
 5511         else
 5512 #endif
 5513           {
 5514           mcbuffer[0] = c;
 5515           mclength = 1;
 5516           }
 5517         goto ONE_CHAR;
 5518         }       /* End of 1-char optimization */
 5519 
 5520       /* There is more than one character in the class, or an XCLASS item
 5521       has been generated. Add this character to the class. */
 5522 
 5523       class_has_8bitchar +=
 5524         add_to_class(classbits, &class_uchardata, options, cd, c, c);
 5525       }