"Fossies" - the Fresh Open Source Software Archive

Member "AutoHotkey_L-1.1.33.09/source/lib_pcre/pcre/pcre_compile.c" (8 May 2021, 270097 Bytes) of package /windows/misc/AutoHotkey_L-1.1.33.09.zip:


As a special service, "Fossies" has tried to format the requested source page as HTML, using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "pcre_compile.c", see the Fossies "Dox" file reference documentation.

    1 /*************************************************
    2 *      Perl-Compatible Regular Expressions       *
    3 *************************************************/
    4 
    5 /* PCRE is a library of functions to support regular expressions whose syntax
    6 and semantics are as close as possible to those of the Perl 5 language.
    7 
    8                        Written by Philip Hazel
    9            Copyright (c) 1997-2012 University of Cambridge
   10 
   11 -----------------------------------------------------------------------------
   12 Redistribution and use in source and binary forms, with or without
   13 modification, are permitted provided that the following conditions are met:
   14 
   15     * Redistributions of source code must retain the above copyright notice,
   16       this list of conditions and the following disclaimer.
   17 
   18     * Redistributions in binary form must reproduce the above copyright
   19       notice, this list of conditions and the following disclaimer in the
   20       documentation and/or other materials provided with the distribution.
   21 
   22     * Neither the name of the University of Cambridge nor the names of its
   23       contributors may be used to endorse or promote products derived from
   24       this software without specific prior written permission.
   25 
   26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   36 POSSIBILITY OF SUCH DAMAGE.
   37 -----------------------------------------------------------------------------
   38 */
   39 
   40 
   41 /* This module contains the external function pcre_compile(), along with
   42 supporting internal functions that are not used by other modules. */
   43 
   44 
   45 #ifdef HAVE_CONFIG_H
   46 #include "config.h"
   47 #endif
   48 
   49 #define NLBLOCK cd             /* Block containing newline information */
   50 #define PSSTART start_pattern  /* Field containing processed string start */
   51 #define PSEND   end_pattern    /* Field containing processed string end */
   52 
   53 #include "pcre_internal.h"
   54 
   55 
   56 /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
   57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
   58 library. We do not need to select pcre16_printint.c specially, because the
   59 COMPILE_PCREx macro will already be appropriately set. */
   60 
   61 #ifdef PCRE_DEBUG
   62 /* pcre_printint.c should not include any headers */
   63 #define PCRE_INCLUDED
   64 #include "pcre_printint.c"
   65 #undef PCRE_INCLUDED
   66 #endif
   67 
   68 
   69 /* Macro for setting individual bits in class bitmaps. */
   70 
   71 #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
   72 
   73 /* Maximum length value to check against when making sure that the integer that
   74 holds the compiled pattern length does not overflow. We make it a bit less than
   75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
   76 to check them every time. */
   77 
   78 #define OFLOW_MAX (INT_MAX - 20)
   79 
   80 
   81 /*************************************************
   82 *      Code parameters and static tables         *
   83 *************************************************/
   84 
   85 /* This value specifies the size of stack workspace that is used during the
   86 first pre-compile phase that determines how much memory is required. The regex
   87 is partly compiled into this space, but the compiled parts are discarded as
   88 soon as they can be, so that hopefully there will never be an overrun. The code
   89 does, however, check for an overrun. The largest amount I've seen used is 218,
   90 so this number is very generous.
   91 
   92 The same workspace is used during the second, actual compile phase for
   93 remembering forward references to groups so that they can be filled in at the
   94 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
   95 is 4 there is plenty of room for most patterns. However, the memory can get
   96 filled up by repetitions of forward references, for example patterns like
   97 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
   98 that the workspace is expanded using malloc() in this situation. The value
   99 below is therefore a minimum, and we put a maximum on it for safety. The
  100 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
  101 kicks in at the same number of forward references in all cases. */
  102 
  103 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
  104 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
  105 
  106 /* The overrun tests check for a slightly smaller size so that they detect the
  107 overrun before it actually does run off the end of the data block. */
  108 
  109 #define WORK_SIZE_SAFETY_MARGIN (100)
  110 
  111 /* Private flags added to firstchar and reqchar. */
  112 
  113 #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
  114 #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
  115 
  116 /* Repeated character flags. */
  117 
  118 #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
  119 
  120 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
  121 are simple data values; negative values are for special things like \d and so
  122 on. Zero means further processing is needed (for things like \x), or the escape
  123 is invalid. */
  124 
  125 #ifndef EBCDIC
  126
  127 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
  128 in UTF-8 mode. */
  129
  130 static const short int escapes[] = {   /* 75 entries: the first is for '0', the last for 'z' */
  131      0,                       0,                            /* 0 1 */
  132      0,                       0,                            /* 2 3 */
  133      0,                       0,                            /* 4 5 */
  134      0,                       0,                            /* 6 7 */
  135      0,                       0,                            /* 8 9 */
  136      CHAR_COLON,              CHAR_SEMICOLON,               /* : ; */
  137      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,             /* < = */
  138      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,           /* > ? */
  139      CHAR_COMMERCIAL_AT,      -ESC_A,                       /* @ A */
  140      -ESC_B,                  -ESC_C,                       /* B C */
  141      -ESC_D,                  -ESC_E,                       /* D E */
  142      0,                       -ESC_G,                       /* F G */
  143      -ESC_H,                  0,                            /* H I */
  144      0,                       -ESC_K,                       /* J K */
  145      0,                       0,                            /* L M */
  146      -ESC_N,                  0,                            /* N O */
  147      -ESC_P,                  -ESC_Q,                       /* P Q */
  148      -ESC_R,                  -ESC_S,                       /* R S */
  149      0,                       0,                            /* T U */
  150      -ESC_V,                  -ESC_W,                       /* V W */
  151      -ESC_X,                  0,                            /* X Y */
  152      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,     /* Z [ */
  153      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,    /* \ ] */
  154      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,              /* ^ _ */
  155      CHAR_GRAVE_ACCENT,       7,                            /* ` a */
  156      -ESC_b,                  0,                            /* b c */
  157      -ESC_d,                  ESC_e,                        /* d e */
  158      ESC_f,                   0,                            /* f g */
  159      -ESC_h,                  0,                            /* h i */
  160      0,                       -ESC_k,                       /* j k */
  161      0,                       0,                            /* l m */
  162      ESC_n,                   0,                            /* n o */
  163      -ESC_p,                  0,                            /* p q */
  164      ESC_r,                   -ESC_s,                       /* r s */
  165      ESC_tee,                 0,                            /* t u */
  166      -ESC_v,                  -ESC_w,                       /* v w */
  167      0,                       0,                            /* x y */
  168      -ESC_z                                                 /* z */
  169 };
  170
  171 #else
  172
  173 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
  174
  175 static const short int escapes[] = {   /* each row is labelled with the hex code of its first entry */
  176 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
  177 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
  178 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
  179 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
  180 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
  181 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
  182 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
  183 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
  184 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
  185 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
  186 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
  187 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
  188 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
  189 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
  190 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
  191 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
  192 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
  193 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
  194 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
  195 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
  196 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
  197 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
  198 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
  199 };
  200 #endif
  201 
  202 
  203 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
  204 searched linearly. Put all the names into a single string, in order to reduce
  205 the number of relocations when a shared library is dynamically linked. The
  206 string is built from string macros so that it works in UTF-8 mode on EBCDIC
  207 platforms. */
  208 
  209 typedef struct verbitem {
  210   int   len;                 /* Length of verb name */
  211   int   op;                  /* Op when no arg, or -1 if arg mandatory */
  212   int   op_arg;              /* Op when arg present, or -1 if not allowed */
  213 } verbitem;
  214 
  215 static const char verbnames[] =
  216   "\0"                       /* Empty name is a shorthand for MARK */
  217   STRING_MARK0
  218   STRING_ACCEPT0
  219   STRING_COMMIT0
  220   STRING_F0
  221   STRING_FAIL0
  222   STRING_PRUNE0
  223   STRING_SKIP0
  224   STRING_THEN;
  225 
  226 static const verbitem verbs[] = {
  227   { 0, -1,        OP_MARK },
  228   { 4, -1,        OP_MARK },
  229   { 6, OP_ACCEPT, -1 },
  230   { 6, OP_COMMIT, -1 },
  231   { 1, OP_FAIL,   -1 },
  232   { 4, OP_FAIL,   -1 },
  233   { 5, OP_PRUNE,  OP_PRUNE_ARG },
  234   { 4, OP_SKIP,   OP_SKIP_ARG  },
  235   { 4, OP_THEN,   OP_THEN_ARG  }
  236 };
  237 
  238 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
  239 
  240 
  241 /* Tables of names of POSIX character classes and their lengths. The names are
  242 now all in a single string, to reduce the number of relocations when a shared
  243 library is dynamically loaded. The list of lengths is terminated by a zero
  244 length entry. The first three must be alpha, lower, upper, as this is assumed
  245 for handling case independence. */
  246 
  247 static const char posix_names[] =   /* the 14 class names in one string; the tables below follow this order */
  248   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  249   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  250   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  251   STRING_word0  STRING_xdigit;
  252
  253 static const pcre_uint8 posix_name_lengths[] = {   /* one length per name in posix_names, same order */
  254   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };   /* twelve 5-letter names, word (4), xdigit (6), then the 0 terminator */
  255
  256 /* Table of class bit maps for each POSIX class. Each class is formed from a
  257 base map, with an optional addition or removal of another map. Then, for some
  258 classes, there is some additional tweaking: for [:blank:] the vertical space
  259 characters are removed, and for [:alpha:] and [:alnum:] the underscore
  260 character is removed. The triples in the table consist of the base map offset,
  261 second map offset or -1 if no second map, and a non-negative value for map
  262 addition or a negative value for map subtraction (if there are two maps). The
  263 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
  264 remove vertical space characters, 2 => remove underscore. */
  265
  266 static const int posix_class_maps[] = {   /* three ints per class, one row per name in posix_names order */
  267   cbit_word,  cbit_digit, -2,             /* alpha */
  268   cbit_lower, -1,          0,             /* lower */
  269   cbit_upper, -1,          0,             /* upper */
  270   cbit_word,  -1,          2,             /* alnum - word without underscore */
  271   cbit_print, cbit_cntrl,  0,             /* ascii */
  272   cbit_space, -1,          1,             /* blank - a GNU extension */
  273   cbit_cntrl, -1,          0,             /* cntrl */
  274   cbit_digit, -1,          0,             /* digit */
  275   cbit_graph, -1,          0,             /* graph */
  276   cbit_print, -1,          0,             /* print */
  277   cbit_punct, -1,          0,             /* punct */
  278   cbit_space, -1,          0,             /* space */
  279   cbit_word,  -1,          0,             /* word - a Perl extension */
  280   cbit_xdigit,-1,          0              /* xdigit */
  281 };
  282 
  283 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
  284 substitutes must be in the order of the names, defined above, and there are
  285 both positive and negative cases. NULL means no substitute. */
  286 
  287 #ifdef SUPPORT_UCP
  288 static const pcre_uchar string_PNd[]  = {   /* spells \P{Nd} */
  289   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  290   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  291 static const pcre_uchar string_pNd[]  = {   /* spells \p{Nd} */
  292   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  293   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  294 static const pcre_uchar string_PXsp[] = {   /* spells \P{Xsp} */
  295   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  296   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  297 static const pcre_uchar string_pXsp[] = {   /* spells \p{Xsp} */
  298   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  299   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  300 static const pcre_uchar string_PXwd[] = {   /* spells \P{Xwd} */
  301   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  302   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  303 static const pcre_uchar string_pXwd[] = {   /* spells \p{Xwd} */
  304   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  305   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  306
  307 static const pcre_uchar *substitutes[] = {   /* replacement pattern text for the escape named on each row */
  308   string_PNd,           /* \D */
  309   string_pNd,           /* \d */
  310   string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
  311   string_pXsp,          /* \s */
  312   string_PXwd,          /* \W */
  313   string_pXwd           /* \w */
  314 };
  315
  316 static const pcre_uchar string_pL[] =   {   /* spells \p{L} */
  317   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  318   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  319 static const pcre_uchar string_pLl[] =  {   /* spells \p{Ll} */
  320   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  321   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  322 static const pcre_uchar string_pLu[] =  {   /* spells \p{Lu} */
  323   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  324   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  325 static const pcre_uchar string_pXan[] = {   /* spells \p{Xan} */
  326   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  327   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  328 static const pcre_uchar string_h[] =    {   /* spells \h */
  329   CHAR_BACKSLASH, CHAR_h, '\0' };
  330 static const pcre_uchar string_pXps[] = {   /* spells \p{Xps} */
  331   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  332   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  333 static const pcre_uchar string_PL[] =   {   /* spells \P{L} */
  334   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  335   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  336 static const pcre_uchar string_PLl[] =  {   /* spells \P{Ll} */
  337   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  338   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  339 static const pcre_uchar string_PLu[] =  {   /* spells \P{Lu} */
  340   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  341   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  342 static const pcre_uchar string_PXan[] = {   /* spells \P{Xan} */
  343   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  344   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  345 static const pcre_uchar string_H[] =    {   /* spells \H */
  346   CHAR_BACKSLASH, CHAR_H, '\0' };
  347 static const pcre_uchar string_PXps[] = {   /* spells \P{Xps} */
  348   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  349   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
  350
  351 static const pcre_uchar *posix_substitutes[] = {   /* 14 entries in posix_names order, then the 14 negated forms */
  352   string_pL,            /* alpha */
  353   string_pLl,           /* lower */
  354   string_pLu,           /* upper */
  355   string_pXan,          /* alnum */
  356   NULL,                 /* ascii */
  357   string_h,             /* blank */
  358   NULL,                 /* cntrl */
  359   string_pNd,           /* digit */
  360   NULL,                 /* graph */
  361   NULL,                 /* print */
  362   NULL,                 /* punct */
  363   string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
  364   string_pXwd,          /* word */
  365   NULL,                 /* xdigit */
  366   /* Negated cases */
  367   string_PL,            /* ^alpha */
  368   string_PLl,           /* ^lower */
  369   string_PLu,           /* ^upper */
  370   string_PXan,          /* ^alnum */
  371   NULL,                 /* ^ascii */
  372   string_H,             /* ^blank */
  373   NULL,                 /* ^cntrl */
  374   string_PNd,           /* ^digit */
  375   NULL,                 /* ^graph */
  376   NULL,                 /* ^print */
  377   NULL,                 /* ^punct */
  378   string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
  379   string_PXwd,          /* ^word */
  380   NULL                  /* ^xdigit */
  381 };
  382 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))   /* number of entries in posix_substitutes */
  383 #endif
  384 
  385 #define STRING(a)  # a
  386 #define XSTRING(s) STRING(s)
  387 
  388 /* The texts of compile-time error messages. These are "char *" because they
  389 are passed to the outside world. Do not ever re-use any error number, because
  390 they are documented. Always add a new error instead. Messages marked DEAD below
  391 are no longer used. This used to be a table of strings, but in order to reduce
  392 the number of relocations needed when a shared library is loaded dynamically,
  393 it is now one long string. We cannot use a table of offsets, because the
  394 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
  395 simply count through to the one we want - this isn't a performance issue
  396 because these strings are used only when there is a compilation error.
  397 
  398 Each substring ends with \0 to insert a null character. This includes the final
  399 substring, so that the whole string ends with \0\0, which can be detected when
  400 counting through. */
  401 
  402 static const char error_texts[] =   /* message n is reached by skipping n zero terminators; see find_error_text() */
  403   "no error\0"
  404   "\\ at end of pattern\0"
  405   "\\c at end of pattern\0"
  406   "unrecognized character follows \\\0"
  407   "numbers out of order in {} quantifier\0"
  408   /* 5 */
  409   "number too big in {} quantifier\0"
  410   "missing terminating ] for character class\0"
  411   "invalid escape sequence in character class\0"
  412   "range out of order in character class\0"
  413   "nothing to repeat\0"
  414   /* 10 */
  415   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  416   "internal error: unexpected repeat\0"
  417   "unrecognized character after (? or (?-\0"
  418   "POSIX named classes are supported only within a class\0"
  419   "missing )\0"
  420   /* 15 */
  421   "reference to non-existent subpattern\0"
  422   "erroffset passed as NULL\0"
  423   "unknown option bit(s) set\0"
  424   "missing ) after comment\0"
  425   "parentheses nested too deeply\0"  /** DEAD **/
  426   /* 20 */
  427   "regular expression is too large\0"
  428   "failed to get memory\0"
  429   "unmatched parentheses\0"
  430   "internal error: code overflow\0"
  431   "unrecognized character after (?<\0"
  432   /* 25 */
  433   "lookbehind assertion is not fixed length\0"
  434   "malformed number or name after (?(\0"
  435   "conditional group contains more than two branches\0"
  436   "assertion expected after (?(\0"
  437   "(?R or (?[+-]digits must be followed by )\0"
  438   /* 30 */
  439   "unknown POSIX class name\0"
  440   "POSIX collating elements are not supported\0"
  441   "this version of PCRE is compiled without UTF support\0"
  442   "spare error\0"  /** DEAD **/
  443   "character value in \\x{...} sequence is too large\0"
  444   /* 35 */
  445   "invalid condition (?(0)\0"
  446   "\\C not allowed in lookbehind assertion\0"
  447   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  448   "number after (?C is > 255\0"
  449   "closing ) for (?C expected\0"
  450   /* 40 */
  451   "recursive call could loop indefinitely\0"
  452   "unrecognized character after (?P\0"
  453   "syntax error in subpattern name (missing terminator)\0"
  454   "two named subpatterns have the same name\0"
  455   "invalid UTF-8 string\0"
  456   /* 45 */
  457   "support for \\P, \\p, and \\X has not been compiled\0"
  458   "malformed \\P or \\p sequence\0"
  459   "unknown property name after \\P or \\p\0"
  460   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  461   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  462   /* 50 */
  463   "repeated subpattern is too long\0"    /** DEAD **/
  464   "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  465   "internal error: overran compiling workspace\0"
  466   "internal error: previously-checked referenced subpattern not found\0"
  467   "DEFINE group contains more than one branch\0"
  468   /* 55 */
  469   "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  470   "inconsistent NEWLINE options\0"
  471   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  472   "a numbered reference must not be zero\0"
  473   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  474   /* 60 */
  475   "(*VERB) not recognized\0"
  476   "number is too big\0"
  477   "subpattern name expected\0"
  478   "digit expected after (?+\0"
  479   "] is an invalid data character in JavaScript compatibility mode\0"
  480   /* 65 */
  481   "different names for subpatterns of the same number are not allowed\0"
  482   "(*MARK) must have an argument\0"
  483   "this version of PCRE is not compiled with Unicode property support\0"
  484   "\\c must be followed by an ASCII character\0"
  485   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  486   /* 70 */
  487   "internal error: unknown opcode in find_fixedlength()\0"
  488   "\\N is not supported in a class\0"
  489   "too many forward references\0"
  490   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  491   "invalid UTF-16 string\0"
  492   ;
  493 
  494 /* Table to identify digits and hex digits. This is used when compiling
  495 patterns. Note that the tables in chartables are dependent on the locale, and
  496 may mark arbitrary characters as digits - but the PCRE compiling code expects
  497 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
  498 a private table here. It costs 256 bytes, but it is a lot faster than doing
  499 character value tests (at least in some simple cases I timed), and in some
  500 applications one wants PCRE to compile efficiently as well as match
  501 efficiently.
  502 
  503 For convenience, we use the same bit definitions as in chartables:
  504 
  505   0x04   decimal digit
  506   0x08   hexadecimal digit
  507 
  508 Then we can use ctype_digit and ctype_xdigit in the code. */
  509 
  510 /* Using a simple comparison for decimal numbers rather than a memory read
  511 is much faster, and the resulting code is simpler (the compiler turns it
  512 into a subtraction and unsigned comparison). */
  513 
  514 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
  515 
  516 #ifndef EBCDIC
  517
  518 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
  519 UTF-8 mode. */
  520
  521 static const pcre_uint8 digitab[] =   /* indexed by character code: 0x04 = decimal digit, 0x08 = hex digit, 0x0c = both */
  522   {
  523   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  524   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  525   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  526   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  527   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  528   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  529   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  530   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  531   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  532   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  533   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  534   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  535   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  536   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  537   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  538   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  539   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  540   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  541   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  542   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  543   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  544   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  545   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  546   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  547   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  548   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  549   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  550   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  551   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  552   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  553   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  554   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
  555
  556 #else
  557
  558 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
  559
  560 static const pcre_uint8 digitab[] =   /* same flag bits as the ASCII table, laid out by EBCDIC character code */
  561   {
  562   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  563   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  564   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  565   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  566   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  567   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  568   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  569   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  570   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  571   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  572   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  573   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  574   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  575   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  576   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  577   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  578   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  579   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  580   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  581   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  582   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  583   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  584   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  585   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  586   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  587   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  588   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  589   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  590   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  591   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  592   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  593   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
  594
  595 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup. NOTE(review): the flag bit meanings are not defined in this file - confirm against the main character tables */
  596   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  597   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  598   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  599   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  600   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  601   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  602   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  603   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  604   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  605   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  606   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  607   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  608   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  609   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  610   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  611   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  612   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  613   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  614   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  615   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  616   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  617   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  618   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  619   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  620   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  621   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  622   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  623   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  624   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  625   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  626   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  627   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
  628 #endif
  629 
  630 
  631 /* Definition to allow mutual recursion */
  632 
  633 static BOOL
  634   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
  635     int *, int *, branch_chain *, compile_data *, int *);
  636 
  637 
  638 
  639 /*************************************************
  640 *            Find an error text                  *
  641 *************************************************/
  642 
  643 /* The error texts are now all in one long string, to save on relocations. As
  644 some of the text is of unknown length, we can't use a table of offsets.
  645 Instead, just count through the strings. This is not a performance issue
  646 because it happens only when there has been a compilation error.
  647 
  648 Argument:   the error number
  649 Returns:    pointer to the error string
  650 */
  651 
  652 static const char *
  653 find_error_text(int n)
  654 {
  655 const char *s = error_texts;
  656 for (; n > 0; n--)
  657   {
  658   while (*s++ != 0) {};
  659   if (*s == 0) return "Error text not found (please report)";
  660   }
  661 return s;
  662 }
  663 
  664 
  665 /*************************************************
  666 *           Expand the workspace                 *
  667 *************************************************/
  668 
  669 /* This function is called during the second compiling phase, if the number of
  670 forward references fills the existing workspace, which is originally a block on
  671 the stack. A larger block is obtained from malloc() unless the ultimate limit
  672 has been reached or the increase will be rather small.
  673 
  674 Argument: pointer to the compile data block
  675 Returns:  0 if all went well, else an error number
  676 */
  677 
  678 static int
  679 expand_workspace(compile_data *cd)
  680 {
  681 pcre_uchar *newspace;
  682 int newsize = cd->workspace_size * 2;
  683 
  684 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
  685 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
  686     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
  687  return ERR72;
  688 
  689 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
  690 if (newspace == NULL) return ERR21;
  691 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
  692 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
  693 if (cd->workspace_size > COMPILE_WORK_SIZE)
  694   (PUBL(free))((void *)cd->start_workspace);
  695 cd->start_workspace = newspace;
  696 cd->workspace_size = newsize;
  697 return 0;
  698 }
  699 
  700 
  701 
  702 /*************************************************
  703 *            Check for counted repeat            *
  704 *************************************************/
  705 
  706 /* This function is called when a '{' is encountered in a place where it might
  707 start a quantifier. It looks ahead to see if it really is a quantifier or not.
  708 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
  709 where the ddds are digits.
  710 
  711 Arguments:
  712   p         pointer to the first char after '{'
  713 
  714 Returns:    TRUE or FALSE
  715 */
  716 
  717 static BOOL
  718 is_counted_repeat(const pcre_uchar *p)
  719 {
  720 if (!IS_DIGIT(*p)) return FALSE;
  721 p++;
  722 while (IS_DIGIT(*p)) p++;
  723 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
  724 
  725 if (*p++ != CHAR_COMMA) return FALSE;
  726 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
  727 
  728 if (!IS_DIGIT(*p)) return FALSE;
  729 p++;
  730 while (IS_DIGIT(*p)) p++;
  731 
  732 return (*p == CHAR_RIGHT_CURLY_BRACKET);
  733 }
  734 
  735 
  736 
  737 /*************************************************
  738 *            Handle escapes                      *
  739 *************************************************/
  740 
  741 /* This function is called when a \ has been encountered. It either returns a
  742 positive value for a simple escape such as \n, or a negative value which
  743 encodes one of the more complicated things such as \d. A backreference to group
  744 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
  745 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
  746 ptr is pointing at the \. On exit, it is on the final character of the escape
  747 sequence.
  748 
  749 Arguments:
  750   ptrptr         points to the pattern position pointer
  751   errorcodeptr   points to the errorcode variable
  752   bracount       number of previous extracting brackets
  753   options        the options bits
  754   isclass        TRUE if inside a character class
  755 
  756 Returns:         zero or positive => a data character
  757                  negative => a special escape sequence
  758                  on error, errorcodeptr is set
  759 */
  760 
  761 static int
  762 check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
  763   int options, BOOL isclass)
  764 {
  765 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
  766 BOOL utf = UTF_ENABLED((options & PCRE_UTF8) != 0);
  767 const pcre_uchar *ptr = *ptrptr + 1;
  768 pcre_int32 c;
  769 int i;
  770 
  771 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
  772 ptr--;                            /* Set pointer back to the last byte */
  773 
  774 /* If backslash is at the end of the pattern, it's an error. */
  775 
  776 if (c == 0) *errorcodeptr = ERR1;
  777 
  778 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
  779 in a table. A non-zero result is something that can be returned immediately.
  780 Otherwise further processing may be required. */
  781 
  782 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
  783 /* Not alphanumeric */
  784 else if (c < CHAR_0 || c > CHAR_z) {}
  785 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
  786 
  787 #else           /* EBCDIC coding */
  788 /* Not alphanumeric */
  789 else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
  790 else if ((i = escapes[c - 0x48]) != 0)  c = i;
  791 #endif
  792 
  793 /* Escapes that need further processing, or are illegal. */
  794 
  795 else
  796   {
  797   const pcre_uchar *oldptr;
  798   BOOL braced, negated;
  799 
  800   switch (c)
  801     {
  802     /* A number of Perl escapes are not handled by PCRE. We give an explicit
  803     error. */
  804 
  805     case CHAR_l:
  806     case CHAR_L:
  807     *errorcodeptr = ERR37;
  808     break;
  809 
  810     case CHAR_u:
  811     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
  812       {
  813       /* In JavaScript, \u must be followed by four hexadecimal numbers.
  814       Otherwise it is a lowercase u letter. */
  815       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
  816         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
  817         && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
  818         && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
  819         {
  820         c = 0;
  821         for (i = 0; i < 4; ++i)
  822           {
  823           register int cc = *(++ptr);
  824 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
  825           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
  826           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
  827 #else           /* EBCDIC coding */
  828           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
  829           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
  830 #endif
  831           }
  832         }
  833       }
  834     else
  835       *errorcodeptr = ERR37;
  836     break;
  837 
  838     case CHAR_U:
  839     /* In JavaScript, \U is an uppercase U letter. */
  840     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
  841     break;
  842 
  843     /* In a character class, \g is just a literal "g". Outside a character
  844     class, \g must be followed by one of a number of specific things:
  845 
  846     (1) A number, either plain or braced. If positive, it is an absolute
  847     backreference. If negative, it is a relative backreference. This is a Perl
  848     5.10 feature.
  849 
  850     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
  851     is part of Perl's movement towards a unified syntax for back references. As
  852     this is synonymous with \k{name}, we fudge it up by pretending it really
  853     was \k.
  854 
  855     (3) For Oniguruma compatibility we also support \g followed by a name or a
  856     number either in angle brackets or in single quotes. However, these are
  857     (possibly recursive) subroutine calls, _not_ backreferences. Just return
  858     the -ESC_g code (cf \k). */
  859 
  860     case CHAR_g:
  861     if (isclass) break;
  862     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
  863       {
  864       c = -ESC_g;
  865       break;
  866       }
  867 
  868     /* Handle the Perl-compatible cases */
  869 
  870     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
  871       {
  872       const pcre_uchar *p;
  873       for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
  874         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
  875       if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
  876         {
  877         c = -ESC_k;
  878         break;
  879         }
  880       braced = TRUE;
  881       ptr++;
  882       }
  883     else braced = FALSE;
  884 
  885     if (ptr[1] == CHAR_MINUS)
  886       {
  887       negated = TRUE;
  888       ptr++;
  889       }
  890     else negated = FALSE;
  891 
  892     /* The integer range is limited by the machine's int representation. */
  893     c = 0;
  894     while (IS_DIGIT(ptr[1]))
  895       {
  896       if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
  897         {
  898         c = -1;
  899         break;
  900         }
  901       c = c * 10 + *(++ptr) - CHAR_0;
  902       }
  903     if (((unsigned int)c) > INT_MAX) /* Integer overflow */
  904       {
  905       while (IS_DIGIT(ptr[1]))
  906         ptr++;
  907       *errorcodeptr = ERR61;
  908       break;
  909       }
  910 
  911     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
  912       {
  913       *errorcodeptr = ERR57;
  914       break;
  915       }
  916 
  917     if (c == 0)
  918       {
  919       *errorcodeptr = ERR58;
  920       break;
  921       }
  922 
  923     if (negated)
  924       {
  925       if (c > bracount)
  926         {
  927         *errorcodeptr = ERR15;
  928         break;
  929         }
  930       c = bracount - (c - 1);
  931       }
  932 
  933     c = -(ESC_REF + c);
  934     break;
  935 
  936     /* The handling of escape sequences consisting of a string of digits
  937     starting with one that is not zero is not straightforward. By experiment,
  938     the way Perl works seems to be as follows:
  939 
  940     Outside a character class, the digits are read as a decimal number. If the
  941     number is less than 10, or if there are that many previous extracting
  942     left brackets, then it is a back reference. Otherwise, up to three octal
  943     digits are read to form an escaped byte. Thus \123 is likely to be octal
  944     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
  945     value is greater than 377, the least significant 8 bits are taken. Inside a
  946     character class, \ followed by a digit is always an octal number. */
  947 
  948     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
  949     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
  950 
  951     if (!isclass)
  952       {
  953       oldptr = ptr;
  954       /* The integer range is limited by the machine's int representation. */
  955       c -= CHAR_0;
  956       while (IS_DIGIT(ptr[1]))
  957         {
  958         if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
  959           {
  960           c = -1;
  961           break;
  962           }
  963         c = c * 10 + *(++ptr) - CHAR_0;
  964         }
  965       if (((unsigned int)c) > INT_MAX) /* Integer overflow */
  966         {
  967         while (IS_DIGIT(ptr[1]))
  968           ptr++;
  969         *errorcodeptr = ERR61;
  970         break;
  971         }
  972       if (c < 10 || c <= bracount)
  973         {
  974         c = -(ESC_REF + c);
  975         break;
  976         }
  977       ptr = oldptr;      /* Put the pointer back and fall through */
  978       }
  979 
  980     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
  981     generates a binary zero byte and treats the digit as a following literal.
  982     Thus we have to pull back the pointer by one. */
  983 
  984     if ((c = *ptr) >= CHAR_8)
  985       {
  986       ptr--;
  987       c = 0;
  988       break;
  989       }
  990 
  991     /* \0 always starts an octal number, but we may drop through to here with a
  992     larger first octal digit. The original code used just to take the least
  993     significant 8 bits of octal numbers (I think this is what early Perls used
  994     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
  995     but no more than 3 octal digits. */
  996 
  997     case CHAR_0:
  998     c -= CHAR_0;
  999     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
 1000         c = c * 8 + *(++ptr) - CHAR_0;
 1001 #ifdef COMPILE_PCRE8
 1002     if (!utf && c > 0xff) *errorcodeptr = ERR51;
 1003 #endif
 1004     break;
 1005 
 1006     /* \x is complicated. \x{ddd} is a character number which can be greater
 1007     than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
 1008     If not, { is treated as a data character. */
 1009 
 1010     case CHAR_x:
 1011     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
 1012       {
 1013       /* In JavaScript, \x must be followed by two hexadecimal numbers.
 1014       Otherwise it is a lowercase x letter. */
 1015       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
 1016         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
 1017         {
 1018         c = 0;
 1019         for (i = 0; i < 2; ++i)
 1020           {
 1021           register int cc = *(++ptr);
 1022 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 1023           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
 1024           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 1025 #else           /* EBCDIC coding */
 1026           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
 1027           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 1028 #endif
 1029           }
 1030         }
 1031       break;
 1032       }
 1033 
 1034     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
 1035       {
 1036       const pcre_uchar *pt = ptr + 2;
 1037 
 1038       c = 0;
 1039       while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
 1040         {
 1041         register int cc = *pt++;
 1042         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
 1043 
 1044 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 1045         if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
 1046         c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 1047 #else           /* EBCDIC coding */
 1048         if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
 1049         c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 1050 #endif
 1051 
 1052 #ifdef COMPILE_PCRE8
 1053         if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
 1054 #else
 1055 #ifdef COMPILE_PCRE16
 1056         if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
 1057 #endif
 1058 #endif
 1059         }
 1060 
 1061       if (c < 0)
 1062         {
 1063         while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
 1064         *errorcodeptr = ERR34;
 1065         }
 1066 
 1067       if (*pt == CHAR_RIGHT_CURLY_BRACKET)
 1068         {
 1069         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
 1070         ptr = pt;
 1071         break;
 1072         }
 1073 
 1074       /* If the sequence of hex digits does not end with '}', then we don't
 1075       recognize this construct; fall through to the normal \x handling. */
 1076       }
 1077 
 1078     /* Read just a single-byte hex-defined char */
 1079 
 1080     c = 0;
 1081     while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
 1082       {
 1083       int cc;                                  /* Some compilers don't like */
 1084       cc = *(++ptr);                           /* ++ in initializers */
 1085 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 1086       if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
 1087       c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 1088 #else           /* EBCDIC coding */
 1089       if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
 1090       c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 1091 #endif
 1092       }
 1093     break;
 1094 
 1095     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
 1096     An error is given if the byte following \c is not an ASCII character. This
 1097     coding is ASCII-specific, but then the whole concept of \cx is
 1098     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
 1099 
 1100     case CHAR_c:
 1101     c = *(++ptr);
 1102     if (c == 0)
 1103       {
 1104       *errorcodeptr = ERR2;
 1105       break;
 1106       }
 1107 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
 1108     if (c > 127)  /* Excludes all non-ASCII in either mode */
 1109       {
 1110       *errorcodeptr = ERR68;
 1111       break;
 1112       }
 1113     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
 1114     c ^= 0x40;
 1115 #else             /* EBCDIC coding */
 1116     if (c >= CHAR_a && c <= CHAR_z) c += 64;
 1117     c ^= 0xC0;
 1118 #endif
 1119     break;
 1120 
 1121     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 1122     other alphanumeric following \ is an error if PCRE_EXTRA was set;
 1123     otherwise, for Perl compatibility, it is a literal. This code looks a bit
 1124     odd, but there used to be some cases other than the default, and there may
 1125     be again in future, so I haven't "optimized" it. */
 1126 
 1127     default:
 1128     if ((options & PCRE_EXTRA) != 0) switch(c)
 1129       {
 1130       default:
 1131       *errorcodeptr = ERR3;
 1132       break;
 1133       }
 1134     break;
 1135     }
 1136   }
 1137 
 1138 /* Perl supports \N{name} for character names, as well as plain \N for "not
 1139 newline". PCRE does not support \N{name}. However, it does support
 1140 quantification such as \N{2,3}. */
 1141 
 1142 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
 1143      !is_counted_repeat(ptr+2))
 1144   *errorcodeptr = ERR37;
 1145 
 1146 /* If PCRE_UCP is set, we change the values for \d etc. */
 1147 
 1148 if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
 1149   c -= (ESC_DU - ESC_D);
 1150 
 1151 /* Set the pointer to the final character before returning. */
 1152 
 1153 *ptrptr = ptr;
 1154 return c;
 1155 }
 1156 
 1157 
 1158 
 1159 #ifdef SUPPORT_UCP
 1160 /*************************************************
 1161 *               Handle \P and \p                 *
 1162 *************************************************/
 1163 
 1164 /* This function is called after \P or \p has been encountered, provided that
 1165 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
 1166 pointing at the P or p. On exit, it is pointing at the final character of the
 1167 escape sequence.
 1168 
 1169 Argument:
 1170   ptrptr         points to the pattern position pointer
 1171   negptr         points to a boolean that is set TRUE for negation else FALSE
 1172   dptr           points to an int that is set to the detailed property value
 1173   errorcodeptr   points to the error code variable
 1174 
 1175 Returns:         type value from ucp_type_table, or -1 for an invalid type
 1176 */
 1177 
 1178 static int
 1179 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
 1180 {
 1181 int c, i, bot, top;
 1182 const pcre_uchar *ptr = *ptrptr;
 1183 pcre_uchar name[32];
 1184 
 1185 c = *(++ptr);
 1186 if (c == 0) goto ERROR_RETURN;
 1187 
 1188 *negptr = FALSE;
 1189 
 1190 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
 1191 negation. */
 1192 
 1193 if (c == CHAR_LEFT_CURLY_BRACKET)
 1194   {
 1195   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
 1196     {
 1197     *negptr = TRUE;
 1198     ptr++;
 1199     }
 1200   for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
 1201     {
 1202     c = *(++ptr);
 1203     if (c == 0) goto ERROR_RETURN;
 1204     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
 1205     name[i] = c;
 1206     }
 1207   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
 1208   name[i] = 0;
 1209   }
 1210 
 1211 /* Otherwise there is just one following character */
 1212 
 1213 else
 1214   {
 1215   name[0] = c;
 1216   name[1] = 0;
 1217   }
 1218 
 1219 *ptrptr = ptr;
 1220 
 1221 /* Search for a recognized property name using binary chop */
 1222 
 1223 bot = 0;
 1224 top = PRIV(utt_size);
 1225 
 1226 while (bot < top)
 1227   {
 1228   i = (bot + top) >> 1;
 1229   c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
 1230   if (c == 0)
 1231     {
 1232     *dptr = PRIV(utt)[i].value;
 1233     return PRIV(utt)[i].type;
 1234     }
 1235   if (c > 0) bot = i + 1; else top = i;
 1236   }
 1237 
 1238 *errorcodeptr = ERR47;
 1239 *ptrptr = ptr;
 1240 return -1;
 1241 
 1242 ERROR_RETURN:
 1243 *errorcodeptr = ERR46;
 1244 *ptrptr = ptr;
 1245 return -1;
 1246 }
 1247 #endif
 1248 
 1249 
 1250 
 1251 
 1252 /*************************************************
 1253 *         Read repeat counts                     *
 1254 *************************************************/
 1255 
 1256 /* Read an item of the form {n,m} and return the values. This is called only
 1257 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
 1258 so the syntax is guaranteed to be correct, but we need to check the values.
 1259 
 1260 Arguments:
 1261   p              pointer to first char after '{'
 1262   minp           pointer to int for min
 1263   maxp           pointer to int for max
 1264                  returned as -1 if no max
 1265   errorcodeptr   points to error code variable
 1266 
 1267 Returns:         pointer to '}' on success;
 1268                  current ptr on error, with errorcodeptr set non-zero
 1269 */
 1270 
 1271 static const pcre_uchar *
 1272 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
 1273 {
 1274 int min = 0;
 1275 int max = -1;
 1276 
 1277 /* Read the minimum value and do a paranoid check: a negative value indicates
 1278 an integer overflow. */
 1279 
 1280 while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
 1281 if (min < 0 || min > 65535)
 1282   {
 1283   *errorcodeptr = ERR5;
 1284   return p;
 1285   }
 1286 
 1287 /* Read the maximum value if there is one, and again do a paranoid on its size.
 1288 Also, max must not be less than min. */
 1289 
 1290 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
 1291   {
 1292   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
 1293     {
 1294     max = 0;
 1295     while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
 1296     if (max < 0 || max > 65535)
 1297       {
 1298       *errorcodeptr = ERR5;
 1299       return p;
 1300       }
 1301     if (max < min)
 1302       {
 1303       *errorcodeptr = ERR4;
 1304       return p;
 1305       }
 1306     }
 1307   }
 1308 
 1309 /* Fill in the required variables, and pass back the pointer to the terminating
 1310 '}'. */
 1311 
 1312 *minp = min;
 1313 *maxp = max;
 1314 return p;
 1315 }
 1316 
 1317 
 1318 
 1319 /*************************************************
 1320 *  Subroutine for finding forward reference      *
 1321 *************************************************/
 1322 
 1323 /* This recursive function is called only from find_parens() below. The
 1324 top-level call starts at the beginning of the pattern. All other calls must
 1325 start at a parenthesis. It scans along a pattern's text looking for capturing
 1326 subpatterns, and counting them. If it finds a named pattern that matches the
 1327 name it is given, it returns its number. Alternatively, if the name is NULL, it
 1328 returns when it reaches a given numbered subpattern. Recursion is used to keep
 1329 track of subpatterns that reset the capturing group numbers - the (?| feature.
 1330 
 1331 This function was originally called only from the second pass, in which we know
 1332 that if (?< or (?' or (?P< is encountered, the name will be correctly
 1333 terminated because that is checked in the first pass. There is now one call to
 1334 this function in the first pass, to check for a recursive back reference by
 1335 name (so that we can make the whole group atomic). In this case, we need check
 1336 only up to the current position in the pattern, and that is still OK because
 1337 and previous occurrences will have been checked. To make this work, the test
 1338 for "end of pattern" is a check against cd->end_pattern in the main loop,
 1339 instead of looking for a binary zero. This means that the special first-pass
 1340 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
 1341 processing items within the loop are OK, because afterwards the main loop will
 1342 terminate.)
 1343 
 1344 Arguments:
 1345   ptrptr       address of the current character pointer (updated)
 1346   cd           compile background data
 1347   name         name to seek, or NULL if seeking a numbered subpattern
 1348   lorn         name length, or subpattern number if name is NULL
 1349   xmode        TRUE if we are in /x mode
 1350   utf          TRUE if we are in UTF-8 / UTF-16 mode
 1351   count        pointer to the current capturing subpattern number (updated)
 1352 
 1353 Returns:       the number of the named subpattern, or -1 if not found
 1354 */
 1355 
 1356 static int
 1357 find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
 1358   BOOL xmode, BOOL utf, int *count)
 1359 {
 1360 pcre_uchar *ptr = *ptrptr;
 1361 int start_count = *count;
 1362 int hwm_count = start_count;
 1363 BOOL dup_parens = FALSE;
 1364 
 1365 /* If the first character is a parenthesis, check on the type of group we are
 1366 dealing with. The very first call may not start with a parenthesis. */
 1367 
 1368 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
 1369   {
 1370   /* Handle specials such as (*SKIP) or (*UTF8) etc. */
 1371 
 1372   if (ptr[1] == CHAR_ASTERISK) ptr += 2;
 1373 
 1374   /* Handle a normal, unnamed capturing parenthesis. */
 1375 
 1376   else if (ptr[1] != CHAR_QUESTION_MARK)
 1377     {
 1378     *count += 1;
 1379     if (name == NULL && *count == lorn) return *count;
 1380     ptr++;
 1381     }
 1382 
 1383   /* All cases now have (? at the start. Remember when we are in a group
 1384   where the parenthesis numbers are duplicated. */
 1385 
 1386   else if (ptr[2] == CHAR_VERTICAL_LINE)
 1387     {
 1388     ptr += 3;
 1389     dup_parens = TRUE;
 1390     }
 1391 
 1392   /* Handle comments; all characters are allowed until a ket is reached. */
 1393 
 1394   else if (ptr[2] == CHAR_NUMBER_SIGN)
 1395     {
 1396     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
 1397     goto FAIL_EXIT;
 1398     }
 1399 
 1400   /* Handle a condition. If it is an assertion, just carry on so that it
 1401   is processed as normal. If not, skip to the closing parenthesis of the
 1402   condition (there can't be any nested parens). */
 1403 
 1404   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
 1405     {
 1406     ptr += 2;
 1407     if (ptr[1] != CHAR_QUESTION_MARK)
 1408       {
 1409       while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
 1410       if (*ptr != 0) ptr++;
 1411       }
 1412     }
 1413 
 1414   /* Start with (? but not a condition. */
 1415 
 1416   else
 1417     {
 1418     ptr += 2;
 1419     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
 1420 
 1421     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
 1422 
 1423     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
 1424         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
 1425       {
 1426       int term;
 1427       const pcre_uchar *thisname;
 1428       *count += 1;
 1429       if (name == NULL && *count == lorn) return *count;
 1430       term = *ptr++;
 1431       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
 1432       thisname = ptr;
 1433       while (*ptr != term) ptr++;
 1434       if (name != NULL && lorn == ptr - thisname &&
 1435           STRNCMP_UC_UC(name, thisname, lorn) == 0)
 1436         return *count;
 1437       term++;
 1438       }
 1439     }
 1440   }
 1441 
 1442 /* Past any initial parenthesis handling, scan for parentheses or vertical
 1443 bars. Stop if we get to cd->end_pattern. Note that this is important for the
 1444 first-pass call when this value is temporarily adjusted to stop at the current
 1445 position. So DO NOT change this to a test for binary zero. */
 1446 
 1447 for (; ptr < cd->end_pattern; ptr++)
 1448   {
 1449   /* Skip over backslashed characters and also entire \Q...\E */
 1450 
 1451   if (*ptr == CHAR_BACKSLASH)
 1452     {
 1453     if (*(++ptr) == 0) goto FAIL_EXIT;
 1454     if (*ptr == CHAR_Q) for (;;)
 1455       {
 1456       while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
 1457       if (*ptr == 0) goto FAIL_EXIT;
 1458       if (*(++ptr) == CHAR_E) break;
 1459       }
 1460     continue;
 1461     }
 1462 
 1463   /* Skip over character classes; this logic must be similar to the way they
 1464   are handled for real. If the first character is '^', skip it. Also, if the
 1465   first few characters (either before or after ^) are \Q\E or \E we skip them
 1466   too. This makes for compatibility with Perl. Note the use of STR macros to
 1467   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
 1468 
 1469   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
 1470     {
 1471     BOOL negate_class = FALSE;
 1472     for (;;)
 1473       {
 1474       if (ptr[1] == CHAR_BACKSLASH)
 1475         {
 1476         if (ptr[2] == CHAR_E)
 1477           ptr+= 2;
 1478         else if (STRNCMP_UC_C8(ptr + 2,
 1479                  STR_Q STR_BACKSLASH STR_E, 3) == 0)
 1480           ptr += 4;
 1481         else
 1482           break;
 1483         }
 1484       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
 1485         {
 1486         negate_class = TRUE;
 1487         ptr++;
 1488         }
 1489       else break;
 1490       }
 1491 
 1492     /* If the next character is ']', it is a data character that must be
 1493     skipped, except in JavaScript compatibility mode. */
 1494 
 1495     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
 1496         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
 1497       ptr++;
 1498 
 1499     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
 1500       {
 1501       if (*ptr == 0) return -1;
 1502       if (*ptr == CHAR_BACKSLASH)
 1503         {
 1504         if (*(++ptr) == 0) goto FAIL_EXIT;
 1505         if (*ptr == CHAR_Q) for (;;)
 1506           {
 1507           while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
 1508           if (*ptr == 0) goto FAIL_EXIT;
 1509           if (*(++ptr) == CHAR_E) break;
 1510           }
 1511         continue;
 1512         }
 1513       }
 1514     continue;
 1515     }
 1516 
 1517   /* Skip comments in /x mode */
 1518 
 1519   if (xmode && *ptr == CHAR_NUMBER_SIGN)
 1520     {
 1521     ptr++;
 1522     while (*ptr != 0)
 1523       {
 1524       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
 1525       ptr++;
 1526 #ifdef SUPPORT_UTF
 1527       if (utf) FORWARDCHAR(ptr);
 1528 #endif
 1529       }
 1530     if (*ptr == 0) goto FAIL_EXIT;
 1531     continue;
 1532     }
 1533 
 1534   /* Check for the special metacharacters */
 1535 
 1536   if (*ptr == CHAR_LEFT_PARENTHESIS)
 1537     {
 1538     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
 1539     if (rc > 0) return rc;
 1540     if (*ptr == 0) goto FAIL_EXIT;
 1541     }
 1542 
 1543   else if (*ptr == CHAR_RIGHT_PARENTHESIS)
 1544     {
 1545     if (dup_parens && *count < hwm_count) *count = hwm_count;
 1546     goto FAIL_EXIT;
 1547     }
 1548 
 1549   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
 1550     {
 1551     if (*count > hwm_count) hwm_count = *count;
 1552     *count = start_count;
 1553     }
 1554   }
 1555 
 1556 FAIL_EXIT:
 1557 *ptrptr = ptr;
 1558 return -1;
 1559 }
 1560 
 1561 
 1562 
 1563 
 1564 /*************************************************
 1565 *       Find forward referenced subpattern       *
 1566 *************************************************/
 1567 
 1568 /* This function scans along a pattern's text looking for capturing
 1569 subpatterns, and counting them. If it finds a named pattern that matches the
 1570 name it is given, it returns its number. Alternatively, if the name is NULL, it
 1571 returns when it reaches a given numbered subpattern. This is used for forward
 1572 references to subpatterns. We used to be able to start this scan from the
 1573 current compiling point, using the current count value from cd->bracount, and
 1574 do it all in a single loop, but the addition of the possibility of duplicate
 1575 subpattern numbers means that we have to scan from the very start, in order to
 1576 take account of such duplicates, and to use a recursive function to keep track
 1577 of the different types of group.
 1578 
 1579 Arguments:
 1580   cd           compile background data
 1581   name         name to seek, or NULL if seeking a numbered subpattern
 1582   lorn         name length, or subpattern number if name is NULL
 1583   xmode        TRUE if we are in /x mode
 1584   utf          TRUE if we are in UTF-8 / UTF-16 mode
 1585 
 1586 Returns:       the number of the found subpattern, or -1 if not found
 1587 */
 1588 
 1589 static int
 1590 find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
 1591   BOOL utf)
 1592 {
 1593 pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
 1594 int count = 0;
 1595 int rc;
 1596 
 1597 /* If the pattern does not start with an opening parenthesis, the first call
 1598 to find_parens_sub() will scan right to the end (if necessary). However, if it
 1599 does start with a parenthesis, find_parens_sub() will return when it hits the
 1600 matching closing parens. That is why we have to have a loop. */
 1601 
 1602 for (;;)
 1603   {
 1604   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
 1605   if (rc > 0 || *ptr++ == 0) break;
 1606   }
 1607 
 1608 return rc;
 1609 }
 1610 
 1611 
 1612 
 1613 
 1614 /*************************************************
 1615 *      Find first significant op code            *
 1616 *************************************************/
 1617 
 1618 /* This is called by several functions that scan a compiled expression looking
 1619 for a fixed first character, or an anchoring op code etc. It skips over things
 1620 that do not influence this. For some calls, it makes sense to skip negative
 1621 forward and all backward assertions, and also the \b assertion; for others it
 1622 does not.
 1623 
 1624 Arguments:
 1625   code         pointer to the start of the group
 1626   skipassert   TRUE if certain assertions are to be skipped
 1627 
 1628 Returns:       pointer to the first significant opcode
 1629 */
 1630 
 1631 static const pcre_uchar*
 1632 first_significant_code(const pcre_uchar *code, BOOL skipassert)
 1633 {
 1634 for (;;)
 1635   {
 1636   switch ((int)*code)
 1637     {
 1638     case OP_ASSERT_NOT:
 1639     case OP_ASSERTBACK:
 1640     case OP_ASSERTBACK_NOT:
 1641     if (!skipassert) return code;
 1642     do code += GET(code, 1); while (*code == OP_ALT);
 1643     code += PRIV(OP_lengths)[*code];
 1644     break;
 1645 
 1646     case OP_WORD_BOUNDARY:
 1647     case OP_NOT_WORD_BOUNDARY:
 1648     if (!skipassert) return code;
 1649     /* Fall through */
 1650 
 1651     case OP_CALLOUT:
 1652     case OP_CREF:
 1653     case OP_NCREF:
 1654     case OP_RREF:
 1655     case OP_NRREF:
 1656     case OP_DEF:
 1657     code += PRIV(OP_lengths)[*code];
 1658     break;
 1659 
 1660     default:
 1661     return code;
 1662     }
 1663   }
 1664 /* Control never reaches here */
 1665 }
 1666 
 1667 
 1668 
 1669 
 1670 /*************************************************
 1671 *        Find the fixed length of a branch       *
 1672 *************************************************/
 1673 
 1674 /* Scan a branch and compute the fixed length of subject that will match it,
 1675 if the length is fixed. This is needed for dealing with backward assertions.
 1676 In UTF8 mode, the result is in characters rather than bytes. The branch is
 1677 temporarily terminated with OP_END when this function is called.
 1678 
 1679 This function is called when a backward assertion is encountered, so that if it
 1680 fails, the error message can point to the correct place in the pattern.
 1681 However, we cannot do this when the assertion contains subroutine calls,
 1682 because they can be forward references. We solve this by remembering this case
 1683 and doing the check at the end; a flag specifies which mode we are running in.
 1684 
 1685 Arguments:
 1686   code     points to the start of the pattern (the bracket)
 1687   utf      TRUE in UTF-8 / UTF-16 mode
 1688   atend    TRUE if called when the pattern is complete
 1689   cd       the "compile data" structure
 1690 
 1691 Returns:   the fixed length,
 1692              or -1 if there is no fixed length,
 1693              or -2 if \C was encountered (in UTF-8 mode only)
 1694              or -3 if an OP_RECURSE item was encountered and atend is FALSE
 1695              or -4 if an unknown opcode was encountered (internal error)
 1696 */
 1697 
 1698 static int
 1699 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
 1700 {
 1701 int length = -1;
 1702 
 1703 register int branchlength = 0;
 1704 register pcre_uchar *cc = code + 1 + LINK_SIZE;
 1705 
 1706 /* Scan along the opcodes for this branch. If we get to the end of the
 1707 branch, check the length against that of the other branches. */
 1708 
 1709 for (;;)
 1710   {
 1711   int d;
 1712   pcre_uchar *ce, *cs;
 1713   register int op = *cc;
 1714 
 1715   switch (op)
 1716     {
 1717     /* We only need to continue for OP_CBRA (normal capturing bracket) and
 1718     OP_BRA (normal non-capturing bracket) because the other variants of these
 1719     opcodes are all concerned with unlimited repeated groups, which of course
 1720     are not of fixed length. */
 1721 
 1722     case OP_CBRA:
 1723     case OP_BRA:
 1724     case OP_ONCE:
 1725     case OP_ONCE_NC:
 1726     case OP_COND:
 1727     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
 1728     if (d < 0) return d;
 1729     branchlength += d;
 1730     do cc += GET(cc, 1); while (*cc == OP_ALT);
 1731     cc += 1 + LINK_SIZE;
 1732     break;
 1733 
 1734     /* Reached end of a branch; if it's a ket it is the end of a nested call.
 1735     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
 1736     an ALT. If it is END it's the end of the outer call. All can be handled by
 1737     the same code. Note that we must not include the OP_KETRxxx opcodes here,
 1738     because they all imply an unlimited repeat. */
 1739 
 1740     case OP_ALT:
 1741     case OP_KET:
 1742     case OP_END:
 1743     case OP_ACCEPT:
 1744     case OP_ASSERT_ACCEPT:
 1745     if (length < 0) length = branchlength;
 1746       else if (length != branchlength) return -1;
 1747     if (*cc != OP_ALT) return length;
 1748     cc += 1 + LINK_SIZE;
 1749     branchlength = 0;
 1750     break;
 1751 
 1752     /* A true recursion implies not fixed length, but a subroutine call may
 1753     be OK. If the subroutine is a forward reference, we can't deal with
 1754     it until the end of the pattern, so return -3. */
 1755 
 1756     case OP_RECURSE:
 1757     if (!atend) return -3;
 1758     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
 1759     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
 1760     if (cc > cs && cc < ce) return -1;                    /* Recursion */
 1761     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
 1762     if (d < 0) return d;
 1763     branchlength += d;
 1764     cc += 1 + LINK_SIZE;
 1765     break;
 1766 
 1767     /* Skip over assertive subpatterns */
 1768 
 1769     case OP_ASSERT:
 1770     case OP_ASSERT_NOT:
 1771     case OP_ASSERTBACK:
 1772     case OP_ASSERTBACK_NOT:
 1773     do cc += GET(cc, 1); while (*cc == OP_ALT);
 1774     cc += PRIV(OP_lengths)[*cc];
 1775     break;
 1776 
 1777     /* Skip over things that don't match chars */
 1778 
 1779     case OP_MARK:
 1780     case OP_PRUNE_ARG:
 1781     case OP_SKIP_ARG:
 1782     case OP_THEN_ARG:
 1783     cc += cc[1] + PRIV(OP_lengths)[*cc];
 1784     break;
 1785 
 1786     case OP_CALLOUT:
 1787     case OP_CIRC:
 1788     case OP_CIRCM:
 1789     case OP_CLOSE:
 1790     case OP_COMMIT:
 1791     case OP_CREF:
 1792     case OP_DEF:
 1793     case OP_DOLL:
 1794     case OP_DOLLM:
 1795     case OP_EOD:
 1796     case OP_EODN:
 1797     case OP_FAIL:
 1798     case OP_NCREF:
 1799     case OP_NRREF:
 1800     case OP_NOT_WORD_BOUNDARY:
 1801     case OP_PRUNE:
 1802     case OP_REVERSE:
 1803     case OP_RREF:
 1804     case OP_SET_SOM:
 1805     case OP_SKIP:
 1806     case OP_SOD:
 1807     case OP_SOM:
 1808     case OP_THEN:
 1809     case OP_WORD_BOUNDARY:
 1810     cc += PRIV(OP_lengths)[*cc];
 1811     break;
 1812 
 1813     /* Handle literal characters */
 1814 
 1815     case OP_CHAR:
 1816     case OP_CHARI:
 1817     case OP_NOT:
 1818     case OP_NOTI:
 1819     branchlength++;
 1820     cc += 2;
 1821 #ifdef SUPPORT_UTF
 1822     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 1823 #endif
 1824     break;
 1825 
 1826     /* Handle exact repetitions. The count is already in characters, but we
 1827     need to skip over a multibyte character in UTF8 mode.  */
 1828 
 1829     case OP_EXACT:
 1830     case OP_EXACTI:
 1831     case OP_NOTEXACT:
 1832     case OP_NOTEXACTI:
 1833     branchlength += GET2(cc,1);
 1834     cc += 2 + IMM2_SIZE;
 1835 #ifdef SUPPORT_UTF
 1836     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 1837 #endif
 1838     break;
 1839 
 1840     case OP_TYPEEXACT:
 1841     branchlength += GET2(cc,1);
 1842     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
 1843     cc += 1 + IMM2_SIZE + 1;
 1844     break;
 1845 
 1846     /* Handle single-char matchers */
 1847 
 1848     case OP_PROP:
 1849     case OP_NOTPROP:
 1850     cc += 2;
 1851     /* Fall through */
 1852 
 1853     case OP_HSPACE:
 1854     case OP_VSPACE:
 1855     case OP_NOT_HSPACE:
 1856     case OP_NOT_VSPACE:
 1857     case OP_NOT_DIGIT:
 1858     case OP_DIGIT:
 1859     case OP_NOT_WHITESPACE:
 1860     case OP_WHITESPACE:
 1861     case OP_NOT_WORDCHAR:
 1862     case OP_WORDCHAR:
 1863     case OP_ANY:
 1864     case OP_ALLANY:
 1865     branchlength++;
 1866     cc++;
 1867     break;
 1868 
 1869     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
 1870     otherwise \C is coded as OP_ALLANY. */
 1871 
 1872     case OP_ANYBYTE:
 1873     return -2;
 1874 
 1875     /* Check a class for variable quantification */
 1876 
 1877 #if defined SUPPORT_UTF || defined COMPILE_PCRE16
 1878     case OP_XCLASS:
 1879     cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
 1880     /* Fall through */
 1881 #endif
 1882 
 1883     case OP_CLASS:
 1884     case OP_NCLASS:
 1885     cc += PRIV(OP_lengths)[OP_CLASS];
 1886 
 1887     switch (*cc)
 1888       {
 1889       case OP_CRPLUS:
 1890       case OP_CRMINPLUS:
 1891       case OP_CRSTAR:
 1892       case OP_CRMINSTAR:
 1893       case OP_CRQUERY:
 1894       case OP_CRMINQUERY:
 1895       return -1;
 1896 
 1897       case OP_CRRANGE:
 1898       case OP_CRMINRANGE:
 1899       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
 1900       branchlength += GET2(cc,1);
 1901       cc += 1 + 2 * IMM2_SIZE;
 1902       break;
 1903 
 1904       default:
 1905       branchlength++;
 1906       }
 1907     break;
 1908 
 1909     /* Anything else is variable length */
 1910 
 1911     case OP_ANYNL:
 1912     case OP_BRAMINZERO:
 1913     case OP_BRAPOS:
 1914     case OP_BRAPOSZERO:
 1915     case OP_BRAZERO:
 1916     case OP_CBRAPOS:
 1917     case OP_EXTUNI:
 1918     case OP_KETRMAX:
 1919     case OP_KETRMIN:
 1920     case OP_KETRPOS:
 1921     case OP_MINPLUS:
 1922     case OP_MINPLUSI:
 1923     case OP_MINQUERY:
 1924     case OP_MINQUERYI:
 1925     case OP_MINSTAR:
 1926     case OP_MINSTARI:
 1927     case OP_MINUPTO:
 1928     case OP_MINUPTOI:
 1929     case OP_NOTMINPLUS:
 1930     case OP_NOTMINPLUSI:
 1931     case OP_NOTMINQUERY:
 1932     case OP_NOTMINQUERYI:
 1933     case OP_NOTMINSTAR:
 1934     case OP_NOTMINSTARI:
 1935     case OP_NOTMINUPTO:
 1936     case OP_NOTMINUPTOI:
 1937     case OP_NOTPLUS:
 1938     case OP_NOTPLUSI:
 1939     case OP_NOTPOSPLUS:
 1940     case OP_NOTPOSPLUSI:
 1941     case OP_NOTPOSQUERY:
 1942     case OP_NOTPOSQUERYI:
 1943     case OP_NOTPOSSTAR:
 1944     case OP_NOTPOSSTARI:
 1945     case OP_NOTPOSUPTO:
 1946     case OP_NOTPOSUPTOI:
 1947     case OP_NOTQUERY:
 1948     case OP_NOTQUERYI:
 1949     case OP_NOTSTAR:
 1950     case OP_NOTSTARI:
 1951     case OP_NOTUPTO:
 1952     case OP_NOTUPTOI:
 1953     case OP_PLUS:
 1954     case OP_PLUSI:
 1955     case OP_POSPLUS:
 1956     case OP_POSPLUSI:
 1957     case OP_POSQUERY:
 1958     case OP_POSQUERYI:
 1959     case OP_POSSTAR:
 1960     case OP_POSSTARI:
 1961     case OP_POSUPTO:
 1962     case OP_POSUPTOI:
 1963     case OP_QUERY:
 1964     case OP_QUERYI:
 1965     case OP_REF:
 1966     case OP_REFI:
 1967     case OP_SBRA:
 1968     case OP_SBRAPOS:
 1969     case OP_SCBRA:
 1970     case OP_SCBRAPOS:
 1971     case OP_SCOND:
 1972     case OP_SKIPZERO:
 1973     case OP_STAR:
 1974     case OP_STARI:
 1975     case OP_TYPEMINPLUS:
 1976     case OP_TYPEMINQUERY:
 1977     case OP_TYPEMINSTAR:
 1978     case OP_TYPEMINUPTO:
 1979     case OP_TYPEPLUS:
 1980     case OP_TYPEPOSPLUS:
 1981     case OP_TYPEPOSQUERY:
 1982     case OP_TYPEPOSSTAR:
 1983     case OP_TYPEPOSUPTO:
 1984     case OP_TYPEQUERY:
 1985     case OP_TYPESTAR:
 1986     case OP_TYPEUPTO:
 1987     case OP_UPTO:
 1988     case OP_UPTOI:
 1989     return -1;
 1990 
 1991     /* Catch unrecognized opcodes so that when new ones are added they
 1992     are not forgotten, as has happened in the past. */
 1993 
 1994     default:
 1995     return -4;
 1996     }
 1997   }
 1998 /* Control never gets here */
 1999 }
 2000 
 2001 
 2002 
 2003 
 2004 /*************************************************
 2005 *    Scan compiled regex for specific bracket    *
 2006 *************************************************/
 2007 
 2008 /* This little function scans through a compiled pattern until it finds a
 2009 capturing bracket with the given number, or, if the number is negative, an
 2010 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
 2011 so that it can be called from pcre_study() when finding the minimum matching
 2012 length.
 2013 
 2014 Arguments:
 2015   code        points to start of expression
 2016   utf         TRUE in UTF-8 / UTF-16 mode
 2017   number      the required bracket number or negative to find a lookbehind
 2018 
 2019 Returns:      pointer to the opcode for the bracket, or NULL if not found
 2020 */
 2021 
 2022 const pcre_uchar *
 2023 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
 2024 {
 2025 for (;;)
 2026   {
 2027   register int c = *code;
 2028 
 2029   if (c == OP_END) return NULL;
 2030 
 2031   /* XCLASS is used for classes that cannot be represented just by a bit
 2032   map. This includes negated single high-valued characters. The length in
 2033   the table is zero; the actual length is stored in the compiled code. */
 2034 
 2035   if (c == OP_XCLASS) code += GET(code, 1);
 2036 
 2037   /* Handle recursion */
 2038 
 2039   else if (c == OP_REVERSE)
 2040     {
 2041     if (number < 0) return (pcre_uchar *)code;
 2042     code += PRIV(OP_lengths)[c];
 2043     }
 2044 
 2045   /* Handle capturing bracket */
 2046 
 2047   else if (c == OP_CBRA || c == OP_SCBRA ||
 2048            c == OP_CBRAPOS || c == OP_SCBRAPOS)
 2049     {
 2050     int n = GET2(code, 1+LINK_SIZE);
 2051     if (n == number) return (pcre_uchar *)code;
 2052     code += PRIV(OP_lengths)[c];
 2053     }
 2054 
 2055   /* Otherwise, we can get the item's length from the table, except that for
 2056   repeated character types, we have to test for \p and \P, which have an extra
 2057   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
 2058   must add in its length. */
 2059 
 2060   else
 2061     {
 2062     switch(c)
 2063       {
 2064       case OP_TYPESTAR:
 2065       case OP_TYPEMINSTAR:
 2066       case OP_TYPEPLUS:
 2067       case OP_TYPEMINPLUS:
 2068       case OP_TYPEQUERY:
 2069       case OP_TYPEMINQUERY:
 2070       case OP_TYPEPOSSTAR:
 2071       case OP_TYPEPOSPLUS:
 2072       case OP_TYPEPOSQUERY:
 2073       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
 2074       break;
 2075 
 2076       case OP_TYPEUPTO:
 2077       case OP_TYPEMINUPTO:
 2078       case OP_TYPEEXACT:
 2079       case OP_TYPEPOSUPTO:
 2080       if (code[1 + IMM2_SIZE] == OP_PROP
 2081         || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
 2082       break;
 2083 
 2084       case OP_MARK:
 2085       case OP_PRUNE_ARG:
 2086       case OP_SKIP_ARG:
 2087       code += code[1];
 2088       break;
 2089 
 2090       case OP_THEN_ARG:
 2091       code += code[1];
 2092       break;
 2093       }
 2094 
 2095     /* Add in the fixed length from the table */
 2096 
 2097     code += PRIV(OP_lengths)[c];
 2098 
 2099   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
 2100   a multi-byte character. The length in the table is a minimum, so we have to
 2101   arrange to skip the extra bytes. */
 2102 
 2103 #ifdef SUPPORT_UTF
 2104     if (utf) switch(c)
 2105       {
 2106       case OP_CHAR:
 2107       case OP_CHARI:
 2108       case OP_EXACT:
 2109       case OP_EXACTI:
 2110       case OP_UPTO:
 2111       case OP_UPTOI:
 2112       case OP_MINUPTO:
 2113       case OP_MINUPTOI:
 2114       case OP_POSUPTO:
 2115       case OP_POSUPTOI:
 2116       case OP_STAR:
 2117       case OP_STARI:
 2118       case OP_MINSTAR:
 2119       case OP_MINSTARI:
 2120       case OP_POSSTAR:
 2121       case OP_POSSTARI:
 2122       case OP_PLUS:
 2123       case OP_PLUSI:
 2124       case OP_MINPLUS:
 2125       case OP_MINPLUSI:
 2126       case OP_POSPLUS:
 2127       case OP_POSPLUSI:
 2128       case OP_QUERY:
 2129       case OP_QUERYI:
 2130       case OP_MINQUERY:
 2131       case OP_MINQUERYI:
 2132       case OP_POSQUERY:
 2133       case OP_POSQUERYI:
 2134       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
 2135       break;
 2136       }
 2137 #else
 2138     (void)(utf);  /* Keep compiler happy by referencing function argument */
 2139 #endif
 2140     }
 2141   }
 2142 }
 2143 
 2144 
 2145 
 2146 /*************************************************
 2147 *   Scan compiled regex for recursion reference  *
 2148 *************************************************/
 2149 
 2150 /* This little function scans through a compiled pattern until it finds an
 2151 instance of OP_RECURSE.
 2152 
 2153 Arguments:
 2154   code        points to start of expression
 2155   utf         TRUE in UTF-8 / UTF-16 mode
 2156 
 2157 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
 2158 */
 2159 
 2160 static const pcre_uchar *
 2161 find_recurse(const pcre_uchar *code, BOOL utf)
 2162 {
 2163 for (;;)
 2164   {
 2165   register int c = *code;
 2166   if (c == OP_END) return NULL;
 2167   if (c == OP_RECURSE) return code;
 2168 
 2169   /* XCLASS is used for classes that cannot be represented just by a bit
 2170   map. This includes negated single high-valued characters. The length in
 2171   the table is zero; the actual length is stored in the compiled code. */
 2172 
 2173   if (c == OP_XCLASS) code += GET(code, 1);
 2174 
 2175   /* Otherwise, we can get the item's length from the table, except that for
 2176   repeated character types, we have to test for \p and \P, which have an extra
 2177   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
 2178   must add in its length. */
 2179 
 2180   else
 2181     {
 2182     switch(c)
 2183       {
 2184       case OP_TYPESTAR:
 2185       case OP_TYPEMINSTAR:
 2186       case OP_TYPEPLUS:
 2187       case OP_TYPEMINPLUS:
 2188       case OP_TYPEQUERY:
 2189       case OP_TYPEMINQUERY:
 2190       case OP_TYPEPOSSTAR:
 2191       case OP_TYPEPOSPLUS:
 2192       case OP_TYPEPOSQUERY:
 2193       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
 2194       break;
 2195 
 2196       case OP_TYPEPOSUPTO:
 2197       case OP_TYPEUPTO:
 2198       case OP_TYPEMINUPTO:
 2199       case OP_TYPEEXACT:
 2200       if (code[1 + IMM2_SIZE] == OP_PROP
 2201         || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
 2202       break;
 2203 
 2204       case OP_MARK:
 2205       case OP_PRUNE_ARG:
 2206       case OP_SKIP_ARG:
 2207       code += code[1];
 2208       break;
 2209 
 2210       case OP_THEN_ARG:
 2211       code += code[1];
 2212       break;
 2213       }
 2214 
 2215     /* Add in the fixed length from the table */
 2216 
 2217     code += PRIV(OP_lengths)[c];
 2218 
 2219     /* In UTF-8 mode, opcodes that are followed by a character may be followed
 2220     by a multi-byte character. The length in the table is a minimum, so we have
 2221     to arrange to skip the extra bytes. */
 2222 
 2223 #ifdef SUPPORT_UTF
 2224     if (utf) switch(c)
 2225       {
 2226       case OP_CHAR:
 2227       case OP_CHARI:
 2228       case OP_EXACT:
 2229       case OP_EXACTI:
 2230       case OP_UPTO:
 2231       case OP_UPTOI:
 2232       case OP_MINUPTO:
 2233       case OP_MINUPTOI:
 2234       case OP_POSUPTO:
 2235       case OP_POSUPTOI:
 2236       case OP_STAR:
 2237       case OP_STARI:
 2238       case OP_MINSTAR:
 2239       case OP_MINSTARI:
 2240       case OP_POSSTAR:
 2241       case OP_POSSTARI:
 2242       case OP_PLUS:
 2243       case OP_PLUSI:
 2244       case OP_MINPLUS:
 2245       case OP_MINPLUSI:
 2246       case OP_POSPLUS:
 2247       case OP_POSPLUSI:
 2248       case OP_QUERY:
 2249       case OP_QUERYI:
 2250       case OP_MINQUERY:
 2251       case OP_MINQUERYI:
 2252       case OP_POSQUERY:
 2253       case OP_POSQUERYI:
 2254       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
 2255       break;
 2256       }
 2257 #else
 2258     (void)(utf);  /* Keep compiler happy by referencing function argument */
 2259 #endif
 2260     }
 2261   }
 2262 }
 2263 
 2264 
 2265 
 2266 /*************************************************
 2267 *    Scan compiled branch for non-emptiness      *
 2268 *************************************************/
 2269 
 2270 /* This function scans through a branch of a compiled pattern to see whether it
 2271 can match the empty string or not. It is called from could_be_empty()
 2272 below and from compile_branch() when checking for an unlimited repeat of a
 2273 group that can match nothing. Note that first_significant_code() skips over
 2274 backward and negative forward assertions when its final argument is TRUE. If we
 2275 hit an unclosed bracket, we return "empty" - this means we've struck an inner
 2276 bracket whose current branch will already have been scanned.
 2277 
 2278 Arguments:
 2279   code        points to start of search
 2280   endcode     points to where to stop
 2281   utf         TRUE if in UTF-8 / UTF-16 mode
 2282   cd          contains pointers to tables etc.
 2283 
 2284 Returns:      TRUE if what is matched could be empty
 2285 */
 2286 
 2287 static BOOL
 2288 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
 2289   BOOL utf, compile_data *cd)
 2290 {
 2291 register int c;
 2292 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
 2293      code < endcode;
 2294      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
 2295   {
 2296   const pcre_uchar *ccode;
 2297 
 2298   c = *code;
 2299 
 2300   /* Skip over forward assertions; the other assertions are skipped by
 2301   first_significant_code() with a TRUE final argument. */
 2302 
 2303   if (c == OP_ASSERT)
 2304     {
 2305     do code += GET(code, 1); while (*code == OP_ALT);
 2306     c = *code;
 2307     continue;
 2308     }
 2309 
 2310   /* For a recursion/subroutine call, if its end has been reached, which
 2311   implies a backward reference subroutine call, we can scan it. If it's a
 2312   forward reference subroutine call, we can't. To detect forward reference
 2313   we have to scan up the list that is kept in the workspace. This function is
 2314   called only when doing the real compile, not during the pre-compile that
 2315   measures the size of the compiled pattern. */
 2316 
 2317   if (c == OP_RECURSE)
 2318     {
 2319     const pcre_uchar *scode;
 2320     BOOL empty_branch;
 2321 
 2322     /* Test for forward reference */
 2323 
 2324     for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
 2325       if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
 2326 
 2327     /* Not a forward reference, test for completed backward reference */
 2328 
 2329     empty_branch = FALSE;
 2330     scode = cd->start_code + GET(code, 1);
 2331     if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
 2332 
 2333     /* Completed backwards reference */
 2334 
 2335     do
 2336       {
 2337       if (could_be_empty_branch(scode, endcode, utf, cd))
 2338         {
 2339         empty_branch = TRUE;
 2340         break;
 2341         }
 2342       scode += GET(scode, 1);
 2343       }
 2344     while (*scode == OP_ALT);
 2345 
 2346     if (!empty_branch) return FALSE;  /* All branches are non-empty */
 2347     continue;
 2348     }
 2349 
 2350   /* Groups with zero repeats can of course be empty; skip them. */
 2351 
 2352   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
 2353       c == OP_BRAPOSZERO)
 2354     {
 2355     code += PRIV(OP_lengths)[c];
 2356     do code += GET(code, 1); while (*code == OP_ALT);
 2357     c = *code;
 2358     continue;
 2359     }
 2360 
 2361   /* A nested group that is already marked as "could be empty" can just be
 2362   skipped. */
 2363 
 2364   if (c == OP_SBRA  || c == OP_SBRAPOS ||
 2365       c == OP_SCBRA || c == OP_SCBRAPOS)
 2366     {
 2367     do code += GET(code, 1); while (*code == OP_ALT);
 2368     c = *code;
 2369     continue;
 2370     }
 2371 
 2372   /* For other groups, scan the branches. */
 2373 
 2374   if (c == OP_BRA  || c == OP_BRAPOS ||
 2375       c == OP_CBRA || c == OP_CBRAPOS ||
 2376       c == OP_ONCE || c == OP_ONCE_NC ||
 2377       c == OP_COND)
 2378     {
 2379     BOOL empty_branch;
 2380     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
 2381 
 2382     /* If a conditional group has only one branch, there is a second, implied,
 2383     empty branch, so just skip over the conditional, because it could be empty.
 2384     Otherwise, scan the individual branches of the group. */
 2385 
 2386     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
 2387       code += GET(code, 1);
 2388     else
 2389       {
 2390       empty_branch = FALSE;
 2391       do
 2392         {
 2393         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
 2394           empty_branch = TRUE;
 2395         code += GET(code, 1);
 2396         }
 2397       while (*code == OP_ALT);
 2398       if (!empty_branch) return FALSE;   /* All branches are non-empty */
 2399       }
 2400 
 2401     c = *code;
 2402     continue;
 2403     }
 2404 
 2405   /* Handle the other opcodes */
 2406 
 2407   switch (c)
 2408     {
 2409     /* Check for quantifiers after a class. XCLASS is used for classes that
 2410     cannot be represented just by a bit map. This includes negated single
 2411     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
 2412     actual length is stored in the compiled code, so we must update "code"
 2413     here. */
 2414 
 2415 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 2416     case OP_XCLASS:
 2417     ccode = code += GET(code, 1);
 2418     goto CHECK_CLASS_REPEAT;
 2419 #endif
 2420 
 2421     case OP_CLASS:
 2422     case OP_NCLASS:
 2423     ccode = code + PRIV(OP_lengths)[OP_CLASS];
 2424 
 2425 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 2426     CHECK_CLASS_REPEAT:
 2427 #endif
 2428 
 2429     switch (*ccode)
 2430       {
 2431       case OP_CRSTAR:            /* These could be empty; continue */
 2432       case OP_CRMINSTAR:
 2433       case OP_CRQUERY:
 2434       case OP_CRMINQUERY:
 2435       break;
 2436 
 2437       default:                   /* Non-repeat => class must match */
 2438       case OP_CRPLUS:            /* These repeats aren't empty */
 2439       case OP_CRMINPLUS:
 2440       return FALSE;
 2441 
 2442       case OP_CRRANGE:
 2443       case OP_CRMINRANGE:
 2444       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
 2445       break;
 2446       }
 2447     break;
 2448 
 2449     /* Opcodes that must match a character */
 2450 
 2451     case OP_PROP:
 2452     case OP_NOTPROP:
 2453     case OP_EXTUNI:
 2454     case OP_NOT_DIGIT:
 2455     case OP_DIGIT:
 2456     case OP_NOT_WHITESPACE:
 2457     case OP_WHITESPACE:
 2458     case OP_NOT_WORDCHAR:
 2459     case OP_WORDCHAR:
 2460     case OP_ANY:
 2461     case OP_ALLANY:
 2462     case OP_ANYBYTE:
 2463     case OP_CHAR:
 2464     case OP_CHARI:
 2465     case OP_NOT:
 2466     case OP_NOTI:
 2467     case OP_PLUS:
 2468     case OP_MINPLUS:
 2469     case OP_POSPLUS:
 2470     case OP_EXACT:
 2471     case OP_NOTPLUS:
 2472     case OP_NOTMINPLUS:
 2473     case OP_NOTPOSPLUS:
 2474     case OP_NOTEXACT:
 2475     case OP_TYPEPLUS:
 2476     case OP_TYPEMINPLUS:
 2477     case OP_TYPEPOSPLUS:
 2478     case OP_TYPEEXACT:
 2479     return FALSE;
 2480 
 2481     /* These are going to continue, as they may be empty, but we have to
 2482     fudge the length for the \p and \P cases. */
 2483 
 2484     case OP_TYPESTAR:
 2485     case OP_TYPEMINSTAR:
 2486     case OP_TYPEPOSSTAR:
 2487     case OP_TYPEQUERY:
 2488     case OP_TYPEMINQUERY:
 2489     case OP_TYPEPOSQUERY:
 2490     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
 2491     break;
 2492 
 2493     /* Same for these */
 2494 
 2495     case OP_TYPEUPTO:
 2496     case OP_TYPEMINUPTO:
 2497     case OP_TYPEPOSUPTO:
 2498     if (code[1 + IMM2_SIZE] == OP_PROP
 2499       || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
 2500     break;
 2501 
 2502     /* End of branch */
 2503 
 2504     case OP_KET:
 2505     case OP_KETRMAX:
 2506     case OP_KETRMIN:
 2507     case OP_KETRPOS:
 2508     case OP_ALT:
 2509     return TRUE;
 2510 
 2511     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
 2512     MINUPTO, and POSUPTO may be followed by a multibyte character */
 2513 
 2514 #ifdef SUPPORT_UTF
 2515     case OP_STAR:
 2516     case OP_STARI:
 2517     case OP_MINSTAR:
 2518     case OP_MINSTARI:
 2519     case OP_POSSTAR:
 2520     case OP_POSSTARI:
 2521     case OP_QUERY:
 2522     case OP_QUERYI:
 2523     case OP_MINQUERY:
 2524     case OP_MINQUERYI:
 2525     case OP_POSQUERY:
 2526     case OP_POSQUERYI:
 2527     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
 2528     break;
 2529 
 2530     case OP_UPTO:
 2531     case OP_UPTOI:
 2532     case OP_MINUPTO:
 2533     case OP_MINUPTOI:
 2534     case OP_POSUPTO:
 2535     case OP_POSUPTOI:
 2536     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
 2537     break;
 2538 #endif
 2539 
 2540     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
 2541     string. */
 2542 
 2543     case OP_MARK:
 2544     case OP_PRUNE_ARG:
 2545     case OP_SKIP_ARG:
 2546     code += code[1];
 2547     break;
 2548 
 2549     case OP_THEN_ARG:
 2550     code += code[1];
 2551     break;
 2552 
 2553     /* None of the remaining opcodes are required to match a character. */
 2554 
 2555     default:
 2556     break;
 2557     }
 2558   }
 2559 
 2560 return TRUE;
 2561 }
 2562 
 2563 
 2564 
 2565 /*************************************************
 2566 *    Scan compiled regex for non-emptiness       *
 2567 *************************************************/
 2568 
 2569 /* This function is called to check for left recursive calls. We want to check
 2570 the current branch of the current pattern to see if it could match the empty
 2571 string. If it could, we must look outwards for branches at other levels,
 2572 stopping when we pass beyond the bracket which is the subject of the recursion.
 2573 This function is called only during the real compile, not during the
 2574 pre-compile.
 2575 
 2576 Arguments:
 2577   code        points to start of the recursion
 2578   endcode     points to where to stop (current RECURSE item)
 2579   bcptr       points to the chain of current (unclosed) branch starts
 2580   utf         TRUE if in UTF-8 / UTF-16 mode
 2581   cd          pointers to tables etc
 2582 
 2583 Returns:      TRUE if what is matched could be empty
 2584 */
 2585 
 2586 static BOOL
 2587 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
 2588   branch_chain *bcptr, BOOL utf, compile_data *cd)
 2589 {
 2590 while (bcptr != NULL && bcptr->current_branch >= code)
 2591   {
 2592   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
 2593     return FALSE;
 2594   bcptr = bcptr->outer;
 2595   }
 2596 return TRUE;
 2597 }
 2598 
 2599 
 2600 
 2601 /*************************************************
 2602 *           Check for POSIX class syntax         *
 2603 *************************************************/
 2604 
 2605 /* This function is called when the sequence "[:" or "[." or "[=" is
 2606 encountered in a character class. It checks whether this is followed by a
 2607 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
 2608 reach an unescaped ']' without the special preceding character, return FALSE.
 2609 
 2610 Originally, this function only recognized a sequence of letters between the
 2611 terminators, but it seems that Perl recognizes any sequence of characters,
 2612 though of course unknown POSIX names are subsequently rejected. Perl gives an
 2613 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
 2614 didn't consider this to be a POSIX class. Likewise for [:1234:].
 2615 
 2616 The problem in trying to be exactly like Perl is in the handling of escapes. We
 2617 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
 2618 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
 2619 below handles the special case of \], but does not try to do any other escape
 2620 processing. This makes it different from Perl for cases such as [:l\ower:]
 2621 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
 2622 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
 2623 I think.
 2624 
 2625 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
 2626 It seems that the appearance of a nested POSIX class supersedes an apparent
 2627 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
 2628 a digit.
 2629 
 2630 In Perl, unescaped square brackets may also appear as part of class names. For
 2631 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
 2632 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
 2633 seem right at all. PCRE does not allow closing square brackets in POSIX class
 2634 names.
 2635 
 2636 Arguments:
 2637   ptr      pointer to the initial [
 2638   endptr   where to return the end pointer
 2639 
 2640 Returns:   TRUE or FALSE
 2641 */
 2642 
 2643 static BOOL
 2644 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
 2645 {
 2646 int terminator;          /* Don't combine these lines; the Solaris cc */
 2647 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
 2648 for (++ptr; *ptr != 0; ptr++)
 2649   {
 2650   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
 2651     ptr++;
 2652   else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
 2653   else
 2654     {
 2655     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
 2656       {
 2657       *endptr = ptr;
 2658       return TRUE;
 2659       }
 2660     if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
 2661          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
 2662           ptr[1] == CHAR_EQUALS_SIGN) &&
 2663         check_posix_syntax(ptr, endptr))
 2664       return FALSE;
 2665     }
 2666   }
 2667 return FALSE;
 2668 }
 2669 
 2670 
 2671 
 2672 
 2673 /*************************************************
 2674 *          Check POSIX class name                *
 2675 *************************************************/
 2676 
 2677 /* This function is called to check the name given in a POSIX-style class entry
 2678 such as [:alnum:].
 2679 
 2680 Arguments:
 2681   ptr        points to the first letter
 2682   len        the length of the name
 2683 
 2684 Returns:     a value representing the name, or -1 if unknown
 2685 */
 2686 
 2687 static int
 2688 check_posix_name(const pcre_uchar *ptr, int len)
 2689 {
 2690 const char *pn = posix_names;
 2691 register int yield = 0;
 2692 while (posix_name_lengths[yield] != 0)
 2693   {
 2694   if (len == posix_name_lengths[yield] &&
 2695     STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
 2696   pn += posix_name_lengths[yield] + 1;
 2697   yield++;
 2698   }
 2699 return -1;
 2700 }
 2701 
 2702 
 2703 /*************************************************
 2704 *    Adjust OP_RECURSE items in repeated group   *
 2705 *************************************************/
 2706 
 2707 /* OP_RECURSE items contain an offset from the start of the regex to the group
 2708 that is referenced. This means that groups can be replicated for fixed
 2709 repetition simply by copying (because the recursion is allowed to refer to
 2710 earlier groups that are outside the current group). However, when a group is
 2711 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
 2712 inserted before it, after it has been compiled. This means that any OP_RECURSE
 2713 items within it that refer to the group itself or any contained groups have to
 2714 have their offsets adjusted. That one of the jobs of this function. Before it
 2715 is called, the partially compiled regex must be temporarily terminated with
 2716 OP_END.
 2717 
 2718 This function has been extended with the possibility of forward references for
 2719 recursions and subroutine calls. It must also check the list of such references
 2720 for the group we are dealing with. If it finds that one of the recursions in
 2721 the current group is on this list, it adjusts the offset in the list, not the
 2722 value in the reference (which is a group number).
 2723 
 2724 Arguments:
 2725   group      points to the start of the group
 2726   adjust     the amount by which the group is to be moved
 2727   utf        TRUE in UTF-8 / UTF-16 mode
 2728   cd         contains pointers to tables etc.
 2729   save_hwm   the hwm forward reference pointer at the start of the group
 2730 
 2731 Returns:     nothing
 2732 */
 2733 
 2734 static void
 2735 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
 2736   pcre_uchar *save_hwm)
 2737 {
 2738 pcre_uchar *ptr = group;
 2739 
 2740 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
 2741   {
 2742   int offset;
 2743   pcre_uchar *hc;
 2744 
 2745   /* See if this recursion is on the forward reference list. If so, adjust the
 2746   reference. */
 2747 
 2748   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
 2749     {
 2750     offset = GET(hc, 0);
 2751     if (cd->start_code + offset == ptr + 1)
 2752       {
 2753       PUT(hc, 0, offset + adjust);
 2754       break;
 2755       }
 2756     }
 2757 
 2758   /* Otherwise, adjust the recursion offset if it's after the start of this
 2759   group. */
 2760 
 2761   if (hc >= cd->hwm)
 2762     {
 2763     offset = GET(ptr, 1);
 2764     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
 2765     }
 2766 
 2767   ptr += 1 + LINK_SIZE;
 2768   }
 2769 }
 2770 
 2771 
 2772 
 2773 /*************************************************
 2774 *        Insert an automatic callout point       *
 2775 *************************************************/
 2776 
 2777 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
 2778 callout points before each pattern item.
 2779 
 2780 Arguments:
 2781   code           current code pointer
 2782   ptr            current pattern pointer
 2783   cd             pointers to tables etc
 2784 
 2785 Returns:         new code pointer
 2786 */
 2787 
 2788 static pcre_uchar *
 2789 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
 2790 {
 2791 *code++ = OP_CALLOUT;
 2792 *code++ = 255;
 2793 *((void **)code)++ = NULL;
 2794 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
 2795 PUT(code, LINK_SIZE, 0);                       /* Default length */
 2796 return code + 2 * LINK_SIZE;
 2797 }
 2798 
 2799 
 2800 
 2801 /*************************************************
 2802 *         Complete a callout item                *
 2803 *************************************************/
 2804 
 2805 /* A callout item contains the length of the next item in the pattern, which
 2806 we can't fill in till after we have reached the relevant point. This is used
 2807 for both automatic and manual callouts.
 2808 
 2809 Arguments:
 2810   previous_callout   points to previous callout item
 2811   ptr                current pattern pointer
 2812   cd                 pointers to tables etc
 2813 
 2814 Returns:             nothing
 2815 */
 2816 
 2817 static void
 2818 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
 2819 {
 2820 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2 + IMMPTR_SIZE));
 2821 PUT(previous_callout, 2 + IMMPTR_SIZE + LINK_SIZE, length);
 2822 }
 2823 
 2824 
 2825 
 2826 #ifdef SUPPORT_UCP
 2827 /*************************************************
 2828 *           Get othercase range                  *
 2829 *************************************************/
 2830 
 2831 /* This function is passed the start and end of a class range, in UTF-8 mode
 2832 with UCP support. It searches up the characters, looking for internal ranges of
 2833 characters in the "other" case. Each call returns the next one, updating the
 2834 start address.
 2835 
 2836 Arguments:
 2837   cptr        points to starting character value; updated
 2838   d           end value
 2839   ocptr       where to put start of othercase range
 2840   odptr       where to put end of othercase range
 2841 
 2842 Yield:        TRUE when range returned; FALSE when no more
 2843 */
 2844 
 2845 static BOOL
 2846 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
 2847   unsigned int *odptr)
 2848 {
 2849 unsigned int c, othercase, next;
 2850 
 2851 for (c = *cptr; c <= d; c++)
 2852   { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
 2853 
 2854 if (c > d) return FALSE;
 2855 
 2856 *ocptr = othercase;
 2857 next = othercase + 1;
 2858 
 2859 for (++c; c <= d; c++)
 2860   {
 2861   if (UCD_OTHERCASE(c) != next) break;
 2862   next++;
 2863   }
 2864 
 2865 *odptr = next - 1;
 2866 *cptr = c;
 2867 
 2868 return TRUE;
 2869 }
 2870 
 2871 
 2872 
 2873 /*************************************************
 2874 *        Check a character and a property        *
 2875 *************************************************/
 2876 
 2877 /* This function is called by check_auto_possessive() when a property item
 2878 is adjacent to a fixed character.
 2879 
 2880 Arguments:
 2881   c            the character
 2882   ptype        the property type
 2883   pdata        the data for the type
 2884   negated      TRUE if it's a negated property (\P or \p{^)
 2885 
 2886 Returns:       TRUE if auto-possessifying is OK
 2887 */
 2888 
 2889 static BOOL
 2890 check_char_prop(int c, int ptype, int pdata, BOOL negated)
 2891 {
 2892 const ucd_record *prop = GET_UCD(c);
 2893 switch(ptype)
 2894   {
 2895   case PT_LAMP:
 2896   return (prop->chartype == ucp_Lu ||
 2897           prop->chartype == ucp_Ll ||
 2898           prop->chartype == ucp_Lt) == negated;
 2899 
 2900   case PT_GC:
 2901   return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
 2902 
 2903   case PT_PC:
 2904   return (pdata == prop->chartype) == negated;
 2905 
 2906   case PT_SC:
 2907   return (pdata == prop->script) == negated;
 2908 
 2909   /* These are specials */
 2910 
 2911   case PT_ALNUM:
 2912   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
 2913           PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
 2914 
 2915   case PT_SPACE:    /* Perl space */
 2916   return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
 2917           c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
 2918           == negated;
 2919 
 2920   case PT_PXSPACE:  /* POSIX space */
 2921   return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
 2922           c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
 2923           c == CHAR_FF || c == CHAR_CR)
 2924           == negated;
 2925 
 2926   case PT_WORD:
 2927   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
 2928           PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
 2929           c == CHAR_UNDERSCORE) == negated;
 2930   }
 2931 return FALSE;
 2932 }
 2933 #endif  /* SUPPORT_UCP */
 2934 
 2935 
 2936 
 2937 /*************************************************
 2938 *     Check if auto-possessifying is possible    *
 2939 *************************************************/
 2940 
 2941 /* This function is called for unlimited repeats of certain items, to see
 2942 whether the next thing could possibly match the repeated item. If not, it makes
 2943 sense to automatically possessify the repeated item.
 2944 
 2945 Arguments:
 2946   previous      pointer to the repeated opcode
 2947   utf           TRUE in UTF-8 / UTF-16 mode
 2948   ptr           next character in pattern
 2949   options       options bits
 2950   cd            contains pointers to tables etc.
 2951 
 2952 Returns:        TRUE if possessifying is wanted
 2953 */
 2954 
 2955 static BOOL
 2956 check_auto_possessive(const pcre_uchar *previous, BOOL utf,
 2957   const pcre_uchar *ptr, int options, compile_data *cd)
 2958 {
 2959 pcre_int32 c, next;
 2960 int op_code = *previous++;
 2961 
 2962 /* Skip whitespace and comments in extended mode */
 2963 
 2964 if ((options & PCRE_EXTENDED) != 0)
 2965   {
 2966   for (;;)
 2967     {
 2968     while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
 2969     if (*ptr == CHAR_NUMBER_SIGN)
 2970       {
 2971       ptr++;
 2972       while (*ptr != 0)
 2973         {
 2974         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
 2975         ptr++;
 2976 #ifdef SUPPORT_UTF
 2977         if (utf) FORWARDCHAR(ptr);
 2978 #endif
 2979         }
 2980       }
 2981     else break;
 2982     }
 2983   }
 2984 
 2985 /* If the next item is one that we can handle, get its value. A non-negative
 2986 value is a character, a negative value is an escape value. */
 2987 
 2988 if (*ptr == CHAR_BACKSLASH)
 2989   {
 2990   int temperrorcode = 0;
 2991   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
 2992   if (temperrorcode != 0) return FALSE;
 2993   ptr++;    /* Point after the escape sequence */
 2994   }
 2995 else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
 2996   {
 2997 #ifdef SUPPORT_UTF
 2998   if (utf) { GETCHARINC(next, ptr); } else
 2999 #endif
 3000   next = *ptr++;
 3001   }
 3002 else return FALSE;
 3003 
 3004 /* Skip whitespace and comments in extended mode */
 3005 
 3006 if ((options & PCRE_EXTENDED) != 0)
 3007   {
 3008   for (;;)
 3009     {
 3010     while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
 3011     if (*ptr == CHAR_NUMBER_SIGN)
 3012       {
 3013       ptr++;
 3014       while (*ptr != 0)
 3015         {
 3016         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
 3017         ptr++;
 3018 #ifdef SUPPORT_UTF
 3019         if (utf) FORWARDCHAR(ptr);
 3020 #endif
 3021         }
 3022       }
 3023     else break;
 3024     }
 3025   }
 3026 
 3027 /* If the next thing is itself optional, we have to give up. */
 3028 
 3029 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
 3030   STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
 3031     return FALSE;
 3032 
 3033 /* Now compare the next item with the previous opcode. First, handle cases when
 3034 the next item is a character. */
 3035 
 3036 if (next >= 0) switch(op_code)
 3037   {
 3038   case OP_CHAR:
 3039 #ifdef SUPPORT_UTF
 3040   GETCHARTEST(c, previous);
 3041 #else
 3042   c = *previous;
 3043 #endif
 3044   return c != next;
 3045 
 3046   /* For CHARI (caseless character) we must check the other case. If we have
 3047   Unicode property support, we can use it to test the other case of
 3048   high-valued characters. */
 3049 
 3050   case OP_CHARI:
 3051 #ifdef SUPPORT_UTF
 3052   GETCHARTEST(c, previous);
 3053 #else
 3054   c = *previous;
 3055 #endif
 3056   if (c == next) return FALSE;
 3057 #ifdef SUPPORT_UTF
 3058   if (utf)
 3059     {
 3060     unsigned int othercase;
 3061     if (next < 128) othercase = cd->fcc[next]; else
 3062 #ifdef SUPPORT_UCP
 3063     othercase = UCD_OTHERCASE((unsigned int)next);
 3064 #else
 3065     othercase = NOTACHAR;
 3066 #endif
 3067     return (unsigned int)c != othercase;
 3068     }
 3069   else
 3070 #endif  /* SUPPORT_UTF */
 3071   return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
 3072 
 3073   /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
 3074   opcodes are not used for multi-byte characters, because they are coded using
 3075   an XCLASS instead. */
 3076 
 3077   case OP_NOT:
 3078   return (c = *previous) == next;
 3079 
 3080   case OP_NOTI:
 3081   if ((c = *previous) == next) return TRUE;
 3082 #ifdef SUPPORT_UTF
 3083   if (utf)
 3084     {
 3085     unsigned int othercase;
 3086     if (next < 128) othercase = cd->fcc[next]; else
 3087 #ifdef SUPPORT_UCP
 3088     othercase = UCD_OTHERCASE(next);
 3089 #else
 3090     othercase = NOTACHAR;
 3091 #endif
 3092     return (unsigned int)c == othercase;
 3093     }
 3094   else
 3095 #endif  /* SUPPORT_UTF */
 3096   return (c == (int)(TABLE_GET((unsigned int)next, cd->fcc, next)));  /* Non-UTF-8 mode */
 3097 
 3098   /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
 3099   When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
 3100 
 3101   case OP_DIGIT:
 3102   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
 3103 
 3104   case OP_NOT_DIGIT:
 3105   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
 3106 
 3107   case OP_WHITESPACE:
 3108   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
 3109 
 3110   case OP_NOT_WHITESPACE:
 3111   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
 3112 
 3113   case OP_WORDCHAR:
 3114   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
 3115 
 3116   case OP_NOT_WORDCHAR:
 3117   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
 3118 
 3119   case OP_HSPACE:
 3120   case OP_NOT_HSPACE:
 3121   switch(next)
 3122     {
 3123     case 0x09:
 3124     case 0x20:
 3125     case 0xa0:
 3126     case 0x1680:
 3127     case 0x180e:
 3128     case 0x2000:
 3129     case 0x2001:
 3130     case 0x2002:
 3131     case 0x2003:
 3132     case 0x2004:
 3133     case 0x2005:
 3134     case 0x2006:
 3135     case 0x2007:
 3136     case 0x2008:
 3137     case 0x2009:
 3138     case 0x200A:
 3139     case 0x202f:
 3140     case 0x205f:
 3141     case 0x3000:
 3142     return op_code == OP_NOT_HSPACE;
 3143     default:
 3144     return op_code != OP_NOT_HSPACE;
 3145     }
 3146 
 3147   case OP_ANYNL:
 3148   case OP_VSPACE:
 3149   case OP_NOT_VSPACE:
 3150   switch(next)
 3151     {
 3152     case 0x0a:
 3153     case 0x0b:
 3154     case 0x0c:
 3155     case 0x0d:
 3156     case 0x85:
 3157     case 0x2028:
 3158     case 0x2029:
 3159     return op_code == OP_NOT_VSPACE;
 3160     default:
 3161     return op_code != OP_NOT_VSPACE;
 3162     }
 3163 
 3164 #ifdef SUPPORT_UCP
 3165   case OP_PROP:
 3166   return check_char_prop(next, previous[0], previous[1], FALSE);
 3167 
 3168   case OP_NOTPROP:
 3169   return check_char_prop(next, previous[0], previous[1], TRUE);
 3170 #endif
 3171 
 3172   default:
 3173   return FALSE;
 3174   }
 3175 
 3176 
 3177 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
 3178 is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
 3179 generated only when PCRE_UCP is *not* set, that is, when only ASCII
 3180 characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
 3181 replaced by OP_PROP codes when PCRE_UCP is set. */
 3182 
 3183 switch(op_code)
 3184   {
 3185   case OP_CHAR:
 3186   case OP_CHARI:
 3187 #ifdef SUPPORT_UTF
 3188   GETCHARTEST(c, previous);
 3189 #else
 3190   c = *previous;
 3191 #endif
 3192   switch(-next)
 3193     {
 3194     case ESC_d:
 3195     return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
 3196 
 3197     case ESC_D:
 3198     return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
 3199 
 3200     case ESC_s:
 3201     return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
 3202 
 3203     case ESC_S:
 3204     return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
 3205 
 3206     case ESC_w:
 3207     return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
 3208 
 3209     case ESC_W:
 3210     return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
 3211 
 3212     case ESC_h:
 3213     case ESC_H:
 3214     switch(c)
 3215       {
 3216       case 0x09:
 3217       case 0x20:
 3218       case 0xa0:
 3219       case 0x1680:
 3220       case 0x180e:
 3221       case 0x2000:
 3222       case 0x2001:
 3223       case 0x2002:
 3224       case 0x2003:
 3225       case 0x2004:
 3226       case 0x2005:
 3227       case 0x2006:
 3228       case 0x2007:
 3229       case 0x2008:
 3230       case 0x2009:
 3231       case 0x200A:
 3232       case 0x202f:
 3233       case 0x205f:
 3234       case 0x3000:
 3235       return -next != ESC_h;
 3236       default:
 3237       return -next == ESC_h;
 3238       }
 3239 
 3240     case ESC_v:
 3241     case ESC_V:
 3242     switch(c)
 3243       {
 3244       case 0x0a:
 3245       case 0x0b:
 3246       case 0x0c:
 3247       case 0x0d:
 3248       case 0x85:
 3249       case 0x2028:
 3250       case 0x2029:
 3251       return -next != ESC_v;
 3252       default:
 3253       return -next == ESC_v;
 3254       }
 3255 
 3256     /* When PCRE_UCP is set, these values get generated for \d etc. Find
 3257     their substitutions and process them. The result will always be either
 3258     -ESC_p or -ESC_P. Then fall through to process those values. */
 3259 
 3260 #ifdef SUPPORT_UCP
 3261     case ESC_du:
 3262     case ESC_DU:
 3263     case ESC_wu:
 3264     case ESC_WU:
 3265     case ESC_su:
 3266     case ESC_SU:
 3267       {
 3268       int temperrorcode = 0;
 3269       ptr = substitutes[-next - ESC_DU];
 3270       next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
 3271       if (temperrorcode != 0) return FALSE;
 3272       ptr++;    /* For compatibility */
 3273       }
 3274     /* Fall through */
 3275 
 3276     case ESC_p:
 3277     case ESC_P:
 3278       {
 3279       int ptype, pdata, errorcodeptr;
 3280       BOOL negated;
 3281 
 3282       ptr--;      /* Make ptr point at the p or P */
 3283       ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
 3284       if (ptype < 0) return FALSE;
 3285       ptr++;      /* Point past the final curly ket */
 3286 
 3287       /* If the property item is optional, we have to give up. (When generated
 3288       from \d etc by PCRE_UCP, this test will have been applied much earlier,
 3289       to the original \d etc. At this point, ptr will point to a zero byte. */
 3290 
 3291       if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
 3292         STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
 3293           return FALSE;
 3294 
 3295       /* Do the property check. */
 3296 
 3297       return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
 3298       }
 3299 #endif
 3300 
 3301     default:
 3302     return FALSE;
 3303     }
 3304 
 3305   /* In principle, support for Unicode properties should be integrated here as
 3306   well. It means re-organizing the above code so as to get hold of the property
 3307   values before switching on the op-code. However, I wonder how many patterns
 3308   combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
 3309   these op-codes are never generated.) */
 3310 
 3311   case OP_DIGIT:
 3312   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
 3313          next == -ESC_h || next == -ESC_v || next == -ESC_R;
 3314 
 3315   case OP_NOT_DIGIT:
 3316   return next == -ESC_d;
 3317 
 3318   case OP_WHITESPACE:
 3319   return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
 3320 
 3321   case OP_NOT_WHITESPACE:
 3322   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
 3323 
 3324   case OP_HSPACE:
 3325   return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
 3326          next == -ESC_w || next == -ESC_v || next == -ESC_R;
 3327 
 3328   case OP_NOT_HSPACE:
 3329   return next == -ESC_h;
 3330 
 3331   /* Can't have \S in here because VT matches \S (Perl anomaly) */
 3332   case OP_ANYNL:
 3333   case OP_VSPACE:
 3334   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
 3335 
 3336   case OP_NOT_VSPACE:
 3337   return next == -ESC_v || next == -ESC_R;
 3338 
 3339   case OP_WORDCHAR:
 3340   return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
 3341          next == -ESC_v || next == -ESC_R;
 3342 
 3343   case OP_NOT_WORDCHAR:
 3344   return next == -ESC_w || next == -ESC_d;
 3345 
 3346   default:
 3347   return FALSE;
 3348   }
 3349 
 3350 /* Control does not reach here */
 3351 }
 3352 
 3353 
 3354 
 3355 /*************************************************
 3356 *           Compile one branch                   *
 3357 *************************************************/
 3358 
/* Scan the pattern, compiling it into a vector. If the options are
 3360 changed during the branch, the pointer is used to change the external options
 3361 bits. This function is used during the pre-compile phase when we are trying
 3362 to find out the amount of memory needed, as well as during the real compile
 3363 phase. The value of lengthptr distinguishes the two phases.
 3364 
 3365 Arguments:
 3366   optionsptr     pointer to the option bits
 3367   codeptr        points to the pointer to the current code point
 3368   ptrptr         points to the current pattern pointer
 3369   errorcodeptr   points to error code variable
 3370   firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
 3371   reqcharptr     set to the last literal character required, else < 0
 3372   bcptr          points to current branch chain
 3373   cond_depth     conditional nesting depth
 3374   cd             contains pointers to tables etc.
 3375   lengthptr      NULL during the real compile phase
 3376                  points to length accumulator during pre-compile phase
 3377 
 3378 Returns:         TRUE on success
 3379                  FALSE, with *errorcodeptr set non-zero on error
 3380 */
 3381 
 3382 static BOOL
 3383 compile_branch(int *optionsptr, pcre_uchar **codeptr,
 3384   const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
 3385   pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
 3386   compile_data *cd, int *lengthptr)
 3387 {
 3388 int repeat_type, op_type;
 3389 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
 3390 int bravalue = 0;
 3391 int greedy_default, greedy_non_default;
 3392 pcre_int32 firstchar, reqchar;
 3393 pcre_int32 zeroreqchar, zerofirstchar;
 3394 pcre_int32 req_caseopt, reqvary, tempreqvary;
 3395 int options = *optionsptr;               /* May change dynamically */
 3396 int after_manual_callout = 0;
 3397 int length_prevgroup = 0;
 3398 register int c;
 3399 register pcre_uchar *code = *codeptr;
 3400 pcre_uchar *last_code = code;
 3401 pcre_uchar *orig_code = code;
 3402 pcre_uchar *tempcode;
 3403 BOOL inescq = FALSE;
 3404 BOOL groupsetfirstchar = FALSE;
 3405 const pcre_uchar *ptr = *ptrptr;
 3406 const pcre_uchar *tempptr;
 3407 const pcre_uchar *nestptr = NULL;
 3408 pcre_uchar *previous = NULL;
 3409 pcre_uchar *previous_callout = NULL;
 3410 pcre_uchar *save_hwm = NULL;
 3411 pcre_uint8 classbits[32];
 3412 
 3413 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
 3414 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
 3415 dynamically as we process the pattern. */
 3416 
 3417 #ifdef SUPPORT_UTF
 3418 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
 3419 BOOL utf = UTF_ENABLED((options & PCRE_UTF8) != 0);
 3420 pcre_uchar utf_chars[6];
 3421 #else
 3422 BOOL utf = FALSE;
 3423 #endif
 3424 
 3425 /* Helper variables for OP_XCLASS opcode (for characters > 255). */
 3426 
 3427 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 3428 BOOL xclass;
 3429 pcre_uchar *class_uchardata;
 3430 pcre_uchar *class_uchardata_base;
 3431 #endif
 3432 
 3433 #ifdef PCRE_DEBUG
 3434 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
 3435 #endif
 3436 
 3437 /* Set up the default and non-default settings for greediness */
 3438 
 3439 greedy_default = ((options & PCRE_UNGREEDY) != 0);
 3440 greedy_non_default = greedy_default ^ 1;
 3441 
 3442 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
 3443 matching encountered yet". It gets changed to REQ_NONE if we hit something that
 3444 matches a non-fixed char first char; reqchar just remains unset if we never
 3445 find one.
 3446 
 3447 When we hit a repeat whose minimum is zero, we may have to adjust these values
 3448 to take the zero repeat into account. This is implemented by setting them to
 3449 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
 3450 item types that can be repeated set these backoff variables appropriately. */
 3451 
 3452 firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
 3453 
 3454 /* The variable req_caseopt contains either the REQ_CASELESS value
 3455 or zero, according to the current setting of the caseless flag. The
 3456 REQ_CASELESS leaves the lower 28 bits empty. It is added into the
 3457 firstchar or reqchar variables to record the case status of the
 3458 value. This is used only for ASCII characters. */
 3459 
 3460 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
 3461 
 3462 /* Switch on next character until the end of the branch */
 3463 
 3464 for (;; ptr++)
 3465   {
 3466   BOOL negate_class;
 3467   BOOL should_flip_negation;
 3468   BOOL possessive_quantifier;
 3469   BOOL is_quantifier;
 3470   BOOL is_recurse;
 3471   BOOL reset_bracount;
 3472   int class_has_8bitchar;
 3473   int class_single_char;
 3474   int newoptions;
 3475   int recno;
 3476   int refsign;
 3477   int skipbytes;
 3478   int subreqchar;
 3479   int subfirstchar;
 3480   int terminator;
 3481   int mclength;
 3482   int tempbracount;
 3483   pcre_uchar mcbuffer[8];
 3484 
 3485   /* Get next character in the pattern */
 3486 
 3487   c = *ptr;
 3488 
 3489   /* If we are at the end of a nested substitution, revert to the outer level
 3490   string. Nesting only happens one level deep. */
 3491 
 3492   if (c == 0 && nestptr != NULL)
 3493     {
 3494     ptr = nestptr;
 3495     nestptr = NULL;
 3496     c = *ptr;
 3497     }
 3498 
 3499   /* If we are in the pre-compile phase, accumulate the length used for the
 3500   previous cycle of this loop. */
 3501 
 3502   if (lengthptr != NULL)
 3503     {
 3504 #ifdef PCRE_DEBUG
 3505     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
 3506 #endif
 3507     if (code > cd->start_workspace + cd->workspace_size -
 3508         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
 3509       {
 3510       *errorcodeptr = ERR52;
 3511       goto FAILED;
 3512       }
 3513 
 3514     /* There is at least one situation where code goes backwards: this is the
 3515     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
 3516     the class is simply eliminated. However, it is created first, so we have to
 3517     allow memory for it. Therefore, don't ever reduce the length at this point.
 3518     */
 3519 
 3520     if (code < last_code) code = last_code;
 3521 
 3522     /* Paranoid check for integer overflow */
 3523 
 3524     if (OFLOW_MAX - *lengthptr < code - last_code)
 3525       {
 3526       *errorcodeptr = ERR20;
 3527       goto FAILED;
 3528       }
 3529 
 3530     *lengthptr += (int)(code - last_code);
 3531     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
 3532       (int)(code - last_code), c, c));
 3533 
 3534     /* If "previous" is set and it is not at the start of the work space, move
 3535     it back to there, in order to avoid filling up the work space. Otherwise,
 3536     if "previous" is NULL, reset the current code pointer to the start. */
 3537 
 3538     if (previous != NULL)
 3539       {
 3540       if (previous > orig_code)
 3541         {
 3542         memmove(orig_code, previous, IN_UCHARS(code - previous));
 3543         code -= previous - orig_code;
 3544         previous = orig_code;
 3545         }
 3546       }
 3547     else code = orig_code;
 3548 
 3549     /* Remember where this code item starts so we can pick up the length
 3550     next time round. */
 3551 
 3552     last_code = code;
 3553     }
 3554 
 3555   /* In the real compile phase, just check the workspace used by the forward
 3556   reference list. */
 3557 
 3558   else if (cd->hwm > cd->start_workspace + cd->workspace_size -
 3559            WORK_SIZE_SAFETY_MARGIN)
 3560     {
 3561     *errorcodeptr = ERR52;
 3562     goto FAILED;
 3563     }
 3564 
 3565   /* If in \Q...\E, check for the end; if not, we have a literal */
 3566 
 3567   if (inescq && c != 0)
 3568     {
 3569     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
 3570       {
 3571       inescq = FALSE;
 3572       ptr++;
 3573       continue;
 3574       }
 3575     else
 3576       {
 3577       if (previous_callout != NULL)
 3578         {
 3579         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
 3580           complete_callout(previous_callout, ptr, cd);
 3581         previous_callout = NULL;
 3582         }
 3583       if ((options & PCRE_AUTO_CALLOUT) != 0)
 3584         {
 3585         previous_callout = code;
 3586         code = auto_callout(code, ptr, cd);
 3587         }
 3588       goto NORMAL_CHAR;
 3589       }
 3590     }
 3591 
 3592   /* Fill in length of a previous callout, except when the next thing is
 3593   a quantifier. */
 3594 
 3595   is_quantifier =
 3596     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
 3597     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
 3598 
 3599   if (!is_quantifier && previous_callout != NULL &&
 3600        after_manual_callout-- <= 0)
 3601     {
 3602     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
 3603       complete_callout(previous_callout, ptr, cd);
 3604     previous_callout = NULL;
 3605     }
 3606 
 3607   /* In extended mode, skip white space and comments. */
 3608 
 3609   if ((options & PCRE_EXTENDED) != 0)
 3610     {
 3611     if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
 3612     if (c == CHAR_NUMBER_SIGN)
 3613       {
 3614       ptr++;
 3615       while (*ptr != 0)
 3616         {
 3617         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
 3618         ptr++;
 3619 #ifdef SUPPORT_UTF
 3620         if (utf) FORWARDCHAR(ptr);
 3621 #endif
 3622         }
 3623       if (*ptr != 0) continue;
 3624 
 3625       /* Else fall through to handle end of string */
 3626       c = 0;
 3627       }
 3628     }
 3629 
 3630   /* No auto callout for quantifiers. */
 3631 
 3632   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
 3633     {
 3634     previous_callout = code;
 3635     code = auto_callout(code, ptr, cd);
 3636     }
 3637 
 3638   switch(c)
 3639     {
 3640     /* ===================================================================*/
 3641     case 0:                        /* The branch terminates at string end */
 3642     case CHAR_VERTICAL_LINE:       /* or | or ) */
 3643     case CHAR_RIGHT_PARENTHESIS:
 3644     *firstcharptr = firstchar;
 3645     *reqcharptr = reqchar;
 3646     *codeptr = code;
 3647     *ptrptr = ptr;
 3648     if (lengthptr != NULL)
 3649       {
 3650       if (OFLOW_MAX - *lengthptr < code - last_code)
 3651         {
 3652         *errorcodeptr = ERR20;
 3653         goto FAILED;
 3654         }
 3655       *lengthptr += (int)(code - last_code);   /* To include callout length */
 3656       DPRINTF((">> end branch\n"));
 3657       }
 3658     return TRUE;
 3659 
 3660 
 3661     /* ===================================================================*/
 3662     /* Handle single-character metacharacters. In multiline mode, ^ disables
 3663     the setting of any following char as a first character. */
 3664 
 3665     case CHAR_CIRCUMFLEX_ACCENT:
 3666     previous = NULL;
 3667     if ((options & PCRE_MULTILINE) != 0)
 3668       {
 3669       if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
 3670       *code++ = OP_CIRCM;
 3671       }
 3672     else *code++ = OP_CIRC;
 3673     break;
 3674 
 3675     case CHAR_DOLLAR_SIGN:
 3676     previous = NULL;
 3677     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
 3678     break;
 3679 
 3680     /* There can never be a first char if '.' is first, whatever happens about
 3681     repeats. The value of reqchar doesn't change either. */
 3682 
 3683     case CHAR_DOT:
 3684     if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
 3685     zerofirstchar = firstchar;
 3686     zeroreqchar = reqchar;
 3687     previous = code;
 3688     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
 3689     break;
 3690 
 3691 
 3692     /* ===================================================================*/
 3693     /* Character classes. If the included characters are all < 256, we build a
 3694     32-byte bitmap of the permitted characters, except in the special case
 3695     where there is only one such character. For negated classes, we build the
 3696     map as usual, then invert it at the end. However, we use a different opcode
 3697     so that data characters > 255 can be handled correctly.
 3698 
 3699     If the class contains characters outside the 0-255 range, a different
 3700     opcode is compiled. It may optionally have a bit map for characters < 256,
 3701     but those above are explicitly listed afterwards. A flag byte tells
 3702     whether the bitmap is present, and whether this is a negated class or not.
 3703 
 3704     In JavaScript compatibility mode, an isolated ']' causes an error. In
 3705     default (Perl) mode, it is treated as a data character. */
 3706 
 3707     case CHAR_RIGHT_SQUARE_BRACKET:
 3708     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
 3709       {
 3710       *errorcodeptr = ERR64;
 3711       goto FAILED;
 3712       }
 3713     goto NORMAL_CHAR;
 3714 
 3715     case CHAR_LEFT_SQUARE_BRACKET:
 3716     previous = code;
 3717 
 3718     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
 3719     they are encountered at the top level, so we'll do that too. */
 3720 
 3721     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
 3722          ptr[1] == CHAR_EQUALS_SIGN) &&
 3723         check_posix_syntax(ptr, &tempptr))
 3724       {
 3725       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
 3726       goto FAILED;
 3727       }
 3728 
 3729     /* If the first character is '^', set the negation flag and skip it. Also,
 3730     if the first few characters (either before or after ^) are \Q\E or \E we
 3731     skip them too. This makes for compatibility with Perl. */
 3732 
 3733     negate_class = FALSE;
 3734     for (;;)
 3735       {
 3736       c = *(++ptr);
 3737       if (c == CHAR_BACKSLASH)
 3738         {
 3739         if (ptr[1] == CHAR_E)
 3740           ptr++;
 3741         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
 3742           ptr += 3;
 3743         else
 3744           break;
 3745         }
 3746       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
 3747         negate_class = TRUE;
 3748       else break;
 3749       }
 3750 
 3751     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
 3752     an initial ']' is taken as a data character -- the code below handles
 3753     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
 3754     [^] must match any character, so generate OP_ALLANY. */
 3755 
 3756     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
 3757         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
 3758       {
 3759       *code++ = negate_class? OP_ALLANY : OP_FAIL;
 3760       if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
 3761       zerofirstchar = firstchar;
 3762       break;
 3763       }
 3764 
 3765     /* If a class contains a negative special such as \S, we need to flip the
 3766     negation flag at the end, so that support for characters > 255 works
 3767     correctly (they are all included in the class). */
 3768 
 3769     should_flip_negation = FALSE;
 3770 
 3771     /* For optimization purposes, we track some properties of the class.
 3772     class_has_8bitchar will be non-zero, if the class contains at least one
 3773     < 256 character. class_single_char will be 1 if the class contains only
 3774     a single character. */
 3775 
 3776     class_has_8bitchar = 0;
 3777     class_single_char = 0;
 3778 
 3779     /* Initialize the 32-char bit map to all zeros. We build the map in a
 3780     temporary bit of memory, in case the class contains only 1 character (less
 3781     than 256), because in that case the compiled code doesn't use the bit map.
 3782     */
 3783 
 3784     memset(classbits, 0, 32 * sizeof(pcre_uint8));
 3785 
 3786 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 3787     xclass = FALSE;                           /* No chars >= 256 */
 3788     class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
 3789     class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
 3790 #endif
 3791 
 3792     /* Process characters until ] is reached. By writing this as a "do" it
 3793     means that an initial ] is taken as a data character. At the start of the
 3794     loop, c contains the first byte of the character. */
 3795 
 3796     if (c != 0) do
 3797       {
 3798       const pcre_uchar *oldptr;
 3799 
 3800 #ifdef SUPPORT_UTF
 3801       if (utf && HAS_EXTRALEN(c))
 3802         {                           /* Braces are required because the */
 3803         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
 3804         }
 3805 #endif
 3806 
 3807 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 3808       /* In the pre-compile phase, accumulate the length of any extra
 3809       data and reset the pointer. This is so that very large classes that
 3810       contain a zillion > 255 characters no longer overwrite the work space
 3811       (which is on the stack). */
 3812 
 3813       if (lengthptr != NULL)
 3814         {
 3815         *lengthptr += (int)(class_uchardata - class_uchardata_base);
 3816         class_uchardata = class_uchardata_base;
 3817         }
 3818 #endif
 3819 
 3820       /* Inside \Q...\E everything is literal except \E */
 3821 
 3822       if (inescq)
 3823         {
 3824         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
 3825           {
 3826           inescq = FALSE;                   /* Reset literal state */
 3827           ptr++;                            /* Skip the 'E' */
 3828           continue;                         /* Carry on with next */
 3829           }
 3830         goto CHECK_RANGE;                   /* Could be range if \E follows */
 3831         }
 3832 
 3833       /* Handle POSIX class names. Perl allows a negation extension of the
 3834       form [:^name:]. A square bracket that doesn't match the syntax is
 3835       treated as a literal. We also recognize the POSIX constructions
 3836       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
 3837       5.6 and 5.8 do. */
 3838 
 3839       if (c == CHAR_LEFT_SQUARE_BRACKET &&
 3840           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
 3841            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
 3842         {
 3843         BOOL local_negate = FALSE;
 3844         int posix_class, taboffset, tabopt;
 3845         register const pcre_uint8 *cbits = cd->cbits;
 3846         pcre_uint8 pbits[32];
 3847 
 3848         if (ptr[1] != CHAR_COLON)
 3849           {
 3850           *errorcodeptr = ERR31;
 3851           goto FAILED;
 3852           }
 3853 
 3854         ptr += 2;
 3855         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
 3856           {
 3857           local_negate = TRUE;
 3858           should_flip_negation = TRUE;  /* Note negative special */
 3859           ptr++;
 3860           }
 3861 
 3862         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
 3863         if (posix_class < 0)
 3864           {
 3865           *errorcodeptr = ERR30;
 3866           goto FAILED;
 3867           }
 3868 
 3869         /* If matching is caseless, upper and lower are converted to
 3870         alpha. This relies on the fact that the class table starts with
 3871         alpha, lower, upper as the first 3 entries. */
 3872 
 3873         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
 3874           posix_class = 0;
 3875 
 3876         /* When PCRE_UCP is set, some of the POSIX classes are converted to
 3877         different escape sequences that use Unicode properties. */
 3878 
 3879 #ifdef SUPPORT_UCP
 3880         if ((options & PCRE_UCP) != 0)
 3881           {
 3882           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
 3883           if (posix_substitutes[pc] != NULL)
 3884             {
 3885             nestptr = tempptr + 1;
 3886             ptr = posix_substitutes[pc] - 1;
 3887             continue;
 3888             }
 3889           }
 3890 #endif
 3891         /* In the non-UCP case, we build the bit map for the POSIX class in a
 3892         chunk of local store because we may be adding and subtracting from it,
 3893         and we don't want to subtract bits that may be in the main map already.
 3894         At the end we or the result into the bit map that is being built. */
 3895 
 3896         posix_class *= 3;
 3897 
 3898         /* Copy in the first table (always present) */
 3899 
 3900         memcpy(pbits, cbits + posix_class_maps[posix_class],
 3901           32 * sizeof(pcre_uint8));
 3902 
 3903         /* If there is a second table, add or remove it as required. */
 3904 
 3905         taboffset = posix_class_maps[posix_class + 1];
 3906         tabopt = posix_class_maps[posix_class + 2];
 3907 
 3908         if (taboffset >= 0)
 3909           {
 3910           if (tabopt >= 0)
 3911             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
 3912           else
 3913             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
 3914           }
 3915 
 3916         /* Now see if we need to remove any special characters. An option
 3917         value of 1 removes vertical space and 2 removes underscore. */
 3918 
 3919         if (tabopt < 0) tabopt = -tabopt;
 3920         if (tabopt == 1) pbits[1] &= ~0x3c;
 3921           else if (tabopt == 2) pbits[11] &= 0x7f;
 3922 
 3923         /* Add the POSIX table or its complement into the main table that is
 3924         being built and we are done. */
 3925 
 3926         if (local_negate)
 3927           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
 3928         else
 3929           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
 3930 
 3931         ptr = tempptr + 1;
 3932         /* Every class contains at least one < 256 character. */
 3933         class_has_8bitchar = 1;
 3934         /* Every class contains at least two characters. */
 3935         class_single_char = 2;
 3936         continue;    /* End of POSIX syntax handling */
 3937         }
 3938 
 3939       /* Backslash may introduce a single character, or it may introduce one
 3940       of the specials, which just set a flag. The sequence \b is a special
 3941       case. Inside a class (and only there) it is treated as backspace. We
 3942       assume that other escapes have more than one character in them, so
 3943       speculatively set both class_has_8bitchar and class_single_char bigger
 3944       than one. Unrecognized escapes fall through and are either treated
 3945       as literal characters (by default), or are faulted if
 3946       PCRE_EXTRA is set. */
 3947 
 3948       if (c == CHAR_BACKSLASH)
 3949         {
 3950         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
 3951         if (*errorcodeptr != 0) goto FAILED;
 3952 
 3953         if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
 3954         else if (-c == ESC_N)            /* \N is not supported in a class */
 3955           {
 3956           *errorcodeptr = ERR71;
 3957           goto FAILED;
 3958           }
 3959         else if (-c == ESC_Q)            /* Handle start of quoted string */
 3960           {
 3961           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
 3962             {
 3963             ptr += 2; /* avoid empty string */
 3964             }
 3965           else inescq = TRUE;
 3966           continue;
 3967           }
 3968         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
 3969 
 3970         if (c < 0)
 3971           {
 3972           register const pcre_uint8 *cbits = cd->cbits;
 3973           /* Every class contains at least two < 256 characters. */
 3974           class_has_8bitchar++;
 3975           /* Every class contains at least two characters. */
 3976           class_single_char += 2;
 3977 
 3978           switch (-c)
 3979             {
 3980 #ifdef SUPPORT_UCP
 3981             case ESC_du:     /* These are the values given for \d etc */
 3982             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
 3983             case ESC_wu:     /* escape sequence with an appropriate \p */
 3984             case ESC_WU:     /* or \P to test Unicode properties instead */
 3985             case ESC_su:     /* of the default ASCII testing. */
 3986             case ESC_SU:
 3987             nestptr = ptr;
 3988             ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
 3989             class_has_8bitchar--;                /* Undo! */
 3990             continue;
 3991 #endif
 3992             case ESC_d:
 3993             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
 3994             continue;
 3995 
 3996             case ESC_D:
 3997             should_flip_negation = TRUE;
 3998             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
 3999             continue;
 4000 
 4001             case ESC_w:
 4002             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
 4003             continue;
 4004 
 4005             case ESC_W:
 4006             should_flip_negation = TRUE;
 4007             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
 4008             continue;
 4009 
 4010             /* Perl 5.004 onwards omits VT from \s, but we must preserve it
 4011             if it was previously set by something earlier in the character
 4012             class. */
 4013 
 4014             case ESC_s:
 4015             classbits[0] |= cbits[cbit_space];
 4016             classbits[1] |= cbits[cbit_space+1] & ~0x08;
 4017             for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
 4018             continue;
 4019 
 4020             case ESC_S:
 4021             should_flip_negation = TRUE;
 4022             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
 4023             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
 4024             continue;
 4025 
 4026             case ESC_h:
 4027             SETBIT(classbits, 0x09); /* VT */
 4028             SETBIT(classbits, 0x20); /* SPACE */
 4029             SETBIT(classbits, 0xa0); /* NSBP */
 4030 #ifndef COMPILE_PCRE8
 4031             xclass = TRUE;
 4032             *class_uchardata++ = XCL_SINGLE;
 4033             *class_uchardata++ = 0x1680;
 4034             *class_uchardata++ = XCL_SINGLE;
 4035             *class_uchardata++ = 0x180e;
 4036             *class_uchardata++ = XCL_RANGE;
 4037             *class_uchardata++ = 0x2000;
 4038             *class_uchardata++ = 0x200a;
 4039             *class_uchardata++ = XCL_SINGLE;
 4040             *class_uchardata++ = 0x202f;
 4041             *class_uchardata++ = XCL_SINGLE;
 4042             *class_uchardata++ = 0x205f;
 4043             *class_uchardata++ = XCL_SINGLE;
 4044             *class_uchardata++ = 0x3000;
 4045 #elif defined SUPPORT_UTF
 4046             if (utf)
 4047               {
 4048               xclass = TRUE;
 4049               *class_uchardata++ = XCL_SINGLE;
 4050               class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
 4051               *class_uchardata++ = XCL_SINGLE;
 4052               class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
 4053               *class_uchardata++ = XCL_RANGE;
 4054               class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
 4055               class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
 4056               *class_uchardata++ = XCL_SINGLE;
 4057               class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
 4058               *class_uchardata++ = XCL_SINGLE;
 4059               class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
 4060               *class_uchardata++ = XCL_SINGLE;
 4061               class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
 4062               }
 4063 #endif
 4064             continue;
 4065 
 4066             case ESC_H:
 4067             for (c = 0; c < 32; c++)
 4068               {
 4069               int x = 0xff;
 4070               switch (c)
 4071                 {
 4072                 case 0x09/8: x ^= 1 << (0x09%8); break;
 4073                 case 0x20/8: x ^= 1 << (0x20%8); break;
 4074                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
 4075                 default: break;
 4076                 }
 4077               classbits[c] |= x;
 4078               }
 4079 #ifndef COMPILE_PCRE8
 4080             xclass = TRUE;
 4081             *class_uchardata++ = XCL_RANGE;
 4082             *class_uchardata++ = 0x0100;
 4083             *class_uchardata++ = 0x167f;
 4084             *class_uchardata++ = XCL_RANGE;
 4085             *class_uchardata++ = 0x1681;
 4086             *class_uchardata++ = 0x180d;
 4087             *class_uchardata++ = XCL_RANGE;
 4088             *class_uchardata++ = 0x180f;
 4089             *class_uchardata++ = 0x1fff;
 4090             *class_uchardata++ = XCL_RANGE;
 4091             *class_uchardata++ = 0x200b;
 4092             *class_uchardata++ = 0x202e;
 4093             *class_uchardata++ = XCL_RANGE;
 4094             *class_uchardata++ = 0x2030;
 4095             *class_uchardata++ = 0x205e;
 4096             *class_uchardata++ = XCL_RANGE;
 4097             *class_uchardata++ = 0x2060;
 4098             *class_uchardata++ = 0x2fff;
 4099             *class_uchardata++ = XCL_RANGE;
 4100             *class_uchardata++ = 0x3001;
 4101 #ifdef SUPPORT_UTF
 4102             if (utf)
 4103               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
 4104             else
 4105 #endif
 4106               *class_uchardata++ = 0xffff;
 4107 #elif defined SUPPORT_UTF
 4108             if (utf)
 4109               {
 4110               xclass = TRUE;
 4111               *class_uchardata++ = XCL_RANGE;
 4112               class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
 4113               class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
 4114               *class_uchardata++ = XCL_RANGE;
 4115               class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
 4116               class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
 4117               *class_uchardata++ = XCL_RANGE;
 4118               class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
 4119               class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
 4120               *class_uchardata++ = XCL_RANGE;
 4121               class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
 4122               class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
 4123               *class_uchardata++ = XCL_RANGE;
 4124               class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
 4125               class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
 4126               *class_uchardata++ = XCL_RANGE;
 4127               class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
 4128               class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
 4129               *class_uchardata++ = XCL_RANGE;
 4130               class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
 4131               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
 4132               }
 4133 #endif
 4134             continue;
 4135 
 4136             case ESC_v:
 4137             SETBIT(classbits, 0x0a); /* LF */
 4138             SETBIT(classbits, 0x0b); /* VT */
 4139             SETBIT(classbits, 0x0c); /* FF */
 4140             SETBIT(classbits, 0x0d); /* CR */
 4141             SETBIT(classbits, 0x85); /* NEL */
 4142 #ifndef COMPILE_PCRE8
 4143             xclass = TRUE;
 4144             *class_uchardata++ = XCL_RANGE;
 4145             *class_uchardata++ = 0x2028;
 4146             *class_uchardata++ = 0x2029;
 4147 #elif defined SUPPORT_UTF
 4148             if (utf)
 4149               {
 4150               xclass = TRUE;
 4151               *class_uchardata++ = XCL_RANGE;
 4152               class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
 4153               class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
 4154               }
 4155 #endif
 4156             continue;
 4157 
 4158             case ESC_V:
 4159             for (c = 0; c < 32; c++)
 4160               {
 4161               int x = 0xff;
 4162               switch (c)
 4163                 {
 4164                 case 0x0a/8: x ^= 1 << (0x0a%8);
 4165                              x ^= 1 << (0x0b%8);
 4166                              x ^= 1 << (0x0c%8);
 4167                              x ^= 1 << (0x0d%8);
 4168                              break;
 4169                 case 0x85/8: x ^= 1 << (0x85%8); break;
 4170                 default: break;
 4171                 }
 4172               classbits[c] |= x;
 4173               }
 4174 
 4175 #ifndef COMPILE_PCRE8
 4176             xclass = TRUE;
 4177             *class_uchardata++ = XCL_RANGE;
 4178             *class_uchardata++ = 0x0100;
 4179             *class_uchardata++ = 0x2027;
 4180             *class_uchardata++ = XCL_RANGE;
 4181             *class_uchardata++ = 0x202a;
 4182 #ifdef SUPPORT_UTF
 4183             if (utf)
 4184               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
 4185             else
 4186 #endif
 4187               *class_uchardata++ = 0xffff;
 4188 #elif defined SUPPORT_UTF
 4189             if (utf)
 4190               {
 4191               xclass = TRUE;
 4192               *class_uchardata++ = XCL_RANGE;
 4193               class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
 4194               class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
 4195               *class_uchardata++ = XCL_RANGE;
 4196               class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);
 4197               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
 4198               }
 4199 #endif
 4200             continue;
 4201 
 4202 #ifdef SUPPORT_UCP
 4203             case ESC_p:
 4204             case ESC_P:
 4205               {
 4206               BOOL negated;
 4207               int pdata;
 4208               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
 4209               if (ptype < 0) goto FAILED;
 4210               xclass = TRUE;
 4211               *class_uchardata++ = ((-c == ESC_p) != negated)?
 4212                 XCL_PROP : XCL_NOTPROP;
 4213               *class_uchardata++ = ptype;
 4214               *class_uchardata++ = pdata;
 4215               class_has_8bitchar--;                /* Undo! */
 4216               continue;
 4217               }
 4218 #endif
 4219             /* Unrecognized escapes are faulted if PCRE is running in its
 4220             strict mode. By default, for compatibility with Perl, they are
 4221             treated as literals. */
 4222 
 4223             default:
 4224             if ((options & PCRE_EXTRA) != 0)
 4225               {
 4226               *errorcodeptr = ERR7;
 4227               goto FAILED;
 4228               }
 4229             class_has_8bitchar--;    /* Undo the speculative increase. */
 4230             class_single_char -= 2;  /* Undo the speculative increase. */
 4231             c = *ptr;                /* Get the final character and fall through */
 4232             break;
 4233             }
 4234           }
 4235 
 4236         /* Fall through if we have a single character (c >= 0). This may be
 4237         greater than 256. */
 4238 
 4239         }   /* End of backslash handling */
 4240 
 4241       /* A single character may be followed by '-' to form a range. However,
 4242       Perl does not permit ']' to be the end of the range. A '-' character
 4243       at the end is treated as a literal. Perl ignores orphaned \E sequences
 4244       entirely. The code for handling \Q and \E is messy. */
 4245 
 4246       CHECK_RANGE:
 4247       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
 4248         {
 4249         inescq = FALSE;
 4250         ptr += 2;
 4251         }
 4252 
 4253       oldptr = ptr;
 4254 
 4255       /* Remember \r or \n */
 4256 
 4257       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
 4258 
 4259       /* Check for range */
 4260 
 4261       if (!inescq && ptr[1] == CHAR_MINUS)
 4262         {
 4263         int d;
 4264         ptr += 2;
 4265         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
 4266 
 4267         /* If we hit \Q (not followed by \E) at this point, go into escaped
 4268         mode. */
 4269 
 4270         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
 4271           {
 4272           ptr += 2;
 4273           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
 4274             { ptr += 2; continue; }
 4275           inescq = TRUE;
 4276           break;
 4277           }
 4278 
 4279         if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
 4280           {
 4281           ptr = oldptr;
 4282           goto LONE_SINGLE_CHARACTER;
 4283           }
 4284 
 4285 #ifdef SUPPORT_UTF
 4286         if (utf)
 4287           {                           /* Braces are required because the */
 4288           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
 4289           }
 4290         else
 4291 #endif
 4292         d = *ptr;  /* Not UTF-8 mode */
 4293 
 4294         /* The second part of a range can be a single-character escape, but
 4295         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
 4296         in such circumstances. */
 4297 
 4298         if (!inescq && d == CHAR_BACKSLASH)
 4299           {
 4300           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
 4301           if (*errorcodeptr != 0) goto FAILED;
 4302 
 4303           /* \b is backspace; any other special means the '-' was literal */
 4304 
 4305           if (d < 0)
 4306             {
 4307             if (d == -ESC_b) d = CHAR_BS; else
 4308               {
 4309               ptr = oldptr;
 4310               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
 4311               }
 4312             }
 4313           }
 4314 
 4315         /* Check that the two values are in the correct order. Optimize
 4316         one-character ranges */
 4317 
 4318         if (d < c)
 4319           {
 4320           *errorcodeptr = ERR8;
 4321           goto FAILED;
 4322           }
 4323 
 4324         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
 4325 
 4326         /* Remember \r or \n */
 4327 
 4328         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
 4329 
 4330         /* Since we found a character range, single character optimizations
 4331         cannot be done anymore. */
 4332         class_single_char = 2;
 4333 
 4334         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
 4335         matching, we have to use an XCLASS with extra data items. Caseless
 4336         matching for characters > 127 is available only if UCP support is
 4337         available. */
 4338 
 4339 #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
 4340         if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
 4341 #elif defined  SUPPORT_UTF
 4342         if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
 4343 #elif !(defined COMPILE_PCRE8)
 4344         if (d > 255)
 4345 #endif
 4346 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
 4347           {
 4348           xclass = TRUE;
 4349 
 4350           /* With UCP support, we can find the other case equivalents of
 4351           the relevant characters. There may be several ranges. Optimize how
 4352           they fit with the basic range. */
 4353 
 4354 #ifdef SUPPORT_UCP
 4355 #ifndef COMPILE_PCRE8
 4356           if (utf && (options & PCRE_CASELESS) != 0)
 4357 #else
 4358           if ((options & PCRE_CASELESS) != 0)
 4359 #endif
 4360             {
 4361             unsigned int occ, ocd;
 4362             unsigned int cc = c;
 4363             unsigned int origd = d;
 4364             while (get_othercase_range(&cc, origd, &occ, &ocd))
 4365               {
 4366               if (occ >= (unsigned int)c &&
 4367                   ocd <= (unsigned int)d)
 4368                 continue;                          /* Skip embedded ranges */
 4369 
 4370               if (occ < (unsigned int)c  &&
 4371                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
 4372                 {                                  /* if there is overlap,   */
 4373                 c = occ;                           /* noting that if occ < c */
 4374                 continue;                          /* we can't have ocd > d  */
 4375                 }                                  /* because a subrange is  */
 4376               if (ocd > (unsigned int)d &&
 4377                   occ <= (unsigned int)d + 1)      /* always shorter than    */
 4378                 {                                  /* the basic range.       */
 4379                 d = ocd;
 4380                 continue;
 4381                 }
 4382 
 4383               if (occ == ocd)
 4384                 {
 4385                 *class_uchardata++ = XCL_SINGLE;
 4386                 }
 4387               else
 4388                 {
 4389                 *class_uchardata++ = XCL_RANGE;
 4390                 class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
 4391                 }
 4392               class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
 4393               }
 4394             }
 4395 #endif  /* SUPPORT_UCP */
 4396 
 4397           /* Now record the original range, possibly modified for UCP caseless
 4398           overlapping ranges. */
 4399 
 4400           *class_uchardata++ = XCL_RANGE;
 4401 #ifdef SUPPORT_UTF
 4402 #ifndef COMPILE_PCRE8
 4403           if (utf)
 4404             {
 4405             class_uchardata += PRIV(ord2utf)(c, class_uchardata);
 4406             class_uchardata += PRIV(ord2utf)(d, class_uchardata);
 4407             }
 4408           else
 4409             {
 4410             *class_uchardata++ = c;
 4411             *class_uchardata++ = d;
 4412             }
 4413 #else
 4414           class_uchardata += PRIV(ord2utf)(c, class_uchardata);
 4415           class_uchardata += PRIV(ord2utf)(d, class_uchardata);
 4416 #endif
 4417 #else /* SUPPORT_UTF */
 4418           *class_uchardata++ = c;
 4419           *class_uchardata++ = d;
 4420 #endif /* SUPPORT_UTF */
 4421 
 4422           /* With UCP support, we are done. Without UCP support, there is no
 4423           caseless matching for UTF characters > 127; we can use the bit map
 4424           for the smaller ones. As for 16 bit characters without UTF, we
 4425           can still use the bit map for those with values below 256. */
 4426 
 4427 #ifdef SUPPORT_UCP
 4428 #ifndef COMPILE_PCRE8
 4429           if (utf)
 4430 #endif
 4431             continue;    /* With next character in the class */
 4432 #endif  /* SUPPORT_UCP */
 4433 
 4434 #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
 4435           if (utf)
 4436             {
 4437             if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
 4438             /* Adjust upper limit and fall through to set up the map */
 4439             d = 127;
 4440             }
 4441           else
 4442             {
 4443             if (c > 255) continue;
 4444             /* Adjust upper limit and fall through to set up the map */
 4445             d = 255;
 4446             }
 4447 #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
 4448           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
 4449           /* Adjust upper limit and fall through to set up the map */
 4450           d = 127;
 4451 #else
 4452           if (c > 255) continue;
 4453           /* Adjust upper limit and fall through to set up the map */
 4454           d = 255;
 4455 #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
 4456           }
 4457 #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
 4458 
 4459         /* We use the bit map for 8 bit mode, or when the characters fall
 4460         partially or entirely to [0-255] ([0-127] for UCP) ranges. */
 4461 
 4462         class_has_8bitchar = 1;
 4463 
 4464         /* We can save a bit of time by skipping this in the pre-compile. */
 4465 
 4466         if (lengthptr == NULL) for (; c <= d; c++)
 4467           {
 4468           classbits[c/8] |= (1 << (c&7));
 4469           if ((options & PCRE_CASELESS) != 0)
 4470             {
 4471             int uc = cd->fcc[c]; /* flip case */
 4472             classbits[uc/8] |= (1 << (uc&7));
 4473             }
 4474           }
 4475 
 4476         continue;   /* Go get the next char in the class */
 4477         }
 4478 
 4479       /* Handle a lone single character - we can get here for a normal
 4480       non-escape char, or after \ that introduces a single character or for an
 4481       apparent range that isn't. */
 4482 
 4483       LONE_SINGLE_CHARACTER:
 4484 
 4485       /* Only the value of 1 matters for class_single_char. */
 4486       if (class_single_char < 2) class_single_char++;
 4487 
 4488       /* If class_single_char is 1, we saw precisely one character. As long as
 4489       there were no negated characters >= 128 and there was no use of \p or \P,
 4490       in other words, no use of any XCLASS features, we can optimize.
 4491 
 4492       In UTF-8 mode, we can optimize the negative case only if there were no
 4493       characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
 4494       operate on single-byte characters only. This is an historical hangover.
 4495       Maybe one day we can tidy these opcodes to handle multi-byte characters.
 4496 
 4497       The optimization throws away the bit map. We turn the item into a
 4498       1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
 4499       Note that OP_NOT[I] does not support multibyte characters. In the positive
 4500       case, it can cause firstchar to be set. Otherwise, there can be no first
 4501       char if this item is first, whatever repeat count may follow. In the case
 4502       of reqchar, save the previous value for reinstating. */
 4503 
 4504 #ifdef SUPPORT_UTF
 4505       if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
 4506         && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
 4507 #else
 4508       if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
 4509 #endif
 4510         {
 4511         ptr++;
 4512         zeroreqchar = reqchar;
 4513 
 4514         /* The OP_NOT[I] opcodes work on single characters only. */
 4515 
 4516         if (negate_class)
 4517           {
 4518           if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
 4519           zerofirstchar = firstchar;
 4520           *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
 4521           *code++ = c;
 4522           goto NOT_CHAR;
 4523           }
 4524 
 4525         /* For a single, positive character, get the value into mcbuffer, and
 4526         then we can handle this with the normal one-character code. */
 4527 
 4528 #ifdef SUPPORT_UTF
 4529         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
 4530           mclength = PRIV(ord2utf)(c, mcbuffer);
 4531         else
 4532 #endif
 4533           {
 4534           mcbuffer[0] = c;
 4535           mclength = 1;
 4536           }
 4537         goto ONE_CHAR;
 4538         }       /* End of 1-char optimization */
 4539 
 4540       /* Handle a character that cannot go in the bit map. */
 4541 
 4542 #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
 4543       if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
 4544 #elif defined SUPPORT_UTF
 4545       if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
 4546 #elif !(defined COMPILE_PCRE8)
 4547       if (c > 255)
 4548 #endif
 4549 
 4550 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
 4551         {
 4552         xclass = TRUE;
 4553         *class_uchardata++ = XCL_SINGLE;
 4554 #ifdef SUPPORT_UTF
 4555 #ifndef COMPILE_PCRE8
 4556         /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
 4557         if (!utf)
 4558           *class_uchardata++ = c;
 4559         else
 4560 #endif
 4561           class_uchardata += PRIV(ord2utf)(c, class_uchardata);
 4562 #else /* SUPPORT_UTF */
 4563         *class_uchardata++ = c;
 4564 #endif /* SUPPORT_UTF */
 4565 
 4566 #ifdef SUPPORT_UCP
 4567 #ifdef COMPILE_PCRE8
 4568         if ((options & PCRE_CASELESS) != 0)
 4569 #else
 4570         /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
 4571         if (utf && (options & PCRE_CASELESS) != 0)
 4572 #endif
 4573           {
 4574           unsigned int othercase;
 4575           if ((int)(othercase = UCD_OTHERCASE(c)) != c)
 4576             {
 4577             *class_uchardata++ = XCL_SINGLE;
 4578             class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
 4579             }
 4580           }
 4581 #endif  /* SUPPORT_UCP */
 4582 
 4583         }
 4584       else
 4585 #endif  /* SUPPORT_UTF || COMPILE_PCRE16 */
 4586 
 4587       /* Handle a single-byte character */
 4588         {
 4589         class_has_8bitchar = 1;
 4590         classbits[c/8] |= (1 << (c&7));
 4591         if ((options & PCRE_CASELESS) != 0)
 4592           {
 4593           c = cd->fcc[c]; /* flip case */
 4594           classbits[c/8] |= (1 << (c&7));
 4595           }
 4596         }
 4597       }
 4598 
 4599     /* Loop until ']' reached. This "while" is the end of the "do" far above.
 4600     If we are at the end of an internal nested string, revert to the outer
 4601     string. */
 4602 
 4603     while (((c = *(++ptr)) != 0 ||
 4604            (nestptr != NULL &&
 4605              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
 4606            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
 4607 
 4608     /* Check for missing terminating ']' */
 4609 
 4610     if (c == 0)
 4611       {
 4612       *errorcodeptr = ERR6;
 4613       goto FAILED;
 4614       }
 4615 
 4616     /* If this is the first thing in the branch, there can be no first char
 4617     setting, whatever the repeat count. Any reqchar setting must remain
 4618     unchanged after any kind of repeat. */
 4619 
 4620     if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
 4621     zerofirstchar = firstchar;
 4622     zeroreqchar = reqchar;
 4623 
 4624     /* If there are characters with values > 255, we have to compile an
 4625     extended class, with its own opcode, unless there was a negated special
 4626     such as \S in the class, and PCRE_UCP is not set, because in that case all
 4627     characters > 255 are in the class, so any that were explicitly given as
 4628     well can be ignored. If (when there are explicit characters > 255 that must
 4629     be listed) there are no characters < 256, we can omit the bitmap in the
 4630     actual compiled code. */
 4631 
 4632 #ifdef SUPPORT_UTF
 4633     if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
 4634 #elif !defined COMPILE_PCRE8
 4635     if (xclass && !should_flip_negation)
 4636 #endif
 4637 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 4638       {
 4639       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
 4640       *code++ = OP_XCLASS;
 4641       code += LINK_SIZE;
 4642       *code = negate_class? XCL_NOT:0;
 4643 
 4644       /* If the map is required, move up the extra data to make room for it;
 4645       otherwise just move the code pointer to the end of the extra data. */
 4646 
 4647       if (class_has_8bitchar > 0)
 4648         {
 4649         *code++ |= XCL_MAP;
 4650         memmove(code + (32 / sizeof(pcre_uchar)), code,
 4651           IN_UCHARS(class_uchardata - code));
 4652         memcpy(code, classbits, 32);
 4653         code = class_uchardata + (32 / sizeof(pcre_uchar));
 4654         }
 4655       else code = class_uchardata;
 4656 
 4657       /* Now fill in the complete length of the item */
 4658 
 4659       PUT(previous, 1, (int)(code - previous));
 4660       break;   /* End of class handling */
 4661       }
 4662 #endif
 4663 
 4664     /* If there are no characters > 255, or they are all to be included or
 4665     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
 4666     whole class was negated and whether there were negative specials such as \S
 4667     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
 4668     negating it if necessary. */
 4669 
 4670     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
 4671     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
 4672       {
 4673       if (negate_class)
 4674         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
 4675       memcpy(code, classbits, 32);
 4676       }
 4677     code += 32 / sizeof(pcre_uchar);
 4678     NOT_CHAR:
 4679     break;
 4680 
 4681 
 4682     /* ===================================================================*/
 4683     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
 4684     has been tested above. */
 4685 
 4686     case CHAR_LEFT_CURLY_BRACKET:
 4687     if (!is_quantifier) goto NORMAL_CHAR;
 4688     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
 4689     if (*errorcodeptr != 0) goto FAILED;
 4690     goto REPEAT;
 4691 
 4692     case CHAR_ASTERISK:
 4693     repeat_min = 0;
 4694     repeat_max = -1;
 4695     goto REPEAT;
 4696 
 4697     case CHAR_PLUS:
 4698     repeat_min = 1;
 4699     repeat_max = -1;
 4700     goto REPEAT;
 4701 
 4702     case CHAR_QUESTION_MARK:
 4703     repeat_min = 0;
 4704     repeat_max = 1;
 4705 
 4706     REPEAT:
 4707     if (previous == NULL)
 4708       {
 4709       *errorcodeptr = ERR9;
 4710       goto FAILED;
 4711       }
 4712 
 4713     if (repeat_min == 0)
 4714       {
 4715       firstchar = zerofirstchar;    /* Adjust for zero repeat */
 4716       reqchar = zeroreqchar;        /* Ditto */
 4717       }
 4718 
 4719     /* Remember whether this is a variable length repeat */
 4720 
 4721     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
 4722 
 4723     op_type = 0;                    /* Default single-char op codes */
 4724     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
 4725 
 4726     /* Save start of previous item, in case we have to move it up in order to
 4727     insert something before it. */
 4728 
 4729     tempcode = previous;
 4730 
 4731     /* If the next character is '+', we have a possessive quantifier. This
 4732     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
 4733     If the next character is '?' this is a minimizing repeat, by default,
 4734     but if PCRE_UNGREEDY is set, it works the other way round. We change the
 4735     repeat type to the non-default. */
 4736 
 4737     if (ptr[1] == CHAR_PLUS)
 4738       {
 4739       repeat_type = 0;                  /* Force greedy */
 4740       possessive_quantifier = TRUE;
 4741       ptr++;
 4742       }
 4743     else if (ptr[1] == CHAR_QUESTION_MARK)
 4744       {
 4745       repeat_type = greedy_non_default;
 4746       ptr++;
 4747       }
 4748     else repeat_type = greedy_default;
 4749 
 4750     /* If previous was a recursion call, wrap it in atomic brackets so that
 4751     previous becomes the atomic group. All recursions were so wrapped in the
 4752     past, but it no longer happens for non-repeated recursions. In fact, the
 4753     repeated ones could be re-implemented independently so as not to need this,
 4754     but for the moment we rely on the code for repeating groups. */
 4755 
 4756     if (*previous == OP_RECURSE)
 4757       {
 4758       memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
 4759       *previous = OP_ONCE;
 4760       PUT(previous, 1, 2 + 2*LINK_SIZE);
 4761       previous[2 + 2*LINK_SIZE] = OP_KET;
 4762       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
 4763       code += 2 + 2 * LINK_SIZE;
 4764       length_prevgroup = 3 + 3*LINK_SIZE;
 4765 
 4766       /* When actually compiling, we need to check whether this was a forward
 4767       reference, and if so, adjust the offset. */
 4768 
 4769       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
 4770         {
 4771         int offset = GET(cd->hwm, -LINK_SIZE);
 4772         if (offset == previous + 1 - cd->start_code)
 4773           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
 4774         }
 4775       }
 4776 
 4777     /* Now handle repetition for the different types of item. */
 4778 
 4779     /* If previous was a character match, abolish the item and generate a
 4780     repeat item instead. If a char item has a minimum of more than one, ensure
 4781     that it is set in reqchar - it might not be if a sequence such as x{3} is
 4782     the first thing in a branch because the x will have gone into firstchar
 4783     instead.  */
 4784 
 4785     if (*previous == OP_CHAR || *previous == OP_CHARI)
 4786       {
 4787       op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
 4788 
 4789       /* Deal with UTF characters that take up more than one character. It's
 4790       easier to write this out separately than try to macrify it. Use c to
 4791       hold the length of the character in bytes, plus UTF_LENGTH to flag that
 4792       it's a length rather than a small character. */
 4793 
 4794 #ifdef SUPPORT_UTF
 4795       if (utf && NOT_FIRSTCHAR(code[-1]))
 4796         {
 4797         pcre_uchar *lastchar = code - 1;
 4798         BACKCHAR(lastchar);
 4799         c = (int)(code - lastchar);     /* Length of UTF-8 character */
 4800         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
 4801         c |= UTF_LENGTH;                /* Flag c as a length */
 4802         }
 4803       else
 4804 #endif /* SUPPORT_UTF */
 4805 
 4806       /* Handle the case of a single character - either with no UTF support, or
 4807       with UTF disabled, or for a single character UTF character. */
 4808         {
 4809         c = code[-1];
 4810         if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;
 4811         }
 4812 
 4813       /* If the repetition is unlimited, it pays to see if the next thing on
 4814       the line is something that cannot possibly match this character. If so,
 4815       automatically possessifying this item gains some performance in the case
 4816       where the match fails. */
 4817 
 4818       if (!possessive_quantifier &&
 4819           repeat_max < 0 &&
 4820           check_auto_possessive(previous, utf, ptr + 1, options, cd))
 4821         {
 4822         repeat_type = 0;    /* Force greedy */
 4823         possessive_quantifier = TRUE;
 4824         }
 4825 
 4826       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
 4827       }
 4828 
 4829     /* If previous was a single negated character ([^a] or similar), we use
 4830     one of the special opcodes, replacing it. The code is shared with single-
 4831     character repeats by setting op_type to add a suitable offset into
 4832     repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
 4833     are currently used only for single-byte chars. */
 4834 
 4835     else if (*previous == OP_NOT || *previous == OP_NOTI)
 4836       {
 4837       op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;
 4838       c = previous[1];
 4839       if (!possessive_quantifier &&
 4840           repeat_max < 0 &&
 4841           check_auto_possessive(previous, utf, ptr + 1, options, cd))
 4842         {
 4843         repeat_type = 0;    /* Force greedy */
 4844         possessive_quantifier = TRUE;
 4845         }
 4846       goto OUTPUT_SINGLE_REPEAT;
 4847       }
 4848 
 4849     /* If previous was a character type match (\d or similar), abolish it and
 4850     create a suitable repeat item. The code is shared with single-character
 4851     repeats by setting op_type to add a suitable offset into repeat_type. Note
 4852     that the Unicode property types will be present only when SUPPORT_UCP is
 4853     defined, but we don't wrap the little bits of code here because it just
 4854     makes it horribly messy. */
 4855 
 4856     else if (*previous < OP_EODN)
 4857       {
 4858       pcre_uchar *oldcode;
 4859       int prop_type, prop_value;
 4860       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
 4861       c = *previous;
 4862 
 4863       if (!possessive_quantifier &&
 4864           repeat_max < 0 &&
 4865           check_auto_possessive(previous, utf, ptr + 1, options, cd))
 4866         {
 4867         repeat_type = 0;    /* Force greedy */
 4868         possessive_quantifier = TRUE;
 4869         }
 4870 
 4871       OUTPUT_SINGLE_REPEAT:
 4872       if (*previous == OP_PROP || *previous == OP_NOTPROP)
 4873         {
 4874         prop_type = previous[1];
 4875         prop_value = previous[2];
 4876         }
 4877       else prop_type = prop_value = -1;
 4878 
 4879       oldcode = code;
 4880       code = previous;                  /* Usually overwrite previous item */
 4881 
 4882       /* If the maximum is zero then the minimum must also be zero; Perl allows
 4883       this case, so we do too - by simply omitting the item altogether. */
 4884 
 4885       if (repeat_max == 0) goto END_REPEAT;
 4886 
 4887       /*--------------------------------------------------------------------*/
 4888       /* This code is obsolete from release 8.00; the restriction was finally
 4889       removed: */
 4890 
 4891       /* All real repeats make it impossible to handle partial matching (maybe
 4892       one day we will be able to remove this restriction). */
 4893 
 4894       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
 4895       /*--------------------------------------------------------------------*/
 4896 
 4897       /* Combine the op_type with the repeat_type */
 4898 
 4899       repeat_type += op_type;
 4900 
 4901       /* A minimum of zero is handled either as the special case * or ?, or as
 4902       an UPTO, with the maximum given. */
 4903 
 4904       if (repeat_min == 0)
 4905         {
 4906         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
 4907           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
 4908         else
 4909           {
 4910           *code++ = OP_UPTO + repeat_type;
 4911           PUT2INC(code, 0, repeat_max);
 4912           }
 4913         }
 4914 
 4915       /* A repeat minimum of 1 is optimized into some special cases. If the
 4916       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
 4917       left in place and, if the maximum is greater than 1, we use OP_UPTO with
 4918       one less than the maximum. */
 4919 
 4920       else if (repeat_min == 1)
 4921         {
 4922         if (repeat_max == -1)
 4923           *code++ = OP_PLUS + repeat_type;
 4924         else
 4925           {
 4926           code = oldcode;                 /* leave previous item in place */
 4927           if (repeat_max == 1) goto END_REPEAT;
 4928           *code++ = OP_UPTO + repeat_type;
 4929           PUT2INC(code, 0, repeat_max - 1);
 4930           }
 4931         }
 4932 
 4933       /* The case {n,n} is just an EXACT, while the general case {n,m} is
 4934       handled as an EXACT followed by an UPTO. */
 4935 
 4936       else
 4937         {
 4938         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
 4939         PUT2INC(code, 0, repeat_min);
 4940 
 4941         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
 4942         we have to insert the character for the previous code. For a repeated
 4943         Unicode property match, there are two extra bytes that define the
 4944         required property. In UTF-8 mode, long characters have their length in
 4945         c, with the UTF_LENGTH bit as a flag. */
 4946 
 4947         if (repeat_max < 0)
 4948           {
 4949 #ifdef SUPPORT_UTF
 4950           if (utf && (c & UTF_LENGTH) != 0)
 4951             {
 4952             memcpy(code, utf_chars, IN_UCHARS(c & 7));
 4953             code += c & 7;
 4954             }
 4955           else
 4956 #endif
 4957             {
 4958             *code++ = c;
 4959             if (prop_type >= 0)
 4960               {
 4961               *code++ = prop_type;
 4962               *code++ = prop_value;
 4963               }
 4964             }
 4965           *code++ = OP_STAR + repeat_type;
 4966           }
 4967 
 4968         /* Else insert an UPTO if the max is greater than the min, again
 4969         preceded by the character, for the previously inserted code. If the
 4970         UPTO is just for 1 instance, we can use QUERY instead. */
 4971 
 4972         else if (repeat_max != repeat_min)
 4973           {
 4974 #ifdef SUPPORT_UTF
 4975           if (utf && (c & UTF_LENGTH) != 0)
 4976             {
 4977             memcpy(code, utf_chars, IN_UCHARS(c & 7));
 4978             code += c & 7;
 4979             }
 4980           else
 4981 #endif
 4982           *code++ = c;
 4983           if (prop_type >= 0)
 4984             {
 4985             *code++ = prop_type;
 4986             *code++ = prop_value;
 4987             }
 4988           repeat_max -= repeat_min;
 4989 
 4990           if (repeat_max == 1)
 4991             {
 4992             *code++ = OP_QUERY + repeat_type;
 4993             }
 4994           else
 4995             {
 4996             *code++ = OP_UPTO + repeat_type;
 4997             PUT2INC(code, 0, repeat_max);
 4998             }
 4999           }
 5000         }
 5001 
 5002       /* The character or character type itself comes last in all cases. */
 5003 
 5004 #ifdef SUPPORT_UTF
 5005       if (utf && (c & UTF_LENGTH) != 0)
 5006         {
 5007         memcpy(code, utf_chars, IN_UCHARS(c & 7));
 5008         code += c & 7;
 5009         }
 5010       else
 5011 #endif
 5012       *code++ = c;
 5013 
 5014       /* For a repeated Unicode property match, there are two extra bytes that
 5015       define the required property. */
 5016 
 5017 #ifdef SUPPORT_UCP
 5018       if (prop_type >= 0)
 5019         {
 5020         *code++ = prop_type;
 5021         *code++ = prop_value;
 5022         }
 5023 #endif
 5024       }
 5025 
 5026     /* If previous was a character class or a back reference, we put the repeat
 5027     stuff after it, but just skip the item if the repeat was {0,0}. */
 5028 
 5029     else if (*previous == OP_CLASS ||
 5030              *previous == OP_NCLASS ||
 5031 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 5032              *previous == OP_XCLASS ||
 5033 #endif
 5034              *previous == OP_REF ||
 5035              *previous == OP_REFI)
 5036       {
 5037       if (repeat_max == 0)
 5038         {
 5039         code = previous;
 5040         goto END_REPEAT;
 5041         }
 5042 
 5043       /*--------------------------------------------------------------------*/
 5044       /* This code is obsolete from release 8.00; the restriction was finally
 5045       removed: */
 5046 
 5047       /* All real repeats make it impossible to handle partial matching (maybe
 5048       one day we will be able to remove this restriction). */
 5049 
 5050       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
 5051       /*--------------------------------------------------------------------*/
 5052 
 5053       if (repeat_min == 0 && repeat_max == -1)
 5054         *code++ = OP_CRSTAR + repeat_type;
 5055       else if (repeat_min == 1 && repeat_max == -1)
 5056         *code++ = OP_CRPLUS + repeat_type;
 5057       else if (repeat_min == 0 && repeat_max == 1)
 5058         *code++ = OP_CRQUERY + repeat_type;
 5059       else
 5060         {
 5061         *code++ = OP_CRRANGE + repeat_type;
 5062         PUT2INC(code, 0, repeat_min);
 5063         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
 5064         PUT2INC(code, 0, repeat_max);
 5065         }
 5066       }
 5067 
 5068     /* If previous was a bracket group, we may have to replicate it in certain
 5069     cases. Note that at this point we can encounter only the "basic" bracket
 5070     opcodes such as BRA and CBRA, as this is the place where they get converted
 5071     into the more special varieties such as BRAPOS and SBRA. A test for >=
 5072     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
 5073     ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
 5074     repetition of assertions, but now it does, for Perl compatibility. */
 5075 
 5076     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
 5077       {
 5078       register int i;
 5079       int len = (int)(code - previous);
 5080       pcre_uchar *bralink = NULL;
 5081       pcre_uchar *brazeroptr = NULL;
 5082 
 5083       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
 5084       we just ignore the repeat. */
 5085 
 5086       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
 5087         goto END_REPEAT;
 5088 
 5089       /* There is no sense in actually repeating assertions. The only potential
 5090       use of repetition is in cases when the assertion is optional. Therefore,
 5091       if the minimum is greater than zero, just ignore the repeat. If the
 5092       maximum is not zero or one, set it to 1. */
 5093 
 5094       if (*previous < OP_ONCE)    /* Assertion */
 5095         {
 5096         if (repeat_min > 0) goto END_REPEAT;
 5097         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
 5098         }
 5099 
 5100       /* The case of a zero minimum is special because of the need to stick
 5101       OP_BRAZERO in front of it, and because the group appears once in the
 5102       data, whereas in other cases it appears the minimum number of times. For
 5103       this reason, it is simplest to treat this case separately, as otherwise
 5104       the code gets far too messy. There are several special subcases when the
 5105       minimum is zero. */
 5106 
 5107       if (repeat_min == 0)
 5108         {
 5109         /* If the maximum is also zero, we used to just omit the group from the
 5110         output altogether, like this:
 5111 
 5112         ** if (repeat_max == 0)
 5113         **   {
 5114         **   code = previous;
 5115         **   goto END_REPEAT;
 5116         **   }
 5117 
 5118         However, that fails when a group or a subgroup within it is referenced
 5119         as a subroutine from elsewhere in the pattern, so now we stick in
 5120         OP_SKIPZERO in front of it so that it is skipped on execution. As we
 5121         don't have a list of which groups are referenced, we cannot do this
 5122         selectively.
 5123 
 5124         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
 5125         and do no more at this point. However, we do need to adjust any
 5126         OP_RECURSE calls inside the group that refer to the group itself or any
 5127         internal or forward referenced group, because the offset is from the
 5128         start of the whole regex. Temporarily terminate the pattern while doing
 5129         this. */
 5130 
 5131         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
 5132           {
 5133           *code = OP_END;
 5134           adjust_recurse(previous, 1, utf, cd, save_hwm);
 5135           memmove(previous + 1, previous, IN_UCHARS(len));
 5136           code++;
 5137           if (repeat_max == 0)
 5138             {
 5139             *previous++ = OP_SKIPZERO;
 5140             goto END_REPEAT;
 5141             }
 5142           brazeroptr = previous;    /* Save for possessive optimizing */
 5143           *previous++ = OP_BRAZERO + repeat_type;
 5144           }
 5145 
 5146         /* If the maximum is greater than 1 and limited, we have to replicate
 5147         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
 5148         The first one has to be handled carefully because it's the original
 5149         copy, which has to be moved up. The remainder can be handled by code
 5150         that is common with the non-zero minimum case below. We have to
 5151         adjust the value of repeat_max, since one less copy is required. Once
 5152         again, we may have to adjust any OP_RECURSE calls inside the group. */
 5153 
 5154         else
 5155           {
 5156           int offset;
 5157           *code = OP_END;
 5158           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
 5159           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
 5160           code += 2 + LINK_SIZE;
 5161           *previous++ = OP_BRAZERO + repeat_type;
 5162           *previous++ = OP_BRA;
 5163 
 5164           /* We chain together the bracket offset fields that have to be
 5165           filled in later when the ends of the brackets are reached. */
 5166 
 5167           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
 5168           bralink = previous;
 5169           PUTINC(previous, 0, offset);
 5170           }
 5171 
 5172         repeat_max--;
 5173         }
 5174 
 5175       /* If the minimum is greater than zero, replicate the group as many
 5176       times as necessary, and adjust the maximum to the number of subsequent
 5177       copies that we need. If we set a first char from the group, and didn't
 5178       set a required char, copy the latter from the former. If there are any
 5179       forward reference subroutine calls in the group, there will be entries on
 5180       the workspace list; replicate these with an appropriate increment. */
 5181 
 5182       else
 5183         {
 5184         if (repeat_min > 1)
 5185           {
 5186           /* In the pre-compile phase, we don't actually do the replication. We
 5187           just adjust the length as if we had. Do some paranoid checks for
 5188           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
 5189           integer type when available, otherwise double. */
 5190 
 5191           if (lengthptr != NULL)
 5192             {
 5193             int delta = (repeat_min - 1)*length_prevgroup;
 5194             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
 5195                   (INT64_OR_DOUBLE)length_prevgroup >
 5196                     (INT64_OR_DOUBLE)INT_MAX ||
 5197                 OFLOW_MAX - *lengthptr < delta)
 5198               {
 5199               *errorcodeptr = ERR20;
 5200               goto FAILED;
 5201               }
 5202             *lengthptr += delta;
 5203             }
 5204 
 5205           /* This is compiling for real. If there is a set first byte for
 5206           the group, and we have not yet set a "required byte", set it. Make
 5207           sure there is enough workspace for copying forward references before
 5208           doing the copy. */
 5209 
 5210           else
 5211             {
 5212             if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
 5213 
 5214             for (i = 1; i < repeat_min; i++)
 5215               {
 5216               pcre_uchar *hc;
 5217               pcre_uchar *this_hwm = cd->hwm;
 5218               memcpy(code, previous, IN_UCHARS(len));
 5219 
 5220               while (cd->hwm > cd->start_workspace + cd->workspace_size -
 5221                      WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
 5222                 {
 5223                 int save_offset = (int)(save_hwm - cd->start_workspace);
 5224                 int this_offset = (int)(this_hwm - cd->start_workspace);
 5225                 *errorcodeptr = expand_workspace(cd);
 5226                 if (*errorcodeptr != 0) goto FAILED;
 5227                 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
 5228                 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
 5229                 }
 5230 
 5231               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
 5232                 {
 5233                 PUT(cd->hwm, 0, GET(hc, 0) + len);
 5234                 cd->hwm += LINK_SIZE;
 5235                 }
 5236               save_hwm = this_hwm;
 5237               code += len;
 5238               }
 5239             }
 5240           }
 5241 
 5242         if (repeat_max > 0) repeat_max -= repeat_min;
 5243         }
 5244 
 5245       /* This code is common to both the zero and non-zero minimum cases. If
 5246       the maximum is limited, it replicates the group in a nested fashion,
 5247       remembering the bracket starts on a stack. In the case of a zero minimum,
 5248       the first one was set up above. In all cases the repeat_max now specifies
 5249       the number of additional copies needed. Again, we must remember to
 5250       replicate entries on the forward reference list. */
 5251 
 5252       if (repeat_max >= 0)
 5253         {
 5254         /* In the pre-compile phase, we don't actually do the replication. We
 5255         just adjust the length as if we had. For each repetition we must add 1
 5256         to the length for BRAZERO and for all but the last repetition we must
 5257         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
 5258         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
 5259         a 64-bit integer type when available, otherwise double. */
 5260 
 5261         if (lengthptr != NULL && repeat_max > 0)
 5262           {
 5263           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
 5264                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
 5265           if ((INT64_OR_DOUBLE)repeat_max *
 5266                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
 5267                   > (INT64_OR_DOUBLE)INT_MAX ||
 5268               OFLOW_MAX - *lengthptr < delta)
 5269             {
 5270             *errorcodeptr = ERR20;
 5271             goto FAILED;
 5272             }
 5273           *lengthptr += delta;
 5274           }
 5275 
 5276         /* This is compiling for real */
 5277 
 5278         else for (i = repeat_max - 1; i >= 0; i--)
 5279           {
 5280           pcre_uchar *hc;
 5281           pcre_uchar *this_hwm = cd->hwm;
 5282 
 5283           *code++ = OP_BRAZERO + repeat_type;
 5284 
 5285           /* All but the final copy start a new nesting, maintaining the
 5286           chain of brackets outstanding. */
 5287 
 5288           if (i != 0)
 5289             {
 5290             int offset;
 5291             *code++ = OP_BRA;
 5292             offset = (bralink == NULL)? 0 : (int)(code - bralink);
 5293             bralink = code;
 5294             PUTINC(code, 0, offset);
 5295             }
 5296 
 5297           memcpy(code, previous, IN_UCHARS(len));
 5298 
 5299           /* Ensure there is enough workspace for forward references before
 5300           copying them. */
 5301 
 5302           while (cd->hwm > cd->start_workspace + cd->workspace_size -
 5303                  WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
 5304             {
 5305             int save_offset = (int)(save_hwm - cd->start_workspace);
 5306             int this_offset = (int)(this_hwm - cd->start_workspace);
 5307             *errorcodeptr = expand_workspace(cd);
 5308             if (*errorcodeptr != 0) goto FAILED;
 5309             save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
 5310             this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
 5311             }
 5312 
 5313           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
 5314             {
 5315             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
 5316             cd->hwm += LINK_SIZE;
 5317             }
 5318           save_hwm = this_hwm;
 5319           code += len;
 5320           }
 5321 
 5322         /* Now chain through the pending brackets, and fill in their length
 5323         fields (which are holding the chain links pro tem). */
 5324 
 5325         while (bralink != NULL)
 5326           {
 5327           int oldlinkoffset;
 5328           int offset = (int)(code - bralink + 1);
 5329           pcre_uchar *bra = code - offset;
 5330           oldlinkoffset = GET(bra, 1);
 5331           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
 5332           *code++ = OP_KET;
 5333           PUTINC(code, 0, offset);
 5334           PUT(bra, 1, offset);
 5335           }
 5336         }
 5337 
 5338       /* If the maximum is unlimited, set a repeater in the final copy. For
 5339       ONCE brackets, that's all we need to do. However, possessively repeated
 5340       ONCE brackets can be converted into non-capturing brackets, as the
 5341       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
 5342       deal with possessive ONCEs specially.
 5343 
 5344       Otherwise, when we are doing the actual compile phase, check to see
 5345       whether this group is one that could match an empty string. If so,
 5346       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
 5347       that runtime checking can be done. [This check is also applied to ONCE
 5348       groups at runtime, but in a different way.]
 5349 
 5350       Then, if the quantifier was possessive and the bracket is not a
 5351       conditional, we convert the BRA code to the POS form, and the KET code to
 5352       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
 5353       subpattern at both the start and at the end.) The use of special opcodes
 5354       makes it possible to reduce greatly the stack usage in pcre_exec(). If
 5355       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
 5356 
 5357       Then, if the minimum number of matches is 1 or 0, cancel the possessive
 5358       flag so that the default action below, of wrapping everything inside
 5359       atomic brackets, does not happen. When the minimum is greater than 1,
 5360       there will be earlier copies of the group, and so we still have to wrap
 5361       the whole thing. */
 5362 
 5363       else
 5364         {
 5365         pcre_uchar *ketcode = code - 1 - LINK_SIZE;
 5366         pcre_uchar *bracode = ketcode - GET(ketcode, 1);
 5367 
 5368         /* Convert possessive ONCE brackets to non-capturing */
 5369 
 5370         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
 5371             possessive_quantifier) *bracode = OP_BRA;
 5372 
 5373         /* For non-possessive ONCE brackets, all we need to do is to
 5374         set the KET. */
 5375 
 5376         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
 5377           *ketcode = OP_KETRMAX + repeat_type;
 5378 
 5379         /* Handle non-ONCE brackets and possessive ONCEs (which have been
 5380         converted to non-capturing above). */
 5381 
 5382         else
 5383           {
 5384           /* In the compile phase, check for empty string matching. */
 5385 
 5386           if (lengthptr == NULL)
 5387             {
 5388             pcre_uchar *scode = bracode;
 5389             do
 5390               {
 5391               if (could_be_empty_branch(scode, ketcode, utf, cd))
 5392                 {
 5393                 *bracode += OP_SBRA - OP_BRA;
 5394                 break;
 5395                 }
 5396               scode += GET(scode, 1);
 5397               }
 5398             while (*scode == OP_ALT);
 5399             }
 5400 
 5401           /* Handle possessive quantifiers. */
 5402 
 5403           if (possessive_quantifier)
 5404             {
 5405             /* For COND brackets, we wrap the whole thing in a possessively
 5406             repeated non-capturing bracket, because we have not invented POS
 5407             versions of the COND opcodes. Because we are moving code along, we
 5408             must ensure that any pending recursive references are updated. */
 5409 
 5410             if (*bracode == OP_COND || *bracode == OP_SCOND)
 5411               {
 5412               int nlen = (int)(code - bracode);
 5413               *code = OP_END;
 5414               adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
 5415               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
 5416               code += 1 + LINK_SIZE;
 5417               nlen += 1 + LINK_SIZE;
 5418               *bracode = OP_BRAPOS;
 5419               *code++ = OP_KETRPOS;
 5420               PUTINC(code, 0, nlen);
 5421               PUT(bracode, 1, nlen);
 5422               }
 5423 
 5424             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
 5425 
 5426             else
 5427               {
 5428               *bracode += 1;              /* Switch to xxxPOS opcodes */
 5429               *ketcode = OP_KETRPOS;
 5430               }
 5431 
 5432             /* If the minimum is zero, mark it as possessive, then unset the
 5433             possessive flag when the minimum is 0 or 1. */
 5434 
 5435             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
 5436             if (repeat_min < 2) possessive_quantifier = FALSE;
 5437             }
 5438 
 5439           /* Non-possessive quantifier */
 5440 
 5441           else *ketcode = OP_KETRMAX + repeat_type;
 5442           }
 5443         }
 5444       }
 5445 
 5446     /* If previous is OP_FAIL, it was generated by an empty class [] in
 5447     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
 5448     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
 5449     error above. We can just ignore the repeat in JS case. */
 5450 
 5451     else if (*previous == OP_FAIL) goto END_REPEAT;
 5452 
 5453     /* Else there's some kind of shambles */
 5454 
 5455     else
 5456       {
 5457       *errorcodeptr = ERR11;
 5458       goto FAILED;
 5459       }
 5460 
 5461     /* If the character following a repeat is '+', or if certain optimization
 5462     tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
 5463     there are special alternative opcodes for this case. For anything else, we
 5464     wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
 5465     notation is just syntactic sugar, taken from Sun's Java package, but the
 5466     special opcodes can optimize it.
 5467 
 5468     Some (but not all) possessively repeated subpatterns have already been
 5469     completely handled in the code just above. For them, possessive_quantifier
 5470     is always FALSE at this stage.
 5471 
 5472     Note that the repeated item starts at tempcode, not at previous, which
 5473     might be the first part of a string whose (former) last char we repeated.
 5474 
 5475     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
 5476     an 'upto' may follow. We skip over an 'exact' item, and then test the
 5477     length of what remains before proceeding. */
 5478 
 5479     if (possessive_quantifier)
 5480       {
 5481       int len;
 5482 
 5483       if (*tempcode == OP_TYPEEXACT)
 5484         tempcode += PRIV(OP_lengths)[*tempcode] +
 5485           ((tempcode[1 + IMM2_SIZE] == OP_PROP
 5486           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
 5487 
 5488       else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
 5489         {
 5490         tempcode += PRIV(OP_lengths)[*tempcode];
 5491 #ifdef SUPPORT_UTF
 5492         if (utf && HAS_EXTRALEN(tempcode[-1]))
 5493           tempcode += GET_EXTRALEN(tempcode[-1]);
 5494 #endif
 5495         }
 5496 
 5497       len = (int)(code - tempcode);
 5498       if (len > 0) switch (*tempcode)
 5499         {
 5500         case OP_STAR:  *tempcode = OP_POSSTAR; break;
 5501         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
 5502         case OP_QUERY: *tempcode = OP_POSQUERY; break;
 5503         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
 5504 
 5505         case OP_STARI:  *tempcode = OP_POSSTARI; break;
 5506         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
 5507         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
 5508         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
 5509 
 5510         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
 5511         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
 5512         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
 5513         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
 5514 
 5515         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
 5516         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
 5517         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
 5518         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
 5519 
 5520         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
 5521         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
 5522         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
 5523         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
 5524 
 5525         /* Because we are moving code along, we must ensure that any
 5526         pending recursive references are updated. */
 5527 
 5528         default:
 5529         *code = OP_END;
 5530         adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
 5531         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
 5532         code += 1 + LINK_SIZE;
 5533         len += 1 + LINK_SIZE;
 5534         tempcode[0] = OP_ONCE;
 5535         *code++ = OP_KET;
 5536         PUTINC(code, 0, len);
 5537         PUT(tempcode, 1, len);
 5538         break;
 5539         }
 5540       }
 5541 
 5542     /* In all cases we no longer have a previous item. We also set the
 5543     "follows varying string" flag for subsequently encountered reqchars if
 5544     it isn't already set and we have just passed a varying length item. */
 5545 
 5546     END_REPEAT:
 5547     previous = NULL;
 5548     cd->req_varyopt |= reqvary;
 5549     break;
 5550 
 5551 
 5552     /* ===================================================================*/
 5553     /* Start of nested parenthesized sub-expression, or comment or lookahead or
 5554     lookbehind or option setting or condition or all the other extended
 5555     parenthesis forms.  */
 5556 
 5557     case CHAR_LEFT_PARENTHESIS:
 5558     newoptions = options;
 5559     skipbytes = 0;
 5560     bravalue = OP_CBRA;
 5561     save_hwm = cd->hwm;
 5562     reset_bracount = FALSE;
 5563 
 5564     /* First deal with various "verbs" that can be introduced by '*'. */
 5565 
 5566     ptr++;
 5567     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
 5568          || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
 5569       {
 5570       int i, namelen;
 5571       int arglen = 0;
 5572       const char *vn = verbnames;
 5573       const pcre_uchar *name = ptr + 1;
 5574       const pcre_uchar *arg = NULL;
 5575       previous = NULL;
 5576       ptr++;
 5577       while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
 5578       namelen = (int)(ptr - name);
 5579 
 5580       /* It appears that Perl allows any characters whatsoever, other than
 5581       a closing parenthesis, to appear in arguments, so we no longer insist on
 5582       letters, digits, and underscores. */
 5583 
 5584       if (*ptr == CHAR_COLON)
 5585         {
 5586         arg = ++ptr;
 5587         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
 5588         arglen = (int)(ptr - arg);
 5589         }
 5590 
 5591       if (*ptr != CHAR_RIGHT_PARENTHESIS)
 5592         {
 5593         *errorcodeptr = ERR60;
 5594         goto FAILED;
 5595         }
 5596 
 5597       /* Scan the table of verb names */
 5598 
 5599       for (i = 0; i < verbcount; i++)
 5600         {
 5601         if (namelen == verbs[i].len &&
 5602             STRNCMP_UC_C8(name, vn, namelen) == 0)
 5603           {
 5604           /* Check for open captures before ACCEPT and convert it to
 5605           ASSERT_ACCEPT if in an assertion. */
 5606 
 5607           if (verbs[i].op == OP_ACCEPT)
 5608             {
 5609             open_capitem *oc;
 5610             if (arglen != 0)
 5611               {
 5612               *errorcodeptr = ERR59;
 5613               goto FAILED;
 5614               }
 5615             cd->had_accept = TRUE;
 5616             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
 5617               {
 5618               *code++ = OP_CLOSE;
 5619               PUT2INC(code, 0, oc->number);
 5620               }
 5621             *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
 5622 
 5623             /* Do not set firstchar after *ACCEPT */
 5624             if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
 5625             }
 5626 
 5627           /* Handle other cases with/without an argument */
 5628 
 5629           else if (arglen == 0)
 5630             {
 5631             if (verbs[i].op < 0)   /* Argument is mandatory */
 5632               {
 5633               *errorcodeptr = ERR66;
 5634               goto FAILED;
 5635               }
 5636             *code = verbs[i].op;
 5637             if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;
 5638