"Fossies" - the Fresh Open Source Software Archive

Member "tin-2.4.1/pcre/pcre_compile.c" (28 Aug 2013, 172486 Bytes) of package /linux/misc/tin-2.4.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "pcre_compile.c", see the Fossies "Dox" file reference documentation.

    1 /*************************************************
    2 *      Perl-Compatible Regular Expressions       *
    3 *************************************************/
    4 
    5 /* PCRE is a library of functions to support regular expressions whose syntax
    6 and semantics are as close as possible to those of the Perl 5 language.
    7 
    8                        Written by Philip Hazel
    9            Copyright (c) 1997-2006 University of Cambridge
   10 
   11 -----------------------------------------------------------------------------
   12 Redistribution and use in source and binary forms, with or without
   13 modification, are permitted provided that the following conditions are met:
   14 
   15     * Redistributions of source code must retain the above copyright notice,
   16       this list of conditions and the following disclaimer.
   17 
   18     * Redistributions in binary form must reproduce the above copyright
   19       notice, this list of conditions and the following disclaimer in the
   20       documentation and/or other materials provided with the distribution.
   21 
   22     * Neither the name of the University of Cambridge nor the names of its
   23       contributors may be used to endorse or promote products derived from
   24       this software without specific prior written permission.
   25 
   26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   36 POSSIBILITY OF SUCH DAMAGE.
   37 -----------------------------------------------------------------------------
   38 */
   39 
   40 
   41 /* This module contains the external function pcre_compile(), along with
   42 supporting internal functions that are not used by other modules. */
   43 
   44 
   45 #define NLBLOCK cd             /* Block containing newline information */
   46 #define PSSTART start_pattern  /* Field containing processed string start */
   47 #define PSEND   end_pattern    /* Field containing processed string end */
   48 
   49 
   50 #include "pcre_internal.h"
   51 
   52 
   53 /* When DEBUG is defined, we need the pcre_printint() function, which is also
   54 used by pcretest. DEBUG is not defined when building a production library. */
   55 
   56 #ifdef DEBUG
   57 #include "pcre_printint.src"
   58 #endif
   59 
   60 
   61 /*************************************************
   62 *      Code parameters and static tables         *
   63 *************************************************/
   64 
   65 /* This value specifies the size of stack workspace that is used during the
   66 first pre-compile phase that determines how much memory is required. The regex
   67 is partly compiled into this space, but the compiled parts are discarded as
   68 soon as they can be, so that hopefully there will never be an overrun. The code
   69 does, however, check for an overrun. The largest amount I've seen used is 218,
   70 so this number is very generous.
   71 
   72 The same workspace is used during the second, actual compile phase for
   73 remembering forward references to groups so that they can be filled in at the
   74 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
   75 is 4 there is plenty of room. */
   76 
   77 #define COMPILE_WORK_SIZE (4096)
   78 
   79 
   80 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
   81 are simple data values; negative values are for special things like \d and so
   82 on. Zero means further processing is needed (for things like \x), or the escape
   83 is invalid. */
   84 
   85 #if !EBCDIC   /* This is the "normal" table for ASCII systems */
/* check_escape() indexes this table with (c - '0') after checking that c lies
in the range '0' to 'z', so the first entry belongs to '0' and the last to 'z'.
A non-zero entry is returned to the caller as it stands; a zero entry sends
check_escape() on to its switch for further processing. */
   86 static const short int escapes[] = {
   87      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
   88      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
   89    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
   90      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
   91 -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
   92 -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
   93    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
   94      0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
   95 -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
   96      0,      0, -ESC_z                                            /* x - z */
   97 };
   98 
   99 #else         /* This is the "abnormal" table for EBCDIC systems */
/* In the EBCDIC build check_escape() indexes this table with (c - 0x48). The
hexadecimal number in the comment at the start of each row is the character
code of that row's first entry. */
  100 static const short int escapes[] = {
  101 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
  102 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
  103 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
  104 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
  105 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
  106 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
  107 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
  108 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
  109 /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
  110 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
  111 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
  112 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
  113 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
  114 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
  115 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
  116 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
  117 /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
  118 /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
  119 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
  120 /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
  121 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
  122 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
  123 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
  124 };
  125 #endif
  126 
  127 
  128 /* Tables of names of POSIX character classes and their lengths. The list is
  129 terminated by a zero length entry. The first three must be alpha, lower, upper,
  130 as this is assumed for handling case independence. */
  131 
  132 static const char *const posix_names[] = {
  /* The order of the names matters: posix_name_lengths[] holds the length of
  each name in this same order, and the per-class comments in
  posix_class_maps[] list the classes in this same order too. */
  133   "alpha", "lower", "upper",
  134   "alnum", "ascii", "blank", "cntrl", "digit", "graph",
  135   "print", "punct", "space", "word",  "xdigit" };
  136 
  137 static const uschar posix_name_lengths[] = {
  /* One length per entry of posix_names[], in the same order; the final 0 is
  the terminating entry and has no matching name. */
  138   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
  139 
  140 /* Table of class bit maps for each POSIX class. Each class is formed from a
  141 base map, with an optional addition or removal of another map. Then, for some
  142 classes, there is some additional tweaking: for [:blank:] the vertical space
  143 characters are removed, and for [:alpha:] and [:alnum:] the underscore
  144 character is removed. The triples in the table consist of the base map offset,
  145 second map offset or -1 if no second map, and a non-negative value for map
  146 addition or a negative value for map subtraction (if there are two maps). The
  147 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
  148 remove vertical space characters, 2 => remove underscore. */
  149 
  150 static const int posix_class_maps[] = {
  /* Three consecutive ints per class: base map offset, second map offset (or
  -1), and the add/subtract/tweak code. The rows follow the order of the names
  in posix_names[], as the comment on each row shows. */
  151   cbit_word,  cbit_digit, -2,             /* alpha */
  152   cbit_lower, -1,          0,             /* lower */
  153   cbit_upper, -1,          0,             /* upper */
  154   cbit_word,  -1,          2,             /* alnum - word without underscore */
  155   cbit_print, cbit_cntrl,  0,             /* ascii */
  156   cbit_space, -1,          1,             /* blank - a GNU extension */
  157   cbit_cntrl, -1,          0,             /* cntrl */
  158   cbit_digit, -1,          0,             /* digit */
  159   cbit_graph, -1,          0,             /* graph */
  160   cbit_print, -1,          0,             /* print */
  161   cbit_punct, -1,          0,             /* punct */
  162   cbit_space, -1,          0,             /* space */
  163   cbit_word,  -1,          0,             /* word - a Perl extension */
  164   cbit_xdigit,-1,          0              /* xdigit */
  165 };
  166 
  167 
  168 #define STRING(a)  # a
  169 #define XSTRING(s) STRING(s)
  170 
  171 /* The texts of compile-time error messages. These are "char *" because they
  172 are passed to the outside world. Do not ever re-use any error number, because
  173 they are documented. Always add a new error instead. Messages marked DEAD below
  174 are no longer used. */
  175 
  176 static const char *error_texts[] = {
  177   "no error",
  178   "\\ at end of pattern",
  179   "\\c at end of pattern",
  180   "unrecognized character follows \\",
  181   "numbers out of order in {} quantifier",
  182   /* 5 */
  183   "number too big in {} quantifier",
  184   "missing terminating ] for character class",
  185   "invalid escape sequence in character class",
  186   "range out of order in character class",
  187   "nothing to repeat",
  188   /* 10 */
  189   "operand of unlimited repeat could match the empty string",  /** DEAD **/
  190   "internal error: unexpected repeat",
  191   "unrecognized character after (?",
  192   "POSIX named classes are supported only within a class",
  193   "missing )",
  194   /* 15 */
  195   "reference to non-existent subpattern",
  196   "erroffset passed as NULL",
  197   "unknown option bit(s) set",
  198   "missing ) after comment",
  199   "parentheses nested too deeply",  /** DEAD **/
  200   /* 20 */
  201   "regular expression too large",
  202   "failed to get memory",
  203   "unmatched parentheses",
  204   "internal error: code overflow",
  205   "unrecognized character after (?<",
  206   /* 25 */
  207   "lookbehind assertion is not fixed length",
  208   "malformed number or name after (?(",
  209   "conditional group contains more than two branches",
  210   "assertion expected after (?(",
  211   "(?R or (?digits must be followed by )",
  212   /* 30 */
  213   "unknown POSIX class name",
  214   "POSIX collating elements are not supported",
  215   "this version of PCRE is not compiled with PCRE_UTF8 support",
  216   "spare error",  /** DEAD **/
  217   "character value in \\x{...} sequence is too large",
  218   /* 35 */
  219   "invalid condition (?(0)",
  220   "\\C not allowed in lookbehind assertion",
  221   "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  222   "number after (?C is > 255",
  223   "closing ) for (?C expected",
  224   /* 40 */
  225   "recursive call could loop indefinitely",
  226   "unrecognized character after (?P",
  227   "syntax error in subpattern name (missing terminator)",
  228   "two named subpatterns have the same name",
  229   "invalid UTF-8 string",
  230   /* 45 */
  231   "support for \\P, \\p, and \\X has not been compiled",
  232   "malformed \\P or \\p sequence",
  233   "unknown property name after \\P or \\p",
  234   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  235   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  236   /* 50 */
  237   "repeated subpattern is too long",
  238   "octal value is greater than \\377 (not in UTF-8 mode)",
  239   "internal error: overran compiling workspace",
  240   "internal error: previously-checked referenced subpattern not found",
  241   "DEFINE group contains more than one branch",
  242   /* 55 */
  243   "repeating a DEFINE group is not allowed",
  244   "inconsistent NEWLINE options",
  245   "\\g is not followed by an (optionally braced) non-zero number"
  246 };
  247 
  248 
  249 /* Table to identify digits and hex digits. This is used when compiling
  250 patterns. Note that the tables in chartables are dependent on the locale, and
  251 may mark arbitrary characters as digits - but the PCRE compiling code expects
  252 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
  253 a private table here. It costs 256 bytes, but it is a lot faster than doing
  254 character value tests (at least in some simple cases I timed), and in some
  255 applications one wants PCRE to compile efficiently as well as match
  256 efficiently.
  257 
  258 For convenience, we use the same bit definitions as in chartables:
  259 
  260   0x04   decimal digit
  261   0x08   hexadecimal digit
  262 
  263 Then we can use ctype_digit and ctype_xdigit in the code. */
  264 
  265 #if !EBCDIC    /* This is the "normal" case, for ASCII systems */
  266 static const unsigned char digitab[] =
  /* Indexed by character value (256 entries). 0x0c marks a character that is
  both a decimal and a hexadecimal digit ('0' to '9'); 0x08 marks one that is a
  hexadecimal digit only ('A' to 'F' and 'a' to 'f'). The code in this file
  tests entries with ctype_digit and ctype_xdigit. */
  267   {
  268   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  269   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  270   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  271   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  272   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  273   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  274   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  275   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  276   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  277   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  278   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  279   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  280   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  281   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  282   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  283   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  284   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  285   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  286   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  287   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  288   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  289   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  290   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  291   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  292   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  293   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  294   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  295   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  296   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  297   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  298   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  299   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
  300 
  301 #else          /* This is the "abnormal" case, for EBCDIC systems */
  302 static const unsigned char digitab[] =
  /* Same flag values as the ASCII table, laid out by EBCDIC character code.
  The trailing hex number on alternate rows is the code of that row's first
  entry. */
  303   {
  304   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  305   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  306   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  307   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  308   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  309   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  310   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  311   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  312   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  313   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  314   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  315   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88-      */
  316   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  317   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  318   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  319   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  320   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  321   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  322   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  323   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  324   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  325   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  326   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  327   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  328   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  329   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  330   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  331   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  332   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  333   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  334   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  335   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
  336 
  337 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  /* Only present in the EBCDIC build. check_escape() treats a character c as
  "not alphameric" when (ebcdic_chartab[c] & 0x0E) is zero. The meaning of the
  individual bits is not defined in this part of the file; per the comment
  above they presumably mirror the chartables flag bits. */
  338   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  339   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  340   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  341   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  342   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  343   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  344   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  345   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  346   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  347   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  348   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  349   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88-   */
  350   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  351   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  352   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  353   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  354   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  355   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  356   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  357   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  358   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  359   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  360   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  361   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  362   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  363   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  364   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  365   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  366   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  367   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  368   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  369   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
  370 #endif
  371 
  372 
  373 /* Forward declaration to allow mutual recursion */
  374 
  375 static BOOL
  376   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
  377     int *, branch_chain *, compile_data *, int *);
  378 
  379 
  380 
  381 /*************************************************
  382 *            Handle escapes                      *
  383 *************************************************/
  384 
  385 /* This function is called when a \ has been encountered. It either returns a
  386 positive value for a simple escape such as \n, or a negative value which
  387 encodes one of the more complicated things such as \d. A backreference to group
  388 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
  389 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
  390 ptr is pointing at the \. On exit, it is on the final character of the escape
  391 sequence.
  392 
  393 Arguments:
  394   ptrptr         points to the pattern position pointer
  395   errorcodeptr   points to the errorcode variable
  396   bracount       number of previous extracting brackets
  397   options        the options bits
  398   isclass        TRUE if inside a character class
  399 
  400 Returns:         zero or positive => a data character
  401                  negative => a special escape sequence
  402                  on error, errorptr is set
  403 */
  404 
  405 static int
  406 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
  407   int options, BOOL isclass)
  408 {
  409 BOOL utf8 = (options & PCRE_UTF8) != 0;
  410 const uschar *ptr = *ptrptr + 1;
  411 int c, i;
  412 
  413 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
  414 ptr--;                            /* Set pointer back to the last byte */
  415 
  416 /* If backslash is at the end of the pattern, it's an error. */
  417 
  418 if (c == 0) *errorcodeptr = ERR1;
  419 
  420 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
  421 a table. A non-zero result is something that can be returned immediately.
  422 Otherwise further processing may be required. */
  423 
  424 #if !EBCDIC    /* ASCII coding */
  425 else if (c < '0' || c > 'z') {}                           /* Not alphameric */
  426 else if ((i = escapes[c - '0']) != 0) c = i;
  427 
  428 #else          /* EBCDIC coding */
  429 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
  430 else if ((i = escapes[c - 0x48]) != 0)  c = i;
  431 #endif
  432 
  433 /* Escapes that need further processing, or are illegal. */
  434 
  435 else
  436   {
  437   const uschar *oldptr;
  438   BOOL braced, negated;
  439 
  440   switch (c)
  441     {
  442     /* A number of Perl escapes are not handled by PCRE. We give an explicit
  443     error. */
  444 
  445     case 'l':
  446     case 'L':
  447     case 'N':
  448     case 'u':
  449     case 'U':
  450     *errorcodeptr = ERR37;
  451     break;
  452 
  453     /* \g must be followed by a number, either plain or braced. If positive, it
  454     is an absolute backreference. If negative, it is a relative backreference.
  455     This is a Perl 5.10 feature. */
  456 
  457     case 'g':
  458     if (ptr[1] == '{')
  459       {
  460       braced = TRUE;
  461       ptr++;
  462       }
  463     else braced = FALSE;
  464 
  465     if (ptr[1] == '-')
  466       {
  467       negated = TRUE;
  468       ptr++;
  469       }
  470     else negated = FALSE;
  471 
  472     c = 0;
  473     while ((digitab[ptr[1]] & ctype_digit) != 0)
  474       c = c * 10 + *(++ptr) - '0';
  475 
  476     if (c == 0 || (braced && *(++ptr) != '}'))
  477       {
  478       *errorcodeptr = ERR57;
  479       return 0;
  480       }
  481 
  482     if (negated)
  483       {
  484       if (c > bracount)
  485         {
  486         *errorcodeptr = ERR15;
  487         return 0;
  488         }
  489       c = bracount - (c - 1);
  490       }
  491 
  492     c = -(ESC_REF + c);
  493     break;
  494 
  495     /* The handling of escape sequences consisting of a string of digits
  496     starting with one that is not zero is not straightforward. By experiment,
  497     the way Perl works seems to be as follows:
  498 
  499     Outside a character class, the digits are read as a decimal number. If the
  500     number is less than 10, or if there are that many previous extracting
  501     left brackets, then it is a back reference. Otherwise, up to three octal
  502     digits are read to form an escaped byte. Thus \123 is likely to be octal
  503     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
  504     value is greater than 377, the least significant 8 bits are taken. Inside a
  505     character class, \ followed by a digit is always an octal number. */
  506 
  507     case '1': case '2': case '3': case '4': case '5':
  508     case '6': case '7': case '8': case '9':
  509 
  510     if (!isclass)
  511       {
  512       oldptr = ptr;
  513       c -= '0';
  514       while ((digitab[ptr[1]] & ctype_digit) != 0)
  515         c = c * 10 + *(++ptr) - '0';
  516       if (c < 10 || c <= bracount)
  517         {
  518         c = -(ESC_REF + c);
  519         break;
  520         }
  521       ptr = oldptr;      /* Put the pointer back and fall through */
  522       }
  523 
  524     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
  525     generates a binary zero byte and treats the digit as a following literal.
  526     Thus we have to pull back the pointer by one. */
  527 
  528     if ((c = *ptr) >= '8')
  529       {
  530       ptr--;
  531       c = 0;
  532       break;
  533       }
  534 
  535     /* \0 always starts an octal number, but we may drop through to here with a
  536     larger first octal digit. The original code used just to take the least
  537     significant 8 bits of octal numbers (I think this is what early Perls used
  538     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
  539     than 3 octal digits. */
  540 
  541     case '0':
  542     c -= '0';
  543     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
  544         c = c * 8 + *(++ptr) - '0';
  545     if (!utf8 && c > 255) *errorcodeptr = ERR51;
  546     break;
  547 
  548     /* \x is complicated. \x{ddd} is a character number which can be greater
  549     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
  550     treated as a data character. */
  551 
  552     case 'x':
  553     if (ptr[1] == '{')
  554       {
  555       const uschar *pt = ptr + 2;
  556       int count = 0;
  557 
  558       c = 0;
  559       while ((digitab[*pt] & ctype_xdigit) != 0)
  560         {
  561         register int cc = *pt++;
  562         if (c == 0 && cc == '0') continue;     /* Leading zeroes */
  563         count++;
  564 
  565 #if !EBCDIC    /* ASCII coding */
  566         if (cc >= 'a') cc -= 32;               /* Convert to upper case */
  567         c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
  568 #else          /* EBCDIC coding */
  569         if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
  570         c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
  571 #endif
  572         }
  573 
  574       if (*pt == '}')
  575         {
  576         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
  577         ptr = pt;
  578         break;
  579         }
  580 
  581       /* If the sequence of hex digits does not end with '}', then we don't
  582       recognize this construct; fall through to the normal \x handling. */
  583       }
  584 
  585     /* Read just a single-byte hex-defined char */
  586 
  587     c = 0;
  588     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
  589       {
  590       int cc;                               /* Some compilers don't like ++ */
  591       cc = *(++ptr);                        /* in initializers */
  592 #if !EBCDIC    /* ASCII coding */
  593       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
  594       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
  595 #else          /* EBCDIC coding */
  596       if (cc <= 'z') cc += 64;              /* Convert to upper case */
  597       c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
  598 #endif
  599       }
  600     break;
  601 
  602     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
  603     This coding is ASCII-specific, but then the whole concept of \cx is
  604     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
  605 
  606     case 'c':
  607     c = *(++ptr);
  608     if (c == 0)
  609       {
  610       *errorcodeptr = ERR2;
  611       return 0;
  612       }
  613 
  614 #if !EBCDIC    /* ASCII coding */
  615     if (c >= 'a' && c <= 'z') c -= 32;
  616     c ^= 0x40;
  617 #else          /* EBCDIC coding */
  618     if (c >= 'a' && c <= 'z') c += 64;
  619     c ^= 0xC0;
  620 #endif
  621     break;
  622 
  623     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
  624     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
  625     for Perl compatibility, it is a literal. This code looks a bit odd, but
  626     there used to be some cases other than the default, and there may be again
  627     in future, so I haven't "optimized" it. */
  628 
  629     default:
  630     if ((options & PCRE_EXTRA) != 0) switch(c)
  631       {
  632       default:
  633       *errorcodeptr = ERR3;
  634       break;
  635       }
  636     break;
  637     }
  638   }
  639 
  640 *ptrptr = ptr;
  641 return c;
  642 }
  643 
  644 
  645 
  646 #ifdef SUPPORT_UCP
  647 /*************************************************
  648 *               Handle \P and \p                 *
  649 *************************************************/
  650 
  651 /* This function is called after \P or \p has been encountered, provided that
  652 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
  653 pointing at the P or p. On exit, it is pointing at the final character of the
  654 escape sequence.
  655 
  656 Argument:
  657   ptrptr         points to the pattern position pointer
  658   negptr         points to a boolean that is set TRUE for negation else FALSE
  659   dptr           points to an int that is set to the detailed property value
  660   errorcodeptr   points to the error code variable
  661 
  662 Returns:         type value from ucp_type_table, or -1 for an invalid type
  663 */
  664 
  665 static int
  666 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
  667 {
  668 int c, i, bot, top;
  669 const uschar *ptr = *ptrptr;
  670 char name[32];
  671 
  672 c = *(++ptr);
  673 if (c == 0) goto ERROR_RETURN;
  674 
  675 *negptr = FALSE;
  676 
  677 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
  678 negation. */
  679 
  680 if (c == '{')
  681   {
  682   if (ptr[1] == '^')
  683     {
  684     *negptr = TRUE;
  685     ptr++;
  686     }
  687   for (i = 0; i < sizeof(name) - 1; i++)
  688     {
  689     c = *(++ptr);
  690     if (c == 0) goto ERROR_RETURN;
  691     if (c == '}') break;
  692     name[i] = c;
  693     }
  694   if (c !='}') goto ERROR_RETURN;
  695   name[i] = 0;
  696   }
  697 
  698 /* Otherwise there is just one following character */
  699 
  700 else
  701   {
  702   name[0] = c;
  703   name[1] = 0;
  704   }
  705 
  706 *ptrptr = ptr;
  707 
  708 /* Search for a recognized property name using binary chop */
  709 
  710 bot = 0;
  711 top = _pcre_utt_size;
  712 
  713 while (bot < top)
  714   {
  715   i = (bot + top) >> 1;
  716   c = strcmp(name, _pcre_utt[i].name);
  717   if (c == 0)
  718     {
  719     *dptr = _pcre_utt[i].value;
  720     return _pcre_utt[i].type;
  721     }
  722   if (c > 0) bot = i + 1; else top = i;
  723   }
  724 
  725 *errorcodeptr = ERR47;
  726 *ptrptr = ptr;
  727 return -1;
  728 
  729 ERROR_RETURN:
  730 *errorcodeptr = ERR46;
  731 *ptrptr = ptr;
  732 return -1;
  733 }
  734 #endif
  735 
  736 
  737 
  738 
  739 /*************************************************
  740 *            Check for counted repeat            *
  741 *************************************************/
  742 
  743 /* This function is called when a '{' is encountered in a place where it might
  744 start a quantifier. It looks ahead to see if it really is a quantifier or not.
  745 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
  746 where the ddds are digits.
  747 
  748 Arguments:
  749   p         pointer to the first char after '{'
  750 
  751 Returns:    TRUE or FALSE
  752 */
  753 
  754 static BOOL
  755 is_counted_repeat(const uschar *p)
  756 {
  757 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
  758 while ((digitab[*p] & ctype_digit) != 0) p++;
  759 if (*p == '}') return TRUE;
  760 
  761 if (*p++ != ',') return FALSE;
  762 if (*p == '}') return TRUE;
  763 
  764 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
  765 while ((digitab[*p] & ctype_digit) != 0) p++;
  766 
  767 return (*p == '}');
  768 }
  769 
  770 
  771 
  772 /*************************************************
  773 *         Read repeat counts                     *
  774 *************************************************/
  775 
  776 /* Read an item of the form {n,m} and return the values. This is called only
  777 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
  778 so the syntax is guaranteed to be correct, but we need to check the values.
  779 
  780 Arguments:
  781   p              pointer to first char after '{'
  782   minp           pointer to int for min
  783   maxp           pointer to int for max
  784                  returned as -1 if no max
  785   errorcodeptr   points to error code variable
  786 
  787 Returns:         pointer to '}' on success;
  788                  current ptr on error, with errorcodeptr set non-zero
  789 */
  790 
  791 static const uschar *
  792 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
  793 {
  794 int min = 0;
  795 int max = -1;
  796 
  797 /* Read the minimum value and do a paranoid check: a negative value indicates
  798 an integer overflow. */
  799 
  800 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
  801 if (min < 0 || min > 65535)
  802   {
  803   *errorcodeptr = ERR5;
  804   return p;
  805   }
  806 
  807 /* Read the maximum value if there is one, and again do a paranoid on its size.
  808 Also, max must not be less than min. */
  809 
  810 if (*p == '}') max = min; else
  811   {
  812   if (*(++p) != '}')
  813     {
  814     max = 0;
  815     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
  816     if (max < 0 || max > 65535)
  817       {
  818       *errorcodeptr = ERR5;
  819       return p;
  820       }
  821     if (max < min)
  822       {
  823       *errorcodeptr = ERR4;
  824       return p;
  825       }
  826     }
  827   }
  828 
  829 /* Fill in the required variables, and pass back the pointer to the terminating
  830 '}'. */
  831 
  832 *minp = min;
  833 *maxp = max;
  834 return p;
  835 }
  836 
  837 
  838 
  839 /*************************************************
  840 *       Find forward referenced subpattern       *
  841 *************************************************/
  842 
  843 /* This function scans along a pattern's text looking for capturing
  844 subpatterns, and counting them. If it finds a named pattern that matches the
  845 name it is given, it returns its number. Alternatively, if the name is NULL, it
  846 returns when it reaches a given numbered subpattern. This is used for forward
  847 references to subpatterns. We know that if (?P< is encountered, the name will
  848 be terminated by '>' because that is checked in the first pass.
  849 
  850 Arguments:
  851   ptr          current position in the pattern
  852   count        current count of capturing parens so far encountered
  853   name         name to seek, or NULL if seeking a numbered subpattern
  854   lorn         name length, or subpattern number if name is NULL
  855   xmode        TRUE if we are in /x mode
  856 
  857 Returns:       the number of the named subpattern, or -1 if not found
  858 */
  859 
  860 static int
  861 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
  862   BOOL xmode)
  863 {
  864 const uschar *thisname;
  865 
  866 for (; *ptr != 0; ptr++)
  867   {
  868   int term;
  869 
  870   /* Skip over backslashed characters and also entire \Q...\E */
  871 
  872   if (*ptr == '\\')
  873     {
  874     if (*(++ptr) == 0) return -1;
  875     if (*ptr == 'Q') for (;;)
  876       {
  877       while (*(++ptr) != 0 && *ptr != '\\');
  878       if (*ptr == 0) return -1;
  879       if (*(++ptr) == 'E') break;
  880       }
  881     continue;
  882     }
  883 
  884   /* Skip over character classes */
  885 
  886   if (*ptr == '[')
  887     {
  888     while (*(++ptr) != ']')
  889       {
  890       if (*ptr == '\\')
  891         {
  892         if (*(++ptr) == 0) return -1;
  893         if (*ptr == 'Q') for (;;)
  894           {
  895           while (*(++ptr) != 0 && *ptr != '\\');
  896           if (*ptr == 0) return -1;
  897           if (*(++ptr) == 'E') break;
  898           }
  899         continue;
  900         }
  901       }
  902     continue;
  903     }
  904 
  905   /* Skip comments in /x mode */
  906 
  907   if (xmode && *ptr == '#')
  908     {
  909     while (*(++ptr) != 0 && *ptr != '\n');
  910     if (*ptr == 0) return -1;
  911     continue;
  912     }
  913 
  914   /* An opening parens must now be a real metacharacter */
  915 
  916   if (*ptr != '(') continue;
  917   if (ptr[1] != '?')
  918     {
  919     count++;
  920     if (name == NULL && count == lorn) return count;
  921     continue;
  922     }
  923 
  924   ptr += 2;
  925   if (*ptr == 'P') ptr++;                      /* Allow optional P */
  926 
  927   /* We have to disambiguate (?<! and (?<= from (?<name> */
  928 
  929   if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
  930        *ptr != '\'')
  931     continue;
  932 
  933   count++;
  934 
  935   if (name == NULL && count == lorn) return count;
  936   term = *ptr++;
  937   if (term == '<') term = '>';
  938   thisname = ptr;
  939   while (*ptr != term) ptr++;
  940   if (name != NULL && lorn == ptr - thisname &&
  941       strncmp((const char *)name, (const char *)thisname, lorn) == 0)
  942     return count;
  943   }
  944 
  945 return -1;
  946 }
  947 
  948 
  949 
  950 /*************************************************
  951 *      Find first significant op code            *
  952 *************************************************/
  953 
  954 /* This is called by several functions that scan a compiled expression looking
  955 for a fixed first character, or an anchoring op code etc. It skips over things
  956 that do not influence this. For some calls, a change of option is important.
  957 For some calls, it makes sense to skip negative forward and all backward
  958 assertions, and also the \b assertion; for others it does not.
  959 
  960 Arguments:
  961   code         pointer to the start of the group
  962   options      pointer to external options
  963   optbit       the option bit whose changing is significant, or
  964                  zero if none are
  965   skipassert   TRUE if certain assertions are to be skipped
  966 
  967 Returns:       pointer to the first significant opcode
  968 */
  969 
  970 static const uschar*
  971 first_significant_code(const uschar *code, int *options, int optbit,
  972   BOOL skipassert)
  973 {
  974 for (;;)
  975   {
  976   switch ((int)*code)
  977     {
  978     case OP_OPT:
  979     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
  980       *options = (int)code[1];
  981     code += 2;
  982     break;
  983 
  984     case OP_ASSERT_NOT:
  985     case OP_ASSERTBACK:
  986     case OP_ASSERTBACK_NOT:
  987     if (!skipassert) return code;
  988     do code += GET(code, 1); while (*code == OP_ALT);
  989     code += _pcre_OP_lengths[*code];
  990     break;
  991 
  992     case OP_WORD_BOUNDARY:
  993     case OP_NOT_WORD_BOUNDARY:
  994     if (!skipassert) return code;
  995     /* Fall through */
  996 
  997     case OP_CALLOUT:
  998     case OP_CREF:
  999     case OP_RREF:
 1000     case OP_DEF:
 1001     code += _pcre_OP_lengths[*code];
 1002     break;
 1003 
 1004     default:
 1005     return code;
 1006     }
 1007   }
 1008 /* Control never reaches here */
 1009 }
 1010 
 1011 
 1012 
 1013 
 1014 /*************************************************
 1015 *        Find the fixed length of a pattern      *
 1016 *************************************************/
 1017 
 1018 /* Scan a pattern and compute the fixed length of subject that will match it,
 1019 if the length is fixed. This is needed for dealing with backward assertions.
 1020 In UTF8 mode, the result is in characters rather than bytes.
 1021 
 1022 Arguments:
 1023   code     points to the start of the pattern (the bracket)
 1024   options  the compiling options
 1025 
 1026 Returns:   the fixed length, or -1 if there is no fixed length,
 1027              or -2 if \C was encountered
 1028 */
 1029 
 1030 static int
 1031 find_fixedlength(uschar *code, int options)
 1032 {
 1033 int length = -1;
 1034 
 1035 register int branchlength = 0;
 1036 register uschar *cc = code + 1 + LINK_SIZE;
 1037 
 1038 /* Scan along the opcodes for this branch. If we get to the end of the
 1039 branch, check the length against that of the other branches. */
 1040 
 1041 for (;;)
 1042   {
 1043   int d;
 1044   register int op = *cc;
 1045 
 1046   switch (op)
 1047     {
 1048     case OP_CBRA:
 1049     case OP_BRA:
 1050     case OP_ONCE:
 1051     case OP_COND:
 1052     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
 1053     if (d < 0) return d;
 1054     branchlength += d;
 1055     do cc += GET(cc, 1); while (*cc == OP_ALT);
 1056     cc += 1 + LINK_SIZE;
 1057     break;
 1058 
 1059     /* Reached end of a branch; if it's a ket it is the end of a nested
 1060     call. If it's ALT it is an alternation in a nested call. If it is
 1061     END it's the end of the outer call. All can be handled by the same code. */
 1062 
 1063     case OP_ALT:
 1064     case OP_KET:
 1065     case OP_KETRMAX:
 1066     case OP_KETRMIN:
 1067     case OP_END:
 1068     if (length < 0) length = branchlength;
 1069       else if (length != branchlength) return -1;
 1070     if (*cc != OP_ALT) return length;
 1071     cc += 1 + LINK_SIZE;
 1072     branchlength = 0;
 1073     break;
 1074 
 1075     /* Skip over assertive subpatterns */
 1076 
 1077     case OP_ASSERT:
 1078     case OP_ASSERT_NOT:
 1079     case OP_ASSERTBACK:
 1080     case OP_ASSERTBACK_NOT:
 1081     do cc += GET(cc, 1); while (*cc == OP_ALT);
 1082     /* Fall through */
 1083 
 1084     /* Skip over things that don't match chars */
 1085 
 1086     case OP_REVERSE:
 1087     case OP_CREF:
 1088     case OP_RREF:
 1089     case OP_DEF:
 1090     case OP_OPT:
 1091     case OP_CALLOUT:
 1092     case OP_SOD:
 1093     case OP_SOM:
 1094     case OP_EOD:
 1095     case OP_EODN:
 1096     case OP_CIRC:
 1097     case OP_DOLL:
 1098     case OP_NOT_WORD_BOUNDARY:
 1099     case OP_WORD_BOUNDARY:
 1100     cc += _pcre_OP_lengths[*cc];
 1101     break;
 1102 
 1103     /* Handle literal characters */
 1104 
 1105     case OP_CHAR:
 1106     case OP_CHARNC:
 1107     case OP_NOT:
 1108     branchlength++;
 1109     cc += 2;
 1110 #ifdef SUPPORT_UTF8
 1111     if ((options & PCRE_UTF8) != 0)
 1112       {
 1113       while ((*cc & 0xc0) == 0x80) cc++;
 1114       }
 1115 #endif
 1116     break;
 1117 
 1118     /* Handle exact repetitions. The count is already in characters, but we
 1119     need to skip over a multibyte character in UTF8 mode.  */
 1120 
 1121     case OP_EXACT:
 1122     branchlength += GET2(cc,1);
 1123     cc += 4;
 1124 #ifdef SUPPORT_UTF8
 1125     if ((options & PCRE_UTF8) != 0)
 1126       {
 1127       while((*cc & 0x80) == 0x80) cc++;
 1128       }
 1129 #endif
 1130     break;
 1131 
 1132     case OP_TYPEEXACT:
 1133     branchlength += GET2(cc,1);
 1134     cc += 4;
 1135     break;
 1136 
 1137     /* Handle single-char matchers */
 1138 
 1139     case OP_PROP:
 1140     case OP_NOTPROP:
 1141     cc += 2;
 1142     /* Fall through */
 1143 
 1144     case OP_NOT_DIGIT:
 1145     case OP_DIGIT:
 1146     case OP_NOT_WHITESPACE:
 1147     case OP_WHITESPACE:
 1148     case OP_NOT_WORDCHAR:
 1149     case OP_WORDCHAR:
 1150     case OP_ANY:
 1151     branchlength++;
 1152     cc++;
 1153     break;
 1154 
 1155     /* The single-byte matcher isn't allowed */
 1156 
 1157     case OP_ANYBYTE:
 1158     return -2;
 1159 
 1160     /* Check a class for variable quantification */
 1161 
 1162 #ifdef SUPPORT_UTF8
 1163     case OP_XCLASS:
 1164     cc += GET(cc, 1) - 33;
 1165     /* Fall through */
 1166 #endif
 1167 
 1168     case OP_CLASS:
 1169     case OP_NCLASS:
 1170     cc += 33;
 1171 
 1172     switch (*cc)
 1173       {
 1174       case OP_CRSTAR:
 1175       case OP_CRMINSTAR:
 1176       case OP_CRQUERY:
 1177       case OP_CRMINQUERY:
 1178       return -1;
 1179 
 1180       case OP_CRRANGE:
 1181       case OP_CRMINRANGE:
 1182       if (GET2(cc,1) != GET2(cc,3)) return -1;
 1183       branchlength += GET2(cc,1);
 1184       cc += 5;
 1185       break;
 1186 
 1187       default:
 1188       branchlength++;
 1189       }
 1190     break;
 1191 
 1192     /* Anything else is variable length */
 1193 
 1194     default:
 1195     return -1;
 1196     }
 1197   }
 1198 /* Control never gets here */
 1199 }
 1200 
 1201 
 1202 
 1203 
 1204 /*************************************************
 1205 *    Scan compiled regex for numbered bracket    *
 1206 *************************************************/
 1207 
 1208 /* This little function scans through a compiled pattern until it finds a
 1209 capturing bracket with the given number.
 1210 
 1211 Arguments:
 1212   code        points to start of expression
 1213   utf8        TRUE in UTF-8 mode
 1214   number      the required bracket number
 1215 
 1216 Returns:      pointer to the opcode for the bracket, or NULL if not found
 1217 */
 1218 
 1219 static const uschar *
 1220 find_bracket(const uschar *code, BOOL utf8, int number)
 1221 {
 1222 for (;;)
 1223   {
 1224   register int c = *code;
 1225   if (c == OP_END) return NULL;
 1226 
 1227   /* XCLASS is used for classes that cannot be represented just by a bit
 1228   map. This includes negated single high-valued characters. The length in
 1229   the table is zero; the actual length is stored in the compiled code. */
 1230 
 1231   if (c == OP_XCLASS) code += GET(code, 1);
 1232 
 1233   /* Handle capturing bracket */
 1234 
 1235   else if (c == OP_CBRA)
 1236     {
 1237     int n = GET2(code, 1+LINK_SIZE);
 1238     if (n == number) return (uschar *)code;
 1239     code += _pcre_OP_lengths[c];
 1240     }
 1241 
 1242   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
 1243   a multi-byte character. The length in the table is a minimum, so we have to
 1244   arrange to skip the extra bytes. */
 1245 
 1246   else
 1247     {
 1248     code += _pcre_OP_lengths[c];
 1249     if (utf8) switch(c)
 1250       {
 1251       case OP_CHAR:
 1252       case OP_CHARNC:
 1253       case OP_EXACT:
 1254       case OP_UPTO:
 1255       case OP_MINUPTO:
 1256       case OP_POSUPTO:
 1257       case OP_STAR:
 1258       case OP_MINSTAR:
 1259       case OP_POSSTAR:
 1260       case OP_PLUS:
 1261       case OP_MINPLUS:
 1262       case OP_POSPLUS:
 1263       case OP_QUERY:
 1264       case OP_MINQUERY:
 1265       case OP_POSQUERY:
 1266       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
 1267       break;
 1268       }
 1269     }
 1270   }
 1271 }
 1272 
 1273 
 1274 
 1275 /*************************************************
 1276 *   Scan compiled regex for recursion reference  *
 1277 *************************************************/
 1278 
 1279 /* This little function scans through a compiled pattern until it finds an
 1280 instance of OP_RECURSE.
 1281 
 1282 Arguments:
 1283   code        points to start of expression
 1284   utf8        TRUE in UTF-8 mode
 1285 
 1286 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
 1287 */
 1288 
 1289 static const uschar *
 1290 find_recurse(const uschar *code, BOOL utf8)
 1291 {
 1292 for (;;)
 1293   {
 1294   register int c = *code;
 1295   if (c == OP_END) return NULL;
 1296   if (c == OP_RECURSE) return code;
 1297 
 1298   /* XCLASS is used for classes that cannot be represented just by a bit
 1299   map. This includes negated single high-valued characters. The length in
 1300   the table is zero; the actual length is stored in the compiled code. */
 1301 
 1302   if (c == OP_XCLASS) code += GET(code, 1);
 1303 
 1304   /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
 1305   that are followed by a character may be followed by a multi-byte character.
 1306   The length in the table is a minimum, so we have to arrange to skip the extra
 1307   bytes. */
 1308 
 1309   else
 1310     {
 1311     code += _pcre_OP_lengths[c];
 1312     if (utf8) switch(c)
 1313       {
 1314       case OP_CHAR:
 1315       case OP_CHARNC:
 1316       case OP_EXACT:
 1317       case OP_UPTO:
 1318       case OP_MINUPTO:
 1319       case OP_POSUPTO:
 1320       case OP_STAR:
 1321       case OP_MINSTAR:
 1322       case OP_POSSTAR:
 1323       case OP_PLUS:
 1324       case OP_MINPLUS:
 1325       case OP_POSPLUS:
 1326       case OP_QUERY:
 1327       case OP_MINQUERY:
 1328       case OP_POSQUERY:
 1329       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
 1330       break;
 1331       }
 1332     }
 1333   }
 1334 }
 1335 
 1336 
 1337 
 1338 /*************************************************
 1339 *    Scan compiled branch for non-emptiness      *
 1340 *************************************************/
 1341 
 1342 /* This function scans through a branch of a compiled pattern to see whether it
 1343 can match the empty string or not. It is called from could_be_empty()
 1344 below and from compile_branch() when checking for an unlimited repeat of a
 1345 group that can match nothing. Note that first_significant_code() skips over
 1346 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
 1347 struck an inner bracket whose current branch will already have been scanned.
 1348 
 1349 Arguments:
 1350   code        points to start of search
 1351   endcode     points to where to stop
 1352   utf8        TRUE if in UTF8 mode
 1353 
 1354 Returns:      TRUE if what is matched could be empty
 1355 */
 1356 
 1357 static BOOL
 1358 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
 1359 {
 1360 register int c;
 1361 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
 1362      code < endcode;
 1363      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
 1364   {
 1365   const uschar *ccode;
 1366 
 1367   c = *code;
 1368 
 1369   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
 1370     {
 1371     BOOL empty_branch;
 1372     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
 1373 
 1374     /* Scan a closed bracket */
 1375 
 1376     empty_branch = FALSE;
 1377     do
 1378       {
 1379       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
 1380         empty_branch = TRUE;
 1381       code += GET(code, 1);
 1382       }
 1383     while (*code == OP_ALT);
 1384     if (!empty_branch) return FALSE;   /* All branches are non-empty */
 1385 
 1386     /* Move past the KET and fudge things so that the increment in the "for"
 1387     above has no effect. */
 1388 
 1389     c = OP_END;
 1390     code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
 1391     continue;
 1392     }
 1393 
 1394   /* Handle the other opcodes */
 1395 
 1396   switch (c)
 1397     {
 1398     /* Check for quantifiers after a class */
 1399 
 1400 #ifdef SUPPORT_UTF8
 1401     case OP_XCLASS:
 1402     ccode = code + GET(code, 1);
 1403     goto CHECK_CLASS_REPEAT;
 1404 #endif
 1405 
 1406     case OP_CLASS:
 1407     case OP_NCLASS:
 1408     ccode = code + 33;
 1409 
 1410 #ifdef SUPPORT_UTF8
 1411     CHECK_CLASS_REPEAT:
 1412 #endif
 1413 
 1414     switch (*ccode)
 1415       {
 1416       case OP_CRSTAR:            /* These could be empty; continue */
 1417       case OP_CRMINSTAR:
 1418       case OP_CRQUERY:
 1419       case OP_CRMINQUERY:
 1420       break;
 1421 
 1422       default:                   /* Non-repeat => class must match */
 1423       case OP_CRPLUS:            /* These repeats aren't empty */
 1424       case OP_CRMINPLUS:
 1425       return FALSE;
 1426 
 1427       case OP_CRRANGE:
 1428       case OP_CRMINRANGE:
 1429       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
 1430       break;
 1431       }
 1432     break;
 1433 
 1434     /* Opcodes that must match a character */
 1435 
 1436     case OP_PROP:
 1437     case OP_NOTPROP:
 1438     case OP_EXTUNI:
 1439     case OP_NOT_DIGIT:
 1440     case OP_DIGIT:
 1441     case OP_NOT_WHITESPACE:
 1442     case OP_WHITESPACE:
 1443     case OP_NOT_WORDCHAR:
 1444     case OP_WORDCHAR:
 1445     case OP_ANY:
 1446     case OP_ANYBYTE:
 1447     case OP_CHAR:
 1448     case OP_CHARNC:
 1449     case OP_NOT:
 1450     case OP_PLUS:
 1451     case OP_MINPLUS:
 1452     case OP_POSPLUS:
 1453     case OP_EXACT:
 1454     case OP_NOTPLUS:
 1455     case OP_NOTMINPLUS:
 1456     case OP_NOTPOSPLUS:
 1457     case OP_NOTEXACT:
 1458     case OP_TYPEPLUS:
 1459     case OP_TYPEMINPLUS:
 1460     case OP_TYPEPOSPLUS:
 1461     case OP_TYPEEXACT:
 1462     return FALSE;
 1463 
 1464     /* End of branch */
 1465 
 1466     case OP_KET:
 1467     case OP_KETRMAX:
 1468     case OP_KETRMIN:
 1469     case OP_ALT:
 1470     return TRUE;
 1471 
 1472     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
 1473     MINUPTO, and POSUPTO may be followed by a multibyte character */
 1474 
 1475 #ifdef SUPPORT_UTF8
 1476     case OP_STAR:
 1477     case OP_MINSTAR:
 1478     case OP_POSSTAR:
 1479     case OP_QUERY:
 1480     case OP_MINQUERY:
 1481     case OP_POSQUERY:
 1482     case OP_UPTO:
 1483     case OP_MINUPTO:
 1484     case OP_POSUPTO:
 1485     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
 1486     break;
 1487 #endif
 1488     }
 1489   }
 1490 
 1491 return TRUE;
 1492 }
 1493 
 1494 
 1495 
 1496 /*************************************************
 1497 *    Scan compiled regex for non-emptiness       *
 1498 *************************************************/
 1499 
 1500 /* This function is called to check for left recursive calls. We want to check
 1501 the current branch of the current pattern to see if it could match the empty
 1502 string. If it could, we must look outwards for branches at other levels,
 1503 stopping when we pass beyond the bracket which is the subject of the recursion.
 1504 
 1505 Arguments:
 1506   code        points to start of the recursion
 1507   endcode     points to where to stop (current RECURSE item)
 1508   bcptr       points to the chain of current (unclosed) branch starts
 1509   utf8        TRUE if in UTF-8 mode
 1510 
 1511 Returns:      TRUE if what is matched could be empty
 1512 */
 1513 
 1514 static BOOL
 1515 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
 1516   BOOL utf8)
 1517 {
 1518 while (bcptr != NULL && bcptr->current >= code)
 1519   {
 1520   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
 1521   bcptr = bcptr->outer;
 1522   }
 1523 return TRUE;
 1524 }
 1525 
 1526 
 1527 
 1528 /*************************************************
 1529 *           Check for POSIX class syntax         *
 1530 *************************************************/
 1531 
 1532 /* This function is called when the sequence "[:" or "[." or "[=" is
 1533 encountered in a character class. It checks whether this is followed by an
 1534 optional ^ and then a sequence of letters, terminated by a matching ":]" or
 1535 ".]" or "=]".
 1536 
 1537 Argument:
 1538   ptr      pointer to the initial [
 1539   endptr   where to return the end pointer
 1540   cd       pointer to compile data
 1541 
 1542 Returns:   TRUE or FALSE
 1543 */
 1544 
 1545 static BOOL
 1546 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
 1547 {
 1548 int terminator;          /* Don't combine these lines; the Solaris cc */
 1549 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
 1550 if (*(++ptr) == '^') ptr++;
 1551 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
 1552 if (*ptr == terminator && ptr[1] == ']')
 1553   {
 1554   *endptr = ptr;
 1555   return TRUE;
 1556   }
 1557 return FALSE;
 1558 }
 1559 
 1560 
 1561 
 1562 
 1563 /*************************************************
 1564 *          Check POSIX class name                *
 1565 *************************************************/
 1566 
 1567 /* This function is called to check the name given in a POSIX-style class entry
 1568 such as [:alnum:].
 1569 
 1570 Arguments:
 1571   ptr        points to the first letter
 1572   len        the length of the name
 1573 
 1574 Returns:     a value representing the name, or -1 if unknown
 1575 */
 1576 
 1577 static int
 1578 check_posix_name(const uschar *ptr, int len)
 1579 {
 1580 register int yield = 0;
 1581 while (posix_name_lengths[yield] != 0)
 1582   {
 1583   if (len == posix_name_lengths[yield] &&
 1584     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
 1585   yield++;
 1586   }
 1587 return -1;
 1588 }
 1589 
 1590 
 1591 /*************************************************
 1592 *    Adjust OP_RECURSE items in repeated group   *
 1593 *************************************************/
 1594 
 1595 /* OP_RECURSE items contain an offset from the start of the regex to the group
 1596 that is referenced. This means that groups can be replicated for fixed
 1597 repetition simply by copying (because the recursion is allowed to refer to
 1598 earlier groups that are outside the current group). However, when a group is
 1599 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
 1600 it, after it has been compiled. This means that any OP_RECURSE items within it
 1601 that refer to the group itself or any contained groups have to have their
 1602 offsets adjusted. That is one of the jobs of this function. Before it is called,
 1603 the partially compiled regex must be temporarily terminated with OP_END.
 1604 
 1605 This function has been extended with the possibility of forward references for
 1606 recursions and subroutine calls. It must also check the list of such references
 1607 for the group we are dealing with. If it finds that one of the recursions in
 1608 the current group is on this list, it adjusts the offset in the list, not the
 1609 value in the reference (which is a group number).
 1610 
 1611 Arguments:
 1612   group      points to the start of the group
 1613   adjust     the amount by which the group is to be moved
 1614   utf8       TRUE in UTF-8 mode
 1615   cd         contains pointers to tables etc.
 1616   save_hwm   the hwm forward reference pointer at the start of the group
 1617 
 1618 Returns:     nothing
 1619 */
 1620 
 1621 static void
 1622 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
 1623   uschar *save_hwm)
 1624 {
 1625 uschar *ptr = group;
 1626 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
 1627   {
 1628   int offset;
 1629   uschar *hc;
 1630 
 1631   /* See if this recursion is on the forward reference list. If so, adjust the
 1632   reference. */
 1633 
 1634   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
 1635     {
 1636     offset = GET(hc, 0);
 1637     if (cd->start_code + offset == ptr + 1)
 1638       {
 1639       PUT(hc, 0, offset + adjust);
 1640       break;
 1641       }
 1642     }
 1643 
 1644   /* Otherwise, adjust the recursion offset if it's after the start of this
 1645   group. */
 1646 
 1647   if (hc >= cd->hwm)
 1648     {
 1649     offset = GET(ptr, 1);
 1650     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
 1651     }
 1652 
 1653   ptr += 1 + LINK_SIZE;
 1654   }
 1655 }
 1656 
 1657 
 1658 
 1659 /*************************************************
 1660 *        Insert an automatic callout point       *
 1661 *************************************************/
 1662 
 1663 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
 1664 callout points before each pattern item.
 1665 
 1666 Arguments:
 1667   code           current code pointer
 1668   ptr            current pattern pointer
 1669   cd             pointers to tables etc
 1670 
 1671 Returns:         new code pointer
 1672 */
 1673 
 1674 static uschar *
 1675 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
 1676 {
 1677 *code++ = OP_CALLOUT;
 1678 *code++ = 255;
 1679 PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
 1680 PUT(code, LINK_SIZE, 0);                /* Default length */
 1681 return code + 2*LINK_SIZE;
 1682 }
 1683 
 1684 
 1685 
 1686 /*************************************************
 1687 *         Complete a callout item                *
 1688 *************************************************/
 1689 
 1690 /* A callout item contains the length of the next item in the pattern, which
 1691 we can't fill in till after we have reached the relevant point. This is used
 1692 for both automatic and manual callouts.
 1693 
 1694 Arguments:
 1695   previous_callout   points to previous callout item
 1696   ptr                current pattern pointer
 1697   cd                 pointers to tables etc
 1698 
 1699 Returns:             nothing
 1700 */
 1701 
 1702 static void
 1703 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
 1704 {
 1705 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
 1706 PUT(previous_callout, 2 + LINK_SIZE, length);
 1707 }
 1708 
 1709 
 1710 
 1711 #ifdef SUPPORT_UCP
 1712 /*************************************************
 1713 *           Get othercase range                  *
 1714 *************************************************/
 1715 
 1716 /* This function is passed the start and end of a class range, in UTF-8 mode
 1717 with UCP support. It searches up the characters, looking for internal ranges of
 1718 characters in the "other" case. Each call returns the next one, updating the
 1719 start address.
 1720 
 1721 Arguments:
 1722   cptr        points to starting character value; updated
 1723   d           end value
 1724   ocptr       where to put start of othercase range
 1725   odptr       where to put end of othercase range
 1726 
 1727 Yield:        TRUE when range returned; FALSE when no more
 1728 */
 1729 
 1730 static BOOL
 1731 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
 1732   unsigned int *odptr)
 1733 {
 1734 unsigned int c, othercase, next;
 1735 
 1736 for (c = *cptr; c <= d; c++)
 1737   { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
 1738 
 1739 if (c > d) return FALSE;
 1740 
 1741 *ocptr = othercase;
 1742 next = othercase + 1;
 1743 
 1744 for (++c; c <= d; c++)
 1745   {
 1746   if (_pcre_ucp_othercase(c) != next) break;
 1747   next++;
 1748   }
 1749 
 1750 *odptr = next - 1;
 1751 *cptr = c;
 1752 
 1753 return TRUE;
 1754 }
 1755 #endif  /* SUPPORT_UCP */
 1756 
 1757 
 1758 
 1759 /*************************************************
 1760 *     Check if auto-possessifying is possible    *
 1761 *************************************************/
 1762 
 1763 /* This function is called for unlimited repeats of certain items, to see
 1764 whether the next thing could possibly match the repeated item. If not, it makes
 1765 sense to automatically possessify the repeated item.
 1766 
 1767 Arguments:
 1768   op_code       the repeated op code
 1769   this          data for this item, depends on the opcode
 1770   utf8          TRUE in UTF-8 mode
 1771   utf8_char     used for utf8 character bytes, NULL if not relevant
 1772   ptr           next character in pattern
 1773   options       options bits
 1774   cd            contains pointers to tables etc.
 1775 
 1776 Returns:        TRUE if possessifying is wanted
 1777 */
 1778 
 1779 static BOOL
 1780 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
 1781   const uschar *ptr, int options, compile_data *cd)
 1782 {
 1783 int next;
 1784 
 1785 /* Skip whitespace and comments in extended mode */
 1786 
 1787 if ((options & PCRE_EXTENDED) != 0)
 1788   {
 1789   for (;;)
 1790     {
 1791     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
 1792     if (*ptr == '#')
 1793       {
 1794       while (*(++ptr) != 0)
 1795         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
 1796       }
 1797     else break;
 1798     }
 1799   }
 1800 
 1801 /* If the next item is one that we can handle, get its value. A non-negative
 1802 value is a character, a negative value is an escape value. */
 1803 
 1804 if (*ptr == '\\')
 1805   {
 1806   int temperrorcode = 0;
 1807   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
 1808   if (temperrorcode != 0) return FALSE;
 1809   ptr++;    /* Point after the escape sequence */
 1810   }
 1811 
 1812 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
 1813   {
 1814 #ifdef SUPPORT_UTF8
 1815   if (utf8) { GETCHARINC(next, ptr); } else
 1816 #endif
 1817   next = *ptr++;
 1818   }
 1819 
 1820 else return FALSE;
 1821 
 1822 /* Skip whitespace and comments in extended mode */
 1823 
 1824 if ((options & PCRE_EXTENDED) != 0)
 1825   {
 1826   for (;;)
 1827     {
 1828     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
 1829     if (*ptr == '#')
 1830       {
 1831       while (*(++ptr) != 0)
 1832         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
 1833       }
 1834     else break;
 1835     }
 1836   }
 1837 
 1838 /* If the next thing is itself optional, we have to give up. */
 1839 
 1840 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
 1841   return FALSE;
 1842 
 1843 /* Now compare the next item with the previous opcode. If the previous is a
 1844 positive single character match, "item" either contains the character or, if
 1845 "item" is greater than 127 in utf8 mode, the character's bytes are in
 1846 utf8_char. */
 1847 
 1848 
 1849 /* Handle cases when the next item is a character. */
 1850 
 1851 if (next >= 0) switch(op_code)
 1852   {
 1853   case OP_CHAR:
 1854 #ifdef SUPPORT_UTF8
 1855   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
 1856 #endif
 1857   return item != next;
 1858 
 1859   /* For CHARNC (caseless character) we must check the other case. If we have
 1860   Unicode property support, we can use it to test the other case of
 1861   high-valued characters. */
 1862 
 1863   case OP_CHARNC:
 1864 #ifdef SUPPORT_UTF8
 1865   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
 1866 #endif
 1867   if (item == next) return FALSE;
 1868 #ifdef SUPPORT_UTF8
 1869   if (utf8)
 1870     {
 1871     unsigned int othercase;
 1872     if (next < 128) othercase = cd->fcc[next]; else
 1873 #ifdef SUPPORT_UCP
 1874     othercase = _pcre_ucp_othercase((unsigned int)next);
 1875 #else
 1876     othercase = NOTACHAR;
 1877 #endif
 1878     return (unsigned int)item != othercase;
 1879     }
 1880   else
 1881 #endif  /* SUPPORT_UTF8 */
 1882   return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
 1883 
 1884   /* For OP_NOT, "item" must be a single-byte character. */
 1885 
 1886   case OP_NOT:
 1887   if (next < 0) return FALSE;  /* Not a character */
 1888   if (item == next) return TRUE;
 1889   if ((options & PCRE_CASELESS) == 0) return FALSE;
 1890 #ifdef SUPPORT_UTF8
 1891   if (utf8)
 1892     {
 1893     unsigned int othercase;
 1894     if (next < 128) othercase = cd->fcc[next]; else
 1895 #ifdef SUPPORT_UCP
 1896     othercase = _pcre_ucp_othercase(next);
 1897 #else
 1898     othercase = NOTACHAR;
 1899 #endif
 1900     return (unsigned int)item == othercase;
 1901     }
 1902   else
 1903 #endif  /* SUPPORT_UTF8 */
 1904   return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
 1905 
 1906   case OP_DIGIT:
 1907   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
 1908 
 1909   case OP_NOT_DIGIT:
 1910   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
 1911 
 1912   case OP_WHITESPACE:
 1913   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
 1914 
 1915   case OP_NOT_WHITESPACE:
 1916   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
 1917 
 1918   case OP_WORDCHAR:
 1919   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
 1920 
 1921   case OP_NOT_WORDCHAR:
 1922   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
 1923 
 1924   default:
 1925   return FALSE;
 1926   }
 1927 
 1928 
 1929 /* Handle the case when the next item is \d, \s, etc. */
 1930 
 1931 switch(op_code)
 1932   {
 1933   case OP_CHAR:
 1934   case OP_CHARNC:
 1935 #ifdef SUPPORT_UTF8
 1936   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
 1937 #endif
 1938   switch(-next)
 1939     {
 1940     case ESC_d:
 1941     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
 1942 
 1943     case ESC_D:
 1944     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
 1945 
 1946     case ESC_s:
 1947     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
 1948 
 1949     case ESC_S:
 1950     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
 1951 
 1952     case ESC_w:
 1953     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
 1954 
 1955     case ESC_W:
 1956     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
 1957 
 1958     default:
 1959     return FALSE;
 1960     }
 1961 
 1962   case OP_DIGIT:
 1963   return next == -ESC_D || next == -ESC_s || next == -ESC_W;
 1964 
 1965   case OP_NOT_DIGIT:
 1966   return next == -ESC_d;
 1967 
 1968   case OP_WHITESPACE:
 1969   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
 1970 
 1971   case OP_NOT_WHITESPACE:
 1972   return next == -ESC_s;
 1973 
 1974   case OP_WORDCHAR:
 1975   return next == -ESC_W || next == -ESC_s;
 1976 
 1977   case OP_NOT_WORDCHAR:
 1978   return next == -ESC_w || next == -ESC_d;
 1979 
 1980   default:
 1981   return FALSE;
 1982   }
 1983 
 1984 /* Control does not reach here */
 1985 }
 1986 
 1987 
 1988 
 1989 /*************************************************
 1990 *           Compile one branch                   *
 1991 *************************************************/
 1992 
 1993 /* Scan the pattern, compiling it into a vector. If the options are
 1994 changed during the branch, the pointer is used to change the external options
 1995 bits. This function is used during the pre-compile phase when we are trying
 1996 to find out the amount of memory needed, as well as during the real compile
 1997 phase. The value of lengthptr distinguishes the two phases.
 1998 
 1999 Arguments:
 2000   optionsptr     pointer to the option bits
 2001   codeptr        points to the pointer to the current code point
 2002   ptrptr         points to the current pattern pointer
 2003   errorcodeptr   points to error code variable
 2004   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
 2005   reqbyteptr     set to the last literal character required, else < 0
 2006   bcptr          points to current branch chain
 2007   cd             contains pointers to tables etc.
 2008   lengthptr      NULL during the real compile phase
 2009                  points to length accumulator during pre-compile phase
 2010 
 2011 Returns:         TRUE on success
 2012                  FALSE, with *errorcodeptr set non-zero on error
 2013 */
 2014 
 2015 static BOOL
 2016 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
 2017   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
 2018   compile_data *cd, int *lengthptr)
 2019 {
 2020 int repeat_type, op_type;
 2021 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
 2022 int bravalue = 0;
 2023 int greedy_default, greedy_non_default;
 2024 int firstbyte, reqbyte;
 2025 int zeroreqbyte, zerofirstbyte;
 2026 int req_caseopt, reqvary, tempreqvary;
 2027 int options = *optionsptr;
 2028 int after_manual_callout = 0;
 2029 int length_prevgroup = 0;
 2030 register int c;
 2031 register uschar *code = *codeptr;
 2032 uschar *last_code = code;
 2033 uschar *orig_code = code;
 2034 uschar *tempcode;
 2035 BOOL inescq = FALSE;
 2036 BOOL groupsetfirstbyte = FALSE;
 2037 const uschar *ptr = *ptrptr;
 2038 const uschar *tempptr;
 2039 uschar *previous = NULL;
 2040 uschar *previous_callout = NULL;
 2041 uschar *save_hwm = NULL;
 2042 uschar classbits[32];
 2043 
 2044 #ifdef SUPPORT_UTF8
 2045 BOOL class_utf8;
 2046 BOOL utf8 = (options & PCRE_UTF8) != 0;
 2047 uschar *class_utf8data;
 2048 uschar utf8_char[6];
 2049 #else
 2050 BOOL utf8 = FALSE;
 2051 uschar *utf8_char = NULL;
 2052 #endif
 2053 
 2054 #ifdef DEBUG
 2055 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
 2056 #endif
 2057 
 2058 /* Set up the default and non-default settings for greediness */
 2059 
 2060 greedy_default = ((options & PCRE_UNGREEDY) != 0);
 2061 greedy_non_default = greedy_default ^ 1;
 2062 
 2063 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
 2064 matching encountered yet". It gets changed to REQ_NONE if we hit something that
 2065 matches a non-fixed char first char; reqbyte just remains unset if we never
 2066 find one.
 2067 
 2068 When we hit a repeat whose minimum is zero, we may have to adjust these values
 2069 to take the zero repeat into account. This is implemented by setting them to
 2070 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
 2071 item types that can be repeated set these backoff variables appropriately. */
 2072 
 2073 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
 2074 
 2075 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
 2076 according to the current setting of the caseless flag. REQ_CASELESS is a bit
 2077 value > 255. It is added into the firstbyte or reqbyte variables to record the
 2078 case status of the value. This is used only for ASCII characters. */
 2079 
 2080 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
 2081 
 2082 /* Switch on next character until the end of the branch */
 2083 
 2084 for (;; ptr++)
 2085   {
 2086   BOOL negate_class;
 2087   BOOL possessive_quantifier;
 2088   BOOL is_quantifier;
 2089   BOOL is_recurse;
 2090   int class_charcount;
 2091   int class_lastchar;
 2092   int newoptions;
 2093   int recno;
 2094   int skipbytes;
 2095   int subreqbyte;
 2096   int subfirstbyte;
 2097   int terminator;
 2098   int mclength;
 2099   uschar mcbuffer[8];
 2100 
 2101   /* Get next byte in the pattern */
 2102 
 2103   c = *ptr;
 2104 
 2105   /* If we are in the pre-compile phase, accumulate the length used for the
 2106   previous cycle of this loop. */
 2107 
 2108   if (lengthptr != NULL)
 2109     {
 2110 #ifdef DEBUG
 2111     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
 2112 #endif
 2113     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
 2114       {
 2115       *errorcodeptr = ERR52;
 2116       goto FAILED;
 2117       }
 2118 
 2119     /* There is at least one situation where code goes backwards: this is the
 2120     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
 2121     the class is simply eliminated. However, it is created first, so we have to
 2122     allow memory for it. Therefore, don't ever reduce the length at this point.
 2123     */
 2124 
 2125     if (code < last_code) code = last_code;
 2126     *lengthptr += code - last_code;
 2127     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
 2128 
 2129     /* If "previous" is set and it is not at the start of the work space, move
 2130     it back to there, in order to avoid filling up the work space. Otherwise,
 2131     if "previous" is NULL, reset the current code pointer to the start. */
 2132 
 2133     if (previous != NULL)
 2134       {
 2135       if (previous > orig_code)
 2136         {
 2137         memmove(orig_code, previous, code - previous);
 2138         code -= previous - orig_code;
 2139         previous = orig_code;
 2140         }
 2141       }
 2142     else code = orig_code;
 2143 
 2144     /* Remember where this code item starts so we can pick up the length
 2145     next time round. */
 2146 
 2147     last_code = code;
 2148     }
 2149 
 2150   /* In the real compile phase, just check the workspace used by the forward
 2151   reference list. */
 2152 
 2153   else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
 2154     {
 2155     *errorcodeptr = ERR52;
 2156     goto FAILED;
 2157     }
 2158 
 2159   /* If in \Q...\E, check for the end; if not, we have a literal */
 2160 
 2161   if (inescq && c != 0)
 2162     {
 2163     if (c == '\\' && ptr[1] == 'E')
 2164       {
 2165       inescq = FALSE;
 2166       ptr++;
 2167       continue;
 2168       }
 2169     else
 2170       {
 2171       if (previous_callout != NULL)
 2172         {
 2173         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
 2174           complete_callout(previous_callout, ptr, cd);
 2175         previous_callout = NULL;
 2176         }
 2177       if ((options & PCRE_AUTO_CALLOUT) != 0)
 2178         {
 2179         previous_callout = code;
 2180         code = auto_callout(code, ptr, cd);
 2181         }
 2182       goto NORMAL_CHAR;
 2183       }
 2184     }
 2185 
 2186   /* Fill in length of a previous callout, except when the next thing is
 2187   a quantifier. */
 2188 
 2189   is_quantifier = c == '*' || c == '+' || c == '?' ||
 2190     (c == '{' && is_counted_repeat(ptr+1));
 2191 
 2192   if (!is_quantifier && previous_callout != NULL &&
 2193        after_manual_callout-- <= 0)
 2194     {
 2195     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
 2196       complete_callout(previous_callout, ptr, cd);
 2197     previous_callout = NULL;
 2198     }
 2199 
 2200   /* In extended mode, skip white space and comments */
 2201 
 2202   if ((options & PCRE_EXTENDED) != 0)
 2203     {
 2204     if ((cd->ctypes[c] & ctype_space) != 0) continue;
 2205     if (c == '#')
 2206       {
 2207       while (*(++ptr) != 0)
 2208         {
 2209         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
 2210         }
 2211       if (*ptr != 0) continue;
 2212 
 2213       /* Else fall through to handle end of string */
 2214       c = 0;
 2215       }
 2216     }
 2217 
 2218   /* No auto callout for quantifiers. */
 2219 
 2220   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
 2221     {
 2222     previous_callout = code;
 2223     code = auto_callout(code, ptr, cd);
 2224     }
 2225 
 2226   switch(c)
 2227     {
 2228     /* ===================================================================*/
 2229     case 0:                        /* The branch terminates at string end */
 2230     case '|':                      /* or | or ) */
 2231     case ')':
 2232     *firstbyteptr = firstbyte;
 2233     *reqbyteptr = reqbyte;
 2234     *codeptr = code;
 2235     *ptrptr = ptr;
 2236     if (lengthptr != NULL)
 2237       {
 2238       *lengthptr += code - last_code;   /* To include callout length */
 2239       DPRINTF((">> end branch\n"));
 2240       }
 2241     return TRUE;
 2242 
 2243 
 2244     /* ===================================================================*/
 2245     /* Handle single-character metacharacters. In multiline mode, ^ disables
 2246     the setting of any following char as a first character. */
 2247 
 2248     case '^':
 2249     if ((options & PCRE_MULTILINE) != 0)
 2250       {
 2251       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 2252       }
 2253     previous = NULL;
 2254     *code++ = OP_CIRC;
 2255     break;
 2256 
 2257     case '$':
 2258     previous = NULL;
 2259     *code++ = OP_DOLL;
 2260     break;
 2261 
 2262     /* There can never be a first char if '.' is first, whatever happens about
 2263     repeats. The value of reqbyte doesn't change either. */
 2264 
 2265     case '.':
 2266     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 2267     zerofirstbyte = firstbyte;
 2268     zeroreqbyte = reqbyte;
 2269     previous = code;
 2270     *code++ = OP_ANY;
 2271     break;
 2272 
 2273 
 2274     /* ===================================================================*/
 2275     /* Character classes. If the included characters are all < 256, we build a
 2276     32-byte bitmap of the permitted characters, except in the special case
 2277     where there is only one such character. For negated classes, we build the
 2278     map as usual, then invert it at the end. However, we use a different opcode
 2279     so that data characters > 255 can be handled correctly.
 2280 
 2281     If the class contains characters outside the 0-255 range, a different
 2282     opcode is compiled. It may optionally have a bit map for characters < 256,
 2283     but those above are explicitly listed afterwards. A flag byte tells
 2284     whether the bitmap is present, and whether this is a negated class or not.
 2285     */
 2286 
 2287     case '[':
 2288     previous = code;
 2289 
 2290     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
 2291     they are encountered at the top level, so we'll do that too. */
 2292 
 2293     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
 2294         check_posix_syntax(ptr, &tempptr, cd))
 2295       {
 2296       *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
 2297       goto FAILED;
 2298       }
 2299 
 2300     /* If the first character is '^', set the negation flag and skip it. */
 2301 
 2302     if ((c = *(++ptr)) == '^')
 2303       {
 2304       negate_class = TRUE;
 2305       c = *(++ptr);
 2306       }
 2307     else
 2308       {
 2309       negate_class = FALSE;
 2310       }
 2311 
 2312     /* Keep a count of chars with values < 256 so that we can optimize the case
 2313     of just a single character (as long as it's < 256). However, for higher
 2314     valued UTF-8 characters, we don't yet do any optimization. */
 2315 
 2316     class_charcount = 0;
 2317     class_lastchar = -1;
 2318 
 2319     /* Initialize the 32-char bit map to all zeros. We build the map in a
 2320     temporary bit of memory, in case the class contains only 1 character (less
 2321     than 256), because in that case the compiled code doesn't use the bit map.
 2322     */
 2323 
 2324     memset(classbits, 0, 32 * sizeof(uschar));
 2325 
 2326 #ifdef SUPPORT_UTF8
 2327     class_utf8 = FALSE;                       /* No chars >= 256 */
 2328     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
 2329 #endif
 2330 
 2331     /* Process characters until ] is reached. By writing this as a "do" it
 2332     means that an initial ] is taken as a data character. At the start of the
 2333     loop, c contains the first byte of the character. */
 2334 
 2335     if (c != 0) do
 2336       {
 2337       const uschar *oldptr;
 2338 
 2339 #ifdef SUPPORT_UTF8
 2340       if (utf8 && c > 127)
 2341         {                           /* Braces are required because the */
 2342         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
 2343         }
 2344 #endif
 2345 
 2346       /* Inside \Q...\E everything is literal except \E */
 2347 
 2348       if (inescq)
 2349         {
 2350         if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
 2351           {
 2352           inescq = FALSE;                   /* Reset literal state */
 2353           ptr++;                            /* Skip the 'E' */
 2354           continue;                         /* Carry on with next */
 2355           }
 2356         goto CHECK_RANGE;                   /* Could be range if \E follows */
 2357         }
 2358 
 2359       /* Handle POSIX class names. Perl allows a negation extension of the
 2360       form [:^name:]. A square bracket that doesn't match the syntax is
 2361       treated as a literal. We also recognize the POSIX constructions
 2362       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
 2363       5.6 and 5.8 do. */
 2364 
 2365       if (c == '[' &&
 2366           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
 2367           check_posix_syntax(ptr, &tempptr, cd))
 2368         {
 2369         BOOL local_negate = FALSE;
 2370         int posix_class, taboffset, tabopt;
 2371         register const uschar *cbits = cd->cbits;
 2372         uschar pbits[32];
 2373 
 2374         if (ptr[1] != ':')
 2375           {
 2376           *errorcodeptr = ERR31;
 2377           goto FAILED;
 2378           }
 2379 
 2380         ptr += 2;
 2381         if (*ptr == '^')
 2382           {
 2383           local_negate = TRUE;
 2384           ptr++;
 2385           }
 2386 
 2387         posix_class = check_posix_name(ptr, tempptr - ptr);
 2388         if (posix_class < 0)
 2389           {
 2390           *errorcodeptr = ERR30;
 2391           goto FAILED;
 2392           }
 2393 
 2394         /* If matching is caseless, upper and lower are converted to
 2395         alpha. This relies on the fact that the class table starts with
 2396         alpha, lower, upper as the first 3 entries. */
 2397 
 2398         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
 2399           posix_class = 0;
 2400 
 2401         /* We build the bit map for the POSIX class in a chunk of local store
 2402         because we may be adding and subtracting from it, and we don't want to
 2403         subtract bits that may be in the main map already. At the end we or the
 2404         result into the bit map that is being built. */
 2405 
 2406         posix_class *= 3;
 2407 
 2408         /* Copy in the first table (always present) */
 2409 
 2410         memcpy(pbits, cbits + posix_class_maps[posix_class],
 2411           32 * sizeof(uschar));
 2412 
 2413         /* If there is a second table, add or remove it as required. */
 2414 
 2415         taboffset = posix_class_maps[posix_class + 1];
 2416         tabopt = posix_class_maps[posix_class + 2];
 2417 
 2418         if (taboffset >= 0)
 2419           {
 2420           if (tabopt >= 0)
 2421             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
 2422           else
 2423             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
 2424           }
 2425 
 2426         /* Now see if we need to remove any special characters. An option
 2427         value of 1 removes vertical space and 2 removes underscore. */
 2428 
 2429         if (tabopt < 0) tabopt = -tabopt;
 2430         if (tabopt == 1) pbits[1] &= ~0x3c;
 2431           else if (tabopt == 2) pbits[11] &= 0x7f;
 2432 
 2433         /* Add the POSIX table or its complement into the main table that is
 2434         being built and we are done. */
 2435 
 2436         if (local_negate)
 2437           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
 2438         else
 2439           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
 2440 
 2441         ptr = tempptr + 1;
 2442         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
 2443         continue;    /* End of POSIX syntax handling */
 2444         }
 2445 
 2446       /* Backslash may introduce a single character, or it may introduce one
 2447       of the specials, which just set a flag. The sequence \b is a special
 2448       case. Inside a class (and only there) it is treated as backspace.
 2449       Elsewhere it marks a word boundary. Other escapes have preset maps ready
 2450       to or into the one we are building. We assume they have more than one
 2451       character in them, so set class_charcount bigger than one. */
 2452 
 2453       if (c == '\\')
 2454         {
 2455         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
 2456         if (*errorcodeptr != 0) goto FAILED;
 2457 
 2458         if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */
 2459         else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
 2460         else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
 2461         else if (-c == ESC_Q)            /* Handle start of quoted string */
 2462           {
 2463           if (ptr[1] == '\\' && ptr[2] == 'E')
 2464             {
 2465             ptr += 2; /* avoid empty string */
 2466             }
 2467           else inescq = TRUE;
 2468           continue;
 2469           }
 2470 
 2471         if (c < 0)
 2472           {
 2473           register const uschar *cbits = cd->cbits;
 2474           class_charcount += 2;     /* Greater than 1 is what matters */
 2475 
 2476           /* Save time by not doing this in the pre-compile phase. */
 2477 
 2478           if (lengthptr == NULL) switch (-c)
 2479             {
 2480             case ESC_d:
 2481             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
 2482             continue;
 2483 
 2484             case ESC_D:
 2485             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
 2486             continue;
 2487 
 2488             case ESC_w:
 2489             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
 2490             continue;
 2491 
 2492             case ESC_W:
 2493             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
 2494             continue;
 2495 
 2496             case ESC_s:
 2497             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
 2498             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
 2499             continue;
 2500 
 2501             case ESC_S:
 2502             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
 2503             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
 2504             continue;
 2505 
 2506             case ESC_E: /* Perl ignores an orphan \E */
 2507             continue;
 2508 
 2509             default:    /* Not recognized; fall through */
 2510             break;      /* Need "default" setting to stop compiler warning. */
 2511             }
 2512 
 2513           /* In the pre-compile phase, just do the recognition. */
 2514 
 2515           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
 2516                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
 2517 
 2518           /* We need to deal with \P and \p in both phases. */
 2519 
 2520 #ifdef SUPPORT_UCP
 2521           if (-c == ESC_p || -c == ESC_P)
 2522             {
 2523             BOOL negated;
 2524             int pdata;
 2525             int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
 2526             if (ptype < 0) goto FAILED;
 2527             class_utf8 = TRUE;
 2528             *class_utf8data++ = ((-c == ESC_p) != negated)?
 2529               XCL_PROP : XCL_NOTPROP;
 2530             *class_utf8data++ = ptype;
 2531             *class_utf8data++ = pdata;
 2532             class_charcount -= 2;   /* Not a < 256 character */
 2533             continue;
 2534             }
 2535 #endif
 2536           /* Unrecognized escapes are faulted if PCRE is running in its
 2537           strict mode. By default, for compatibility with Perl, they are
 2538           treated as literals. */
 2539 
 2540           if ((options & PCRE_EXTRA) != 0)
 2541             {
 2542             *errorcodeptr = ERR7;
 2543             goto FAILED;
 2544             }
 2545 
 2546           class_charcount -= 2;  /* Undo the default count from above */
 2547           c = *ptr;              /* Get the final character and fall through */
 2548           }
 2549 
 2550         /* Fall through if we have a single character (c >= 0). This may be
 2551         greater than 256 in UTF-8 mode. */
 2552 
 2553         }   /* End of backslash handling */
 2554 
 2555       /* A single character may be followed by '-' to form a range. However,
 2556       Perl does not permit ']' to be the end of the range. A '-' character
 2557       at the end is treated as a literal. Perl ignores orphaned \E sequences
 2558       entirely. The code for handling \Q and \E is messy. */
 2559 
 2560       CHECK_RANGE:
 2561       while (ptr[1] == '\\' && ptr[2] == 'E')
 2562         {
 2563         inescq = FALSE;
 2564         ptr += 2;
 2565         }
 2566 
 2567       oldptr = ptr;
 2568 
 2569       if (!inescq && ptr[1] == '-')
 2570         {
 2571         int d;
 2572         ptr += 2;
 2573         while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
 2574 
 2575         /* If we hit \Q (not followed by \E) at this point, go into escaped
 2576         mode. */
 2577 
 2578         while (*ptr == '\\' && ptr[1] == 'Q')
 2579           {
 2580           ptr += 2;
 2581           if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
 2582           inescq = TRUE;
 2583           break;
 2584           }
 2585 
 2586         if (*ptr == 0 || (!inescq && *ptr == ']'))
 2587           {
 2588           ptr = oldptr;
 2589           goto LONE_SINGLE_CHARACTER;
 2590           }
 2591 
 2592 #ifdef SUPPORT_UTF8
 2593         if (utf8)
 2594           {                           /* Braces are required because the */
 2595           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
 2596           }
 2597         else
 2598 #endif
 2599         d = *ptr;  /* Not UTF-8 mode */
 2600 
 2601         /* The second part of a range can be a single-character escape, but
 2602         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
 2603         in such circumstances. */
 2604 
 2605         if (!inescq && d == '\\')
 2606           {
 2607           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
 2608           if (*errorcodeptr != 0) goto FAILED;
 2609 
 2610           /* \b is backspace; \X is literal X; \R is literal R; any other
 2611           special means the '-' was literal */
 2612 
 2613           if (d < 0)
 2614             {
 2615             if (d == -ESC_b) d = '\b';
 2616             else if (d == -ESC_X) d = 'X';
 2617             else if (d == -ESC_R) d = 'R'; else
 2618               {
 2619               ptr = oldptr;
 2620               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
 2621               }
 2622             }
 2623           }
 2624 
 2625         /* Check that the two values are in the correct order. Optimize
 2626         one-character ranges */
 2627 
 2628         if (d < c)
 2629           {
 2630           *errorcodeptr = ERR8;
 2631           goto FAILED;
 2632           }
 2633 
 2634         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
 2635 
 2636         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
 2637         matching, we have to use an XCLASS with extra data items. Caseless
 2638         matching for characters > 127 is available only if UCP support is
 2639         available. */
 2640 
 2641 #ifdef SUPPORT_UTF8
 2642         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
 2643           {
 2644           class_utf8 = TRUE;
 2645 
 2646           /* With UCP support, we can find the other case equivalents of
 2647           the relevant characters. There may be several ranges. Optimize how
 2648           they fit with the basic range. */
 2649 
 2650 #ifdef SUPPORT_UCP
 2651           if ((options & PCRE_CASELESS) != 0)
 2652             {
 2653             unsigned int occ, ocd;
 2654             unsigned int cc = c;
 2655             unsigned int origd = d;
 2656             while (get_othercase_range(&cc, origd, &occ, &ocd))
 2657               {
 2658               if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
 2659 
 2660               if (occ < c  && ocd >= c - 1)        /* Extend the basic range */
 2661                 {                                  /* if there is overlap,   */
 2662                 c = occ;                           /* noting that if occ < c */
 2663                 continue;                          /* we can't have ocd > d  */
 2664                 }                                  /* because a subrange is  */
 2665               if (ocd > d && occ <= d + 1)         /* always shorter than    */
 2666                 {                                  /* the basic range.       */
 2667                 d = ocd;
 2668                 continue;
 2669                 }
 2670 
 2671               if (occ == ocd)
 2672                 {
 2673                 *class_utf8data++ = XCL_SINGLE;
 2674                 }
 2675               else
 2676                 {
 2677                 *class_utf8data++ = XCL_RANGE;
 2678                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
 2679                 }
 2680               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
 2681               }
 2682             }
 2683 #endif  /* SUPPORT_UCP */
 2684 
 2685           /* Now record the original range, possibly modified for UCP caseless
 2686           overlapping ranges. */
 2687 
 2688           *class_utf8data++ = XCL_RANGE;
 2689           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
 2690           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
 2691 
 2692           /* With UCP support, we are done. Without UCP support, there is no
 2693           caseless matching for UTF-8 characters > 127; we can use the bit map
 2694           for the smaller ones. */
 2695 
 2696 #ifdef SUPPORT_UCP
 2697           continue;    /* With next character in the class */
 2698 #else
 2699           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
 2700 
 2701           /* Adjust upper limit and fall through to set up the map */
 2702 
 2703           d = 127;
 2704 
 2705 #endif  /* SUPPORT_UCP */
 2706           }
 2707 #endif  /* SUPPORT_UTF8 */
 2708 
 2709         /* We use the bit map for all cases when not in UTF-8 mode; else
 2710         ranges that lie entirely within 0-127 when there is UCP support; else
 2711         for partial ranges without UCP support. */
 2712 
 2713         class_charcount += d - c + 1;
 2714         class_lastchar = d;
 2715 
 2716         /* We can save a bit of time by skipping this in the pre-compile. */
 2717 
 2718         if (lengthptr == NULL) for (; c <= d; c++)
 2719           {
 2720           classbits[c/8] |= (1 << (c&7));
 2721           if ((options & PCRE_CASELESS) != 0)
 2722             {
 2723             int uc = cd->fcc[c];           /* flip case */
 2724             classbits[uc/8] |= (1 << (uc&7));
 2725             }
 2726           }
 2727 
 2728         continue;   /* Go get the next char in the class */
 2729         }
 2730 
 2731       /* Handle a lone single character - we can get here for a normal
 2732       non-escape char, or after \ that introduces a single character or for an
 2733       apparent range that isn't. */
 2734 
 2735       LONE_SINGLE_CHARACTER:
 2736 
 2737       /* Handle a character that cannot go in the bit map */
 2738 
 2739 #ifdef SUPPORT_UTF8
 2740       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
 2741         {
 2742         class_utf8 = TRUE;
 2743         *class_utf8data++ = XCL_SINGLE;
 2744         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
 2745 
 2746 #ifdef SUPPORT_UCP
 2747         if ((options & PCRE_CASELESS) != 0)
 2748           {
 2749           unsigned int othercase;
 2750           if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
 2751             {
 2752             *class_utf8data++ = XCL_SINGLE;
 2753             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
 2754             }
 2755           }
 2756 #endif  /* SUPPORT_UCP */
 2757 
 2758         }
 2759       else
 2760 #endif  /* SUPPORT_UTF8 */
 2761 
 2762       /* Handle a single-byte character */
 2763         {
 2764         classbits[c/8] |= (1 << (c&7));
 2765         if ((options & PCRE_CASELESS) != 0)
 2766           {
 2767           c = cd->fcc[c];   /* flip case */
 2768           classbits[c/8] |= (1 << (c&7));
 2769           }
 2770         class_charcount++;
 2771         class_lastchar = c;
 2772         }
 2773       }
 2774 
 2775     /* Loop until ']' reached. This "while" is the end of the "do" above. */
 2776 
 2777     while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
 2778 
 2779     if (c == 0)                          /* Missing terminating ']' */
 2780       {
 2781       *errorcodeptr = ERR6;
 2782       goto FAILED;
 2783       }
 2784 
 2785     /* If class_charcount is 1, we saw precisely one character whose value is
 2786     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
 2787     can optimize the negative case only if there were no characters >= 128
 2788     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
 2789     single-bytes only. This is an historical hangover. Maybe one day we can
 2790     tidy these opcodes to handle multi-byte characters.
 2791 
 2792     The optimization throws away the bit map. We turn the item into a
 2793     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
 2794     that OP_NOT does not support multibyte characters. In the positive case, it
 2795     can cause firstbyte to be set. Otherwise, there can be no first char if
 2796     this item is first, whatever repeat count may follow. In the case of
 2797     reqbyte, save the previous value for reinstating. */
 2798 
 2799 #ifdef SUPPORT_UTF8
 2800     if (class_charcount == 1 &&
 2801           (!utf8 ||
 2802           (!class_utf8 && (!negate_class || class_lastchar < 128))))
 2803 
 2804 #else
 2805     if (class_charcount == 1)
 2806 #endif
 2807       {
 2808       zeroreqbyte = reqbyte;
 2809 
 2810       /* The OP_NOT opcode works on one-byte characters only. */
 2811 
 2812       if (negate_class)
 2813         {
 2814         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 2815         zerofirstbyte = firstbyte;
 2816         *code++ = OP_NOT;
 2817         *code++ = class_lastchar;
 2818         break;
 2819         }
 2820 
 2821       /* For a single, positive character, get the value into mcbuffer, and
 2822       then we can handle this with the normal one-character code. */
 2823 
 2824 #ifdef SUPPORT_UTF8
 2825       if (utf8 && class_lastchar > 127)
 2826         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
 2827       else
 2828 #endif
 2829         {
 2830         mcbuffer[0] = class_lastchar;
 2831         mclength = 1;
 2832         }
 2833       goto ONE_CHAR;
 2834       }       /* End of 1-char optimization */
 2835 
 2836     /* The general case - not the one-char optimization. If this is the first
 2837     thing in the branch, there can be no first char setting, whatever the
 2838     repeat count. Any reqbyte setting must remain unchanged after any kind of
 2839     repeat. */
 2840 
 2841     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 2842     zerofirstbyte = firstbyte;
 2843     zeroreqbyte = reqbyte;
 2844 
 2845     /* If there are characters with values > 255, we have to compile an
 2846     extended class, with its own opcode. If there are no characters < 256,
 2847     we can omit the bitmap in the actual compiled code. */
 2848 
 2849 #ifdef SUPPORT_UTF8
 2850     if (class_utf8)
 2851       {
 2852       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
 2853       *code++ = OP_XCLASS;
 2854       code += LINK_SIZE;
 2855       *code = negate_class? XCL_NOT : 0;
 2856 
 2857       /* If the map is required, move up the extra data to make room for it;
 2858       otherwise just move the code pointer to the end of the extra data. */
 2859 
 2860       if (class_charcount > 0)
 2861         {
 2862         *code++ |= XCL_MAP;
 2863         memmove(code + 32, code, class_utf8data - code);
 2864         memcpy(code, classbits, 32);
 2865         code = class_utf8data + 32;
 2866         }
 2867       else code = class_utf8data;
 2868 
 2869       /* Now fill in the complete length of the item */
 2870 
 2871       PUT(previous, 1, code - previous);
 2872       break;   /* End of class handling */
 2873       }
 2874 #endif
 2875 
 2876     /* If there are no characters > 255, negate the 32-byte map if necessary,
 2877     and copy it into the code vector. If this is the first thing in the branch,
 2878     there can be no first char setting, whatever the repeat count. Any reqbyte
 2879     setting must remain unchanged after any kind of repeat. */
 2880 
 2881     if (negate_class)
 2882       {
 2883       *code++ = OP_NCLASS;
 2884       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
 2885         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
 2886       }
 2887     else
 2888       {
 2889       *code++ = OP_CLASS;
 2890       memcpy(code, classbits, 32);
 2891       }
 2892     code += 32;
 2893     break;
 2894 
 2895 
 2896     /* ===================================================================*/
 2897     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
 2898     has been tested above. */
 2899 
 2900     case '{':
 2901     if (!is_quantifier) goto NORMAL_CHAR;
 2902     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
 2903     if (*errorcodeptr != 0) goto FAILED;
 2904     goto REPEAT;
 2905 
 2906     case '*':
 2907     repeat_min = 0;
 2908     repeat_max = -1;
 2909     goto REPEAT;
 2910 
 2911     case '+':
 2912     repeat_min = 1;
 2913     repeat_max = -1;
 2914     goto REPEAT;
 2915 
 2916     case '?':
 2917     repeat_min = 0;
 2918     repeat_max = 1;
 2919 
 2920     REPEAT:
 2921     if (previous == NULL)
 2922       {
 2923       *errorcodeptr = ERR9;
 2924       goto FAILED;
 2925       }
 2926 
 2927     if (repeat_min == 0)
 2928       {
 2929       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
 2930       reqbyte = zeroreqbyte;        /* Ditto */
 2931       }
 2932 
 2933     /* Remember whether this is a variable length repeat */
 2934 
 2935     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
 2936 
 2937     op_type = 0;                    /* Default single-char op codes */
 2938     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
 2939 
 2940     /* Save start of previous item, in case we have to move it up to make space
 2941     for an inserted OP_ONCE for the additional '+' extension. */
 2942 
 2943     tempcode = previous;
 2944 
 2945     /* If the next character is '+', we have a possessive quantifier. This
 2946     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
 2947     If the next character is '?' this is a minimizing repeat, by default,
 2948     but if PCRE_UNGREEDY is set, it works the other way round. We change the
 2949     repeat type to the non-default. */
 2950 
 2951     if (ptr[1] == '+')
 2952       {
 2953       repeat_type = 0;                  /* Force greedy */
 2954       possessive_quantifier = TRUE;
 2955       ptr++;
 2956       }
 2957     else if (ptr[1] == '?')
 2958       {
 2959       repeat_type = greedy_non_default;
 2960       ptr++;
 2961       }
 2962     else repeat_type = greedy_default;
 2963 
 2964     /* If previous was a character match, abolish the item and generate a
 2965     repeat item instead. If a char item has a minimum of more than one, ensure
 2966     that it is set in reqbyte - it might not be if a sequence such as x{3} is
 2967     the first thing in a branch because the x will have gone into firstbyte
 2968     instead.  */
 2969 
 2970     if (*previous == OP_CHAR || *previous == OP_CHARNC)
 2971       {
 2972       /* Deal with UTF-8 characters that take up more than one byte. It's
 2973       easier to write this out separately than try to macrify it. Use c to
 2974       hold the length of the character in bytes, plus 0x80 to flag that it's a
 2975       length rather than a small character. */
 2976 
 2977 #ifdef SUPPORT_UTF8
 2978       if (utf8 && (code[-1] & 0x80) != 0)
 2979         {
 2980         uschar *lastchar = code - 1;
 2981         while((*lastchar & 0xc0) == 0x80) lastchar--;
 2982         c = code - lastchar;            /* Length of UTF-8 character */
 2983         memcpy(utf8_char, lastchar, c); /* Save the char */
 2984         c |= 0x80;                      /* Flag c as a length */
 2985         }
 2986       else
 2987 #endif
 2988 
 2989       /* Handle the case of a single byte - either with no UTF8 support, or
 2990       with UTF-8 disabled, or for a UTF-8 character < 128. */
 2991 
 2992         {
 2993         c = code[-1];
 2994         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
 2995         }
 2996 
 2997       /* If the repetition is unlimited, it pays to see if the next thing on
 2998       the line is something that cannot possibly match this character. If so,
 2999       automatically possessifying this item gains some performance in the case
 3000       where the match fails. */
 3001 
 3002       if (!possessive_quantifier &&
 3003           repeat_max < 0 &&
 3004           check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
 3005             options, cd))
 3006         {
 3007         repeat_type = 0;    /* Force greedy */
 3008         possessive_quantifier = TRUE;
 3009         }
 3010 
 3011       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
 3012       }
 3013 
 3014     /* If previous was a single negated character ([^a] or similar), we use
 3015     one of the special opcodes, replacing it. The code is shared with single-
 3016     character repeats by setting op_type to add a suitable offset into
 3017     repeat_type. We can also test for auto-possessification. OP_NOT is
 3018     currently used only for single-byte chars. */
 3019 
 3020     else if (*previous == OP_NOT)
 3021       {
 3022       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
 3023       c = previous[1];
 3024       if (!possessive_quantifier &&
 3025           repeat_max < 0 &&
 3026           check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
 3027         {
 3028         repeat_type = 0;    /* Force greedy */
 3029         possessive_quantifier = TRUE;
 3030         }
 3031       goto OUTPUT_SINGLE_REPEAT;
 3032       }
 3033 
 3034     /* If previous was a character type match (\d or similar), abolish it and
 3035     create a suitable repeat item. The code is shared with single-character
 3036     repeats by setting op_type to add a suitable offset into repeat_type. Note
 3037     that the Unicode property types will be present only when SUPPORT_UCP is
 3038     defined, but we don't wrap the little bits of code here because it just
 3039     makes it horribly messy. */
 3040 
 3041     else if (*previous < OP_EODN)
 3042       {
 3043       uschar *oldcode;
 3044       int prop_type, prop_value;
 3045       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
 3046       c = *previous;
 3047 
 3048       if (!possessive_quantifier &&
 3049           repeat_max < 0 &&
 3050           check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
 3051         {
 3052         repeat_type = 0;    /* Force greedy */
 3053         possessive_quantifier = TRUE;
 3054         }
 3055 
 3056       OUTPUT_SINGLE_REPEAT:
 3057       if (*previous == OP_PROP || *previous == OP_NOTPROP)
 3058         {
 3059         prop_type = previous[1];
 3060         prop_value = previous[2];
 3061         }
 3062       else prop_type = prop_value = -1;
 3063 
 3064       oldcode = code;
 3065       code = previous;                  /* Usually overwrite previous item */
 3066 
 3067       /* If the maximum is zero then the minimum must also be zero; Perl allows
 3068       this case, so we do too - by simply omitting the item altogether. */
 3069 
 3070       if (repeat_max == 0) goto END_REPEAT;
 3071 
 3072       /* All real repeats make it impossible to handle partial matching (maybe
 3073       one day we will be able to remove this restriction). */
 3074 
 3075       if (repeat_max != 1) cd->nopartial = TRUE;
 3076 
 3077       /* Combine the op_type with the repeat_type */
 3078 
 3079       repeat_type += op_type;
 3080 
 3081       /* A minimum of zero is handled either as the special case * or ?, or as
 3082       an UPTO, with the maximum given. */
 3083 
 3084       if (repeat_min == 0)
 3085         {
 3086         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
 3087           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
 3088         else
 3089           {
 3090           *code++ = OP_UPTO + repeat_type;
 3091           PUT2INC(code, 0, repeat_max);
 3092           }
 3093         }
 3094 
 3095       /* A repeat minimum of 1 is optimized into some special cases. If the
 3096       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
 3097       left in place and, if the maximum is greater than 1, we use OP_UPTO with
 3098       one less than the maximum. */
 3099 
 3100       else if (repeat_min == 1)
 3101         {
 3102         if (repeat_max == -1)
 3103           *code++ = OP_PLUS + repeat_type;
 3104         else
 3105           {
 3106           code = oldcode;                 /* leave previous item in place */
 3107           if (repeat_max == 1) goto END_REPEAT;
 3108           *code++ = OP_UPTO + repeat_type;
 3109           PUT2INC(code, 0, repeat_max - 1);
 3110           }
 3111         }
 3112 
 3113       /* The case {n,n} is just an EXACT, while the general case {n,m} is
 3114       handled as an EXACT followed by an UPTO. */
 3115 
 3116       else
 3117         {
 3118         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
 3119         PUT2INC(code, 0, repeat_min);
 3120 
 3121         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
 3122         we have to insert the character for the previous code. For a repeated
 3123         Unicode property match, there are two extra bytes that define the
 3124         required property. In UTF-8 mode, long characters have their length in
 3125         c, with the 0x80 bit as a flag. */
 3126 
 3127         if (repeat_max < 0)
 3128           {
 3129 #ifdef SUPPORT_UTF8
 3130           if (utf8 && c >= 128)
 3131             {
 3132             memcpy(code, utf8_char, c & 7);
 3133             code += c & 7;
 3134             }
 3135           else
 3136 #endif
 3137             {
 3138             *code++ = c;
 3139             if (prop_type >= 0)
 3140               {
 3141               *code++ = prop_type;
 3142               *code++ = prop_value;
 3143               }
 3144             }
 3145           *code++ = OP_STAR + repeat_type;
 3146           }
 3147 
 3148         /* Else insert an UPTO if the max is greater than the min, again
 3149         preceded by the character, for the previously inserted code. If the
 3150         UPTO is just for 1 instance, we can use QUERY instead. */
 3151 
 3152         else if (repeat_max != repeat_min)
 3153           {
 3154 #ifdef SUPPORT_UTF8
 3155           if (utf8 && c >= 128)
 3156             {
 3157             memcpy(code, utf8_char, c & 7);
 3158             code += c & 7;
 3159             }
 3160           else
 3161 #endif
 3162           *code++ = c;
 3163           if (prop_type >= 0)
 3164             {
 3165             *code++ = prop_type;
 3166             *code++ = prop_value;
 3167             }
 3168           repeat_max -= repeat_min;
 3169 
 3170           if (repeat_max == 1)
 3171             {
 3172             *code++ = OP_QUERY + repeat_type;
 3173             }
 3174           else
 3175             {
 3176             *code++ = OP_UPTO + repeat_type;
 3177             PUT2INC(code, 0, repeat_max);
 3178             }
 3179           }
 3180         }
 3181 
 3182       /* The character or character type itself comes last in all cases. */
 3183 
 3184 #ifdef SUPPORT_UTF8
 3185       if (utf8 && c >= 128)
 3186         {
 3187         memcpy(code, utf8_char, c & 7);
 3188         code += c & 7;
 3189         }
 3190       else
 3191 #endif
 3192       *code++ = c;
 3193 
 3194       /* For a repeated Unicode property match, there are two extra bytes that
 3195       define the required property. */
 3196 
 3197 #ifdef SUPPORT_UCP
 3198       if (prop_type >= 0)
 3199         {
 3200         *code++ = prop_type;
 3201         *code++ = prop_value;
 3202         }
 3203 #endif
 3204       }
 3205 
 3206     /* If previous was a character class or a back reference, we put the repeat
 3207     stuff after it, but just skip the item if the repeat was {0,0}. */
 3208 
 3209     else if (*previous == OP_CLASS ||
 3210              *previous == OP_NCLASS ||
 3211 #ifdef SUPPORT_UTF8
 3212              *previous == OP_XCLASS ||
 3213 #endif
 3214              *previous == OP_REF)
 3215       {
 3216       if (repeat_max == 0)
 3217         {
 3218         code = previous;
 3219         goto END_REPEAT;
 3220         }
 3221 
 3222       /* All real repeats make it impossible to handle partial matching (maybe
 3223       one day we will be able to remove this restriction). */
 3224 
 3225       if (repeat_max != 1) cd->nopartial = TRUE;
 3226 
 3227       if (repeat_min == 0 && repeat_max == -1)
 3228         *code++ = OP_CRSTAR + repeat_type;
 3229       else if (repeat_min == 1 && repeat_max == -1)
 3230         *code++ = OP_CRPLUS + repeat_type;
 3231       else if (repeat_min == 0 && repeat_max == 1)
 3232         *code++ = OP_CRQUERY + repeat_type;
 3233       else
 3234         {
 3235         *code++ = OP_CRRANGE + repeat_type;
 3236         PUT2INC(code, 0, repeat_min);
 3237         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
 3238         PUT2INC(code, 0, repeat_max);
 3239         }
 3240       }
 3241 
 3242     /* If previous was a bracket group, we may have to replicate it in certain
 3243     cases. */
 3244 
 3245     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
 3246              *previous == OP_ONCE || *previous == OP_COND)
 3247       {
 3248       register int i;
 3249       int ketoffset = 0;
 3250       int len = code - previous;
 3251       uschar *bralink = NULL;
 3252 
 3253       /* Repeating a DEFINE group is pointless */
 3254 
 3255       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
 3256         {
 3257         *errorcodeptr = ERR55;
 3258         goto FAILED;
 3259         }
 3260 
 3261       /* This is a paranoid check to stop integer overflow later on */
 3262 
 3263       if (len > MAX_DUPLENGTH)
 3264         {
 3265         *errorcodeptr = ERR50;
 3266         goto FAILED;
 3267         }
 3268 
 3269       /* If the maximum repeat count is unlimited, find the end of the bracket
 3270       by scanning through from the start, and compute the offset back to it
 3271       from the current code pointer. There may be an OP_OPT setting following
 3272       the final KET, so we can't find the end just by going back from the code
 3273       pointer. */
 3274 
 3275       if (repeat_max == -1)
 3276         {
 3277         register uschar *ket = previous;
 3278         do ket += GET(ket, 1); while (*ket != OP_KET);
 3279         ketoffset = code - ket;
 3280         }
 3281 
 3282       /* The case of a zero minimum is special because of the need to stick
 3283       OP_BRAZERO in front of it, and because the group appears once in the
 3284       data, whereas in other cases it appears the minimum number of times. For
 3285       this reason, it is simplest to treat this case separately, as otherwise
 3286       the code gets far too messy. There are several special subcases when the
 3287       minimum is zero. */
 3288 
 3289       if (repeat_min == 0)
 3290         {
 3291         /* If the maximum is also zero, we just omit the group from the output
 3292         altogether. */
 3293 
 3294         if (repeat_max == 0)
 3295           {
 3296           code = previous;
 3297           goto END_REPEAT;
 3298           }
 3299 
 3300         /* If the maximum is 1 or unlimited, we just have to stick in the
 3301         BRAZERO and do no more at this point. However, we do need to adjust
 3302         any OP_RECURSE calls inside the group that refer to the group itself or
 3303         any internal or forward referenced group, because the offset is from
 3304         the start of the whole regex. Temporarily terminate the pattern while
 3305         doing this. */
 3306 
 3307         if (repeat_max <= 1)
 3308           {
 3309           *code = OP_END;
 3310           adjust_recurse(previous, 1, utf8, cd, save_hwm);
 3311           memmove(previous+1, previous, len);
 3312           code++;
 3313           *previous++ = OP_BRAZERO + repeat_type;
 3314           }
 3315 
 3316         /* If the maximum is greater than 1 and limited, we have to replicate
 3317         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
 3318         The first one has to be handled carefully because it's the original
 3319         copy, which has to be moved up. The remainder can be handled by code
 3320         that is common with the non-zero minimum case below. We have to
 3321         adjust the value of repeat_max, since one less copy is required. Once
 3322         again, we may have to adjust any OP_RECURSE calls inside the group. */
 3323 
 3324         else
 3325           {
 3326           int offset;
 3327           *code = OP_END;
 3328           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
 3329           memmove(previous + 2 + LINK_SIZE, previous, len);
 3330           code += 2 + LINK_SIZE;
 3331           *previous++ = OP_BRAZERO + repeat_type;
 3332           *previous++ = OP_BRA;
 3333 
 3334           /* We chain together the bracket offset fields that have to be
 3335           filled in later when the ends of the brackets are reached. */
 3336 
 3337           offset = (bralink == NULL)? 0 : previous - bralink;
 3338           bralink = previous;
 3339           PUTINC(previous, 0, offset);
 3340           }
 3341 
 3342         repeat_max--;
 3343         }
 3344 
 3345       /* If the minimum is greater than zero, replicate the group as many
 3346       times as necessary, and adjust the maximum to the number of subsequent
 3347       copies that we need. If we set a first char from the group, and didn't
 3348       set a required char, copy the latter from the former. If there are any
 3349       forward reference subroutine calls in the group, there will be entries on
 3350       the workspace list; replicate these with an appropriate increment. */
 3351 
 3352       else
 3353         {
 3354         if (repeat_min > 1)
 3355           {
 3356           /* In the pre-compile phase, we don't actually do the replication. We
 3357           just adjust the length as if we had. */
 3358 
 3359           if (lengthptr != NULL)
 3360             *lengthptr += (repeat_min - 1)*length_prevgroup;
 3361 
 3362           /* This is compiling for real */
 3363 
 3364           else
 3365             {
 3366             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
 3367             for (i = 1; i < repeat_min; i++)
 3368               {
 3369               uschar *hc;
 3370               uschar *this_hwm = cd->hwm;
 3371               memcpy(code, previous, len);
 3372               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
 3373                 {
 3374                 PUT(cd->hwm, 0, GET(hc, 0) + len);
 3375                 cd->hwm += LINK_SIZE;
 3376                 }
 3377               save_hwm = this_hwm;
 3378               code += len;
 3379               }
 3380             }
 3381           }
 3382 
 3383         if (repeat_max > 0) repeat_max -= repeat_min;
 3384         }
 3385 
 3386       /* This code is common to both the zero and non-zero minimum cases. If
 3387       the maximum is limited, it replicates the group in a nested fashion,
 3388       remembering the bracket starts on a stack. In the case of a zero minimum,
 3389       the first one was set up above. In all cases the repeat_max now specifies
 3390       the number of additional copies needed. Again, we must remember to
 3391       replicate entries on the forward reference list. */
 3392 
 3393       if (repeat_max >= 0)
 3394         {
 3395         /* In the pre-compile phase, we don't actually do the replication. We
 3396         just adjust the length as if we had. For each repetition we must add 1
 3397         to the length for BRAZERO and for all but the last repetition we must
 3398         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
 3399 
 3400         if (lengthptr != NULL && repeat_max > 0)
 3401           *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
 3402             2 - 2*LINK_SIZE;  /* Last one doesn't nest */
 3403 
 3404         /* This is compiling for real */
 3405 
 3406         else for (i = repeat_max - 1; i >= 0; i--)
 3407           {
 3408           uschar *hc;
 3409           uschar *this_hwm = cd->hwm;
 3410 
 3411           *code++ = OP_BRAZERO + repeat_type;
 3412 
 3413           /* All but the final copy start a new nesting, maintaining the
 3414           chain of brackets outstanding. */
 3415 
 3416           if (i != 0)
 3417             {
 3418             int offset;
 3419             *code++ = OP_BRA;
 3420             offset = (bralink == NULL)? 0 : code - bralink;
 3421             bralink = code;
 3422             PUTINC(code, 0, offset);
 3423             }
 3424 
 3425           memcpy(code, previous, len);
 3426           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
 3427             {
 3428             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
 3429             cd->hwm += LINK_SIZE;
 3430             }
 3431           save_hwm = this_hwm;
 3432           code += len;
 3433           }
 3434 
 3435         /* Now chain through the pending brackets, and fill in their length
 3436         fields (which are holding the chain links pro tem). */
 3437 
 3438         while (bralink != NULL)
 3439           {
 3440           int oldlinkoffset;
 3441           int offset = code - bralink + 1;
 3442           uschar *bra = code - offset;
 3443           oldlinkoffset = GET(bra, 1);
 3444           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
 3445           *code++ = OP_KET;
 3446           PUTINC(code, 0, offset);
 3447           PUT(bra, 1, offset);
 3448           }
 3449         }
 3450 
 3451       /* If the maximum is unlimited, set a repeater in the final copy. We
 3452       can't just offset backwards from the current code point, because we
 3453       don't know if there's been an options resetting after the ket. The
 3454       correct offset was computed above.
 3455 
 3456       Then, when we are doing the actual compile phase, check to see whether
 3457       this group is a non-atomic one that could match an empty string. If so,
 3458       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
 3459       that runtime checking can be done. [This check is also applied to
 3460       atomic groups at runtime, but in a different way.] */
 3461 
 3462       else
 3463         {
 3464         uschar *ketcode = code - ketoffset;
 3465         uschar *bracode = ketcode - GET(ketcode, 1);
 3466         *ketcode = OP_KETRMAX + repeat_type;
 3467         if (lengthptr == NULL && *bracode != OP_ONCE)
 3468           {
 3469           uschar *scode = bracode;
 3470           do
 3471             {
 3472             if (could_be_empty_branch(scode, ketcode, utf8))
 3473               {
 3474               *bracode += OP_SBRA - OP_BRA;
 3475               break;
 3476               }
 3477             scode += GET(scode, 1);
 3478             }
 3479           while (*scode == OP_ALT);
 3480           }
 3481         }
 3482       }
 3483 
 3484     /* Else there's some kind of shambles */
 3485 
 3486     else
 3487       {
 3488       *errorcodeptr = ERR11;
 3489       goto FAILED;
 3490       }
 3491 
 3492     /* If the character following a repeat is '+', or if certain optimization
 3493     tests above succeeded, possessive_quantifier is TRUE. For some of the
 3494     simpler opcodes, there is a special alternative opcode for this. For
 3495     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
 3496     The '+' notation is just syntactic sugar, taken from Sun's Java package,
 3497     but the special opcodes can optimize it a bit. The repeated item starts at
 3498     tempcode, not at previous, which might be the first part of a string whose
 3499     (former) last char we repeated.
 3500 
 3501     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
 3502     an 'upto' may follow. We skip over an 'exact' item, and then test the
 3503     length of what remains before proceeding. */
 3504 
 3505     if (possessive_quantifier)
 3506       {
 3507       int len;
 3508       if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
 3509           *tempcode == OP_NOTEXACT)
 3510         tempcode += _pcre_OP_lengths[*tempcode];
 3511       len = code - tempcode;
 3512       if (len > 0) switch (*tempcode)
 3513         {
 3514         case OP_STAR:  *tempcode = OP_POSSTAR; break;
 3515         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
 3516         case OP_QUERY: *tempcode = OP_POSQUERY; break;
 3517         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
 3518 
 3519         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
 3520         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
 3521         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
 3522         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
 3523 
 3524         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
 3525         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
 3526         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
 3527         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
 3528 
 3529         default:
 3530         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
 3531         code += 1 + LINK_SIZE;
 3532         len += 1 + LINK_SIZE;
 3533         tempcode[0] = OP_ONCE;
 3534         *code++ = OP_KET;
 3535         PUTINC(code, 0, len);
 3536         PUT(tempcode, 1, len);
 3537         break;
 3538         }
 3539       }
 3540 
 3541     /* In all cases we no longer have a previous item. We also set the
 3542     "follows varying string" flag for subsequently encountered reqbytes if
 3543     it isn't already set and we have just passed a varying length item. */
 3544 
 3545     END_REPEAT:
 3546     previous = NULL;
 3547     cd->req_varyopt |= reqvary;
 3548     break;
 3549 
 3550 
 3551     /* ===================================================================*/
 3552     /* Start of nested parenthesized sub-expression, or comment or lookahead or
 3553     lookbehind or option setting or condition or all the other extended
 3554     parenthesis forms. First deal with the specials; all are introduced by ?,
 3555     and the appearance of any of them means that this is not a capturing
 3556     group. */
 3557 
 3558     case '(':
 3559     newoptions = options;
 3560     skipbytes = 0;
 3561     bravalue = OP_CBRA;
 3562     save_hwm = cd->hwm;
 3563 
 3564     if (*(++ptr) == '?')
 3565       {
 3566       int i, set, unset, namelen;
 3567       int *optset;
 3568       const uschar *name;
 3569       uschar *slot;
 3570 
 3571       switch (*(++ptr))
 3572         {
 3573         case '#':                 /* Comment; skip to ket */
 3574         ptr++;
 3575         while (*ptr != 0 && *ptr != ')') ptr++;
 3576         if (*ptr == 0)
 3577           {
 3578           *errorcodeptr = ERR18;
 3579           goto FAILED;
 3580           }
 3581         continue;
 3582 
 3583 
 3584         /* ------------------------------------------------------------ */
 3585         case ':':                 /* Non-capturing bracket */
 3586         bravalue = OP_BRA;
 3587         ptr++;
 3588         break;
 3589 
 3590 
 3591         /* ------------------------------------------------------------ */
 3592         case '(':
 3593         bravalue = OP_COND;       /* Conditional group */
 3594 
 3595         /* A condition can be an assertion, a number (referring to a numbered
 3596         group), a name (referring to a named group), or 'R', referring to
 3597         recursion. R<digits> and R&name are also permitted for recursion tests.
 3598 
 3599         There are several syntaxes for testing a named group: (?(name)) is used
 3600         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
 3601 
 3602         There are two unfortunate ambiguities, caused by history. (a) 'R' can
 3603         be the recursive thing or the name 'R' (and similarly for 'R' followed
 3604         by digits), and (b) a number could be a name that consists of digits.
 3605         In both cases, we look for a name first; if not found, we try the other
 3606         cases. */
 3607 
 3608         /* For conditions that are assertions, check the syntax, and then exit
 3609         the switch. This will take control down to where bracketed groups,
 3610         including assertions, are processed. */
 3611 
 3612         if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
 3613           break;
 3614 
 3615         /* Most other conditions use OP_CREF (a couple change to OP_RREF
 3616         below), and all need to skip 3 bytes at the start of the group. */
 3617 
 3618         code[1+LINK_SIZE] = OP_CREF;
 3619         skipbytes = 3;
 3620 
 3621         /* Check for a test for recursion in a named group. */
 3622 
 3623         if (ptr[1] == 'R' && ptr[2] == '&')
 3624           {
 3625           terminator = -1;
 3626           ptr += 2;
 3627           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
 3628           }
 3629 
 3630         /* Check for a test for a named group's having been set, using the Perl
 3631         syntax (?(<name>) or (?('name') */
 3632 
 3633         else if (ptr[1] == '<')
 3634           {
 3635           terminator = '>';
 3636           ptr++;
 3637           }
 3638         else if (ptr[1] == '\'')
 3639           {
 3640           terminator = '\'';
 3641           ptr++;
 3642           }
 3643         else terminator = 0;
 3644 
 3645         /* We now expect to read a name; anything else is an error */
 3646 
 3647         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
 3648           {
 3649           ptr += 1;  /* To get the right offset */
 3650           *errorcodeptr = ERR28;
 3651           goto FAILED;
 3652           }
 3653 
 3654         /* Read the name, but also get it as a number if it's all digits */
 3655 
 3656         recno = 0;
 3657         name = ++ptr;
 3658         while ((cd->ctypes[*ptr] & ctype_word) != 0)
 3659           {
 3660           if (recno >= 0)
 3661             recno = ((digitab[*ptr] & ctype_digit) != 0)?
 3662               recno * 10 + *ptr - '0' : -1;
 3663           ptr++;
 3664           }
 3665         namelen = ptr - name;
 3666 
 3667         if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
 3668           {
 3669           ptr--;      /* Error offset */
 3670           *errorcodeptr = ERR26;
 3671           goto FAILED;
 3672           }
 3673 
 3674         /* Do no further checking in the pre-compile phase. */
 3675 
 3676         if (lengthptr != NULL) break;
 3677 
 3678         /* In the real compile we do the work of looking for the actual
 3679         reference. */
 3680 
 3681         slot = cd->name_table;
 3682         for (i = 0; i < cd->names_found; i++)
 3683           {
 3684           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
 3685           slot += cd->name_entry_size;
 3686           }
 3687 
 3688         /* Found a previous named subpattern */
 3689 
 3690         if (i < cd->names_found)
 3691           {
 3692           recno = GET2(slot, 0);
 3693           PUT2(code, 2+LINK_SIZE, recno);
 3694           }
 3695 
 3696         /* Search the pattern for a forward reference */
 3697 
 3698         else if ((i = find_parens(ptr, cd->bracount, name, namelen,
 3699                         (options & PCRE_EXTENDED) != 0)) > 0)
 3700           {
 3701           PUT2(code, 2+LINK_SIZE, i);
 3702           }
 3703 
 3704         /* If terminator == 0 it means that the name followed directly after
 3705         the opening parenthesis [e.g. (?(abc)...] and in this case there are
 3706         some further alternatives to try. For the cases where terminator != 0
 3707         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
 3708         now checked all the possibilities, so give an error. */
 3709 
 3710         else if (terminator != 0)
 3711           {
 3712           *errorcodeptr = ERR15;
 3713           goto FAILED;
 3714           }
 3715 
 3716         /* Check for (?(R) for recursion. Allow digits after R to specify a
 3717         specific group number. */
 3718 
 3719         else if (*name == 'R')
 3720           {
 3721           recno = 0;
 3722           for (i = 1; i < namelen; i++)
 3723             {
 3724             if ((digitab[name[i]] & ctype_digit) == 0)
 3725               {
 3726               *errorcodeptr = ERR15;
 3727               goto FAILED;
 3728               }
 3729             recno = recno * 10 + name[i] - '0';
 3730             }
 3731           if (recno == 0) recno = RREF_ANY;
 3732           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
 3733           PUT2(code, 2+LINK_SIZE, recno);
 3734           }
 3735 
 3736         /* Similarly, check for the (?(DEFINE) "condition", which is always
 3737         false. */
 3738 
 3739         else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
 3740           {
 3741           code[1+LINK_SIZE] = OP_DEF;
 3742           skipbytes = 1;
 3743           }
 3744 
 3745         /* Check for the "name" actually being a subpattern number. */
 3746 
 3747         else if (recno > 0)
 3748           {
 3749           PUT2(code, 2+LINK_SIZE, recno);
 3750           }
 3751 
 3752         /* Either an unidentified subpattern, or a reference to (?(0) */
 3753 
 3754         else
 3755           {
 3756           *errorcodeptr = (recno == 0)? ERR35: ERR15;
 3757           goto FAILED;
 3758           }
 3759         break;
 3760 
 3761 
 3762         /* ------------------------------------------------------------ */
 3763         case '=':                 /* Positive lookahead */
 3764         bravalue = OP_ASSERT;
 3765         ptr++;
 3766         break;
 3767 
 3768 
 3769         /* ------------------------------------------------------------ */
 3770         case '!':                 /* Negative lookahead */
 3771         bravalue = OP_ASSERT_NOT;
 3772         ptr++;
 3773         break;
 3774 
 3775 
 3776         /* ------------------------------------------------------------ */
 3777         case '<':                 /* Lookbehind or named define */
 3778         switch (ptr[1])
 3779           {
 3780           case '=':               /* Positive lookbehind */
 3781           bravalue = OP_ASSERTBACK;
 3782           ptr += 2;
 3783           break;
 3784 
 3785           case '!':               /* Negative lookbehind */
 3786           bravalue = OP_ASSERTBACK_NOT;
 3787           ptr += 2;
 3788           break;
 3789 
 3790           default:                /* Could be name define, else bad */
 3791           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
 3792           ptr++;                  /* Correct offset for error */
 3793           *errorcodeptr = ERR24;
 3794           goto FAILED;
 3795           }
 3796         break;
 3797 
 3798 
 3799         /* ------------------------------------------------------------ */
 3800         case '>':                 /* One-time brackets */
 3801         bravalue = OP_ONCE;
 3802         ptr++;
 3803         break;
 3804 
 3805 
 3806         /* ------------------------------------------------------------ */
 3807         case 'C':                 /* Callout - may be followed by digits; */
 3808         previous_callout = code;  /* Save for later completion */
 3809         after_manual_callout = 1; /* Skip one item before completing */
 3810         *code++ = OP_CALLOUT;
 3811           {
 3812           int n = 0;
 3813           while ((digitab[*(++ptr)] & ctype_digit) != 0)
 3814             n = n * 10 + *ptr - '0';
 3815           if (*ptr != ')')
 3816             {
 3817             *errorcodeptr = ERR39;
 3818             goto FAILED;
 3819             }
 3820           if (n > 255)
 3821             {
 3822             *errorcodeptr = ERR38;
 3823             goto FAILED;
 3824             }
 3825           *code++ = n;
 3826           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
 3827           PUT(code, LINK_SIZE, 0);                    /* Default length */
 3828           code += 2 * LINK_SIZE;
 3829           }
 3830         previous = NULL;
 3831         continue;
 3832 
 3833 
 3834         /* ------------------------------------------------------------ */
 3835         case 'P':                 /* Python-style named subpattern handling */
 3836         if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
 3837           {
 3838           is_recurse = *ptr == '>';
 3839           terminator = ')';
 3840           goto NAMED_REF_OR_RECURSE;
 3841           }
 3842         else if (*ptr != '<')    /* Test for Python-style definition */
 3843           {
 3844           *errorcodeptr = ERR41;
 3845           goto FAILED;
 3846           }
 3847         /* Fall through to handle (?P< as (?< is handled */
 3848 
 3849 
 3850         /* ------------------------------------------------------------ */
 3851         DEFINE_NAME:    /* Come here from (?< handling */
 3852         case '\'':
 3853           {
 3854           terminator = (*ptr == '<')? '>' : '\'';
 3855           name = ++ptr;
 3856 
 3857           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
 3858           namelen = ptr - name;
 3859 
 3860           /* In the pre-compile phase, just do a syntax check. */
 3861 
 3862           if (lengthptr != NULL)
 3863             {
 3864             if (*ptr != terminator)
 3865               {
 3866               *errorcodeptr = ERR42;
 3867               goto FAILED;
 3868               }
 3869             if (cd->names_found >= MAX_NAME_COUNT)
 3870               {
 3871               *errorcodeptr = ERR49;
 3872               goto FAILED;
 3873               }
 3874             if (namelen + 3 > cd->name_entry_size)
 3875               {
 3876               cd->name_entry_size = namelen + 3;
 3877               if (namelen > MAX_NAME_SIZE)
 3878                 {
 3879                 *errorcodeptr = ERR48;
 3880                 goto FAILED;
 3881                 }
 3882               }
 3883             }
 3884 
 3885           /* In the real compile, create the entry in the table */
 3886 
 3887           else
 3888             {
 3889             slot = cd->name_table;
 3890             for (i = 0; i < cd->names_found; i++)
 3891               {
 3892               int crc = memcmp(name, slot+2, namelen);
 3893               if (crc == 0)
 3894                 {
 3895                 if (slot[2+namelen] == 0)
 3896                   {
 3897                   if ((options & PCRE_DUPNAMES) == 0)
 3898                     {
 3899                     *errorcodeptr = ERR43;
 3900                     goto FAILED;
 3901                     }
 3902                   }
 3903                 else crc = -1;      /* Current name is substring */
 3904                 }
 3905               if (crc < 0)
 3906                 {
 3907                 memmove(slot + cd->name_entry_size, slot,
 3908                   (cd->names_found - i) * cd->name_entry_size);
 3909                 break;
 3910                 }
 3911               slot += cd->name_entry_size;
 3912               }
 3913 
 3914             PUT2(slot, 0, cd->bracount + 1);
 3915             memcpy(slot + 2, name, namelen);
 3916             slot[2+namelen] = 0;
 3917             }
 3918           }
 3919 
 3920         /* In both cases, count the number of names we've encountered. */
 3921 
 3922         ptr++;                    /* Move past > or ' */
 3923         cd->names_found++;
 3924         goto NUMBERED_GROUP;
 3925 
 3926 
 3927         /* ------------------------------------------------------------ */
 3928         case '&':                 /* Perl recursion/subroutine syntax */
 3929         terminator = ')';
 3930         is_recurse = TRUE;
 3931         /* Fall through */
 3932 
 3933         /* We come here from the Python syntax above that handles both
 3934         references (?P=name) and recursion (?P>name), as well as falling
 3935         through from the Perl recursion syntax (?&name). */
 3936 
 3937         NAMED_REF_OR_RECURSE:
 3938         name = ++ptr;
 3939         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
 3940         namelen = ptr - name;
 3941 
 3942         /* In the pre-compile phase, do a syntax check and set a dummy
 3943         reference number. */
 3944 
 3945         if (lengthptr != NULL)
 3946           {
 3947           if (*ptr != terminator)
 3948             {
 3949             *errorcodeptr = ERR42;
 3950             goto FAILED;
 3951             }
 3952           if (namelen > MAX_NAME_SIZE)
 3953             {
 3954             *errorcodeptr = ERR48;
 3955             goto FAILED;
 3956             }
 3957           recno = 0;
 3958           }
 3959 
 3960         /* In the real compile, seek the name in the table */
 3961 
 3962         else
 3963           {
 3964           slot = cd->name_table;
 3965           for (i = 0; i < cd->names_found; i++)
 3966             {
 3967             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
 3968             slot += cd->name_entry_size;
 3969             }
 3970 
 3971           if (i < cd->names_found)         /* Back reference */
 3972             {
 3973             recno = GET2(slot, 0);
 3974             }
 3975           else if ((recno =                /* Forward back reference */
 3976                     find_parens(ptr, cd->bracount, name, namelen,
 3977                       (options & PCRE_EXTENDED) != 0)) <= 0)
 3978             {
 3979             *errorcodeptr = ERR15;
 3980             goto FAILED;
 3981             }
 3982           }
 3983 
 3984         /* In both phases, we can now go to the code that handles numerical
 3985         recursion or backreferences. */
 3986 
 3987         if (is_recurse) goto HANDLE_RECURSION;
 3988           else goto HANDLE_REFERENCE;
 3989 
 3990 
 3991         /* ------------------------------------------------------------ */
 3992         case 'R':                 /* Recursion */
 3993         ptr++;                    /* Same as (?0)      */
 3994         /* Fall through */
 3995 
 3996 
 3997         /* ------------------------------------------------------------ */
 3998         case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
 3999         case '5': case '6': case '7': case '8': case '9':   /* subroutine */
 4000           {
 4001           const uschar *called;
 4002           recno = 0;
 4003           while((digitab[*ptr] & ctype_digit) != 0)
 4004             recno = recno * 10 + *ptr++ - '0';
 4005           if (*ptr != ')')
 4006             {
 4007             *errorcodeptr = ERR29;
 4008             goto FAILED;
 4009             }
 4010 
 4011           /* Come here from code above that handles a named recursion */
 4012 
 4013           HANDLE_RECURSION:
 4014 
 4015           previous = code;
 4016           called = cd->start_code;
 4017 
 4018           /* When we are actually compiling, find the bracket that is being
 4019           referenced. Temporarily end the regex in case it doesn't exist before
 4020           this point. If we end up with a forward reference, first check that
 4021           the bracket does occur later so we can give the error (and position)
 4022           now. Then remember this forward reference in the workspace so it can
 4023           be filled in at the end. */
 4024 
 4025           if (lengthptr == NULL)
 4026             {
 4027             *code = OP_END;
 4028             if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
 4029 
 4030             /* Forward reference */
 4031 
 4032             if (called == NULL)
 4033               {
 4034               if (find_parens(ptr, cd->bracount, NULL, recno,
 4035                    (options & PCRE_EXTENDED) != 0) < 0)
 4036                 {
 4037                 *errorcodeptr = ERR15;
 4038                 goto FAILED;
 4039                 }
 4040               called = cd->start_code + recno;
 4041               PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
 4042               }
 4043 
 4044             /* If not a forward reference, and the subpattern is still open,
 4045             this is a recursive call. We check to see if this is a left
 4046             recursion that could loop for ever, and diagnose that case. */
 4047 
 4048             else if (GET(called, 1) == 0 &&
 4049                      could_be_empty(called, code, bcptr, utf8))
 4050               {
 4051               *errorcodeptr = ERR40;
 4052               goto FAILED;
 4053               }
 4054             }
 4055 
 4056           /* Insert the recursion/subroutine item, automatically wrapped inside
 4057           "once" brackets. Set up a "previous group" length so that a
 4058           subsequent quantifier will work. */
 4059 
 4060           *code = OP_ONCE;
 4061           PUT(code, 1, 2 + 2*LINK_SIZE);
 4062           code += 1 + LINK_SIZE;
 4063 
 4064           *code = OP_RECURSE;
 4065           PUT(code, 1, called - cd->start_code);
 4066           code += 1 + LINK_SIZE;
 4067 
 4068           *code = OP_KET;
 4069           PUT(code, 1, 2 + 2*LINK_SIZE);
 4070           code += 1 + LINK_SIZE;
 4071 
 4072           length_prevgroup = 3 + 3*LINK_SIZE;
 4073           }
 4074 
 4075         /* Can't determine a first byte now */
 4076 
 4077         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 4078         continue;
 4079 
 4080 
 4081         /* ------------------------------------------------------------ */
 4082         default:              /* Other characters: check option setting */
 4083         set = unset = 0;
 4084         optset = &set;
 4085 
 4086         while (*ptr != ')' && *ptr != ':')
 4087           {
 4088           switch (*ptr++)
 4089             {
 4090             case '-': optset = &unset; break;
 4091 
 4092             case 'J':    /* Record that it changed in the external options */
 4093             *optset |= PCRE_DUPNAMES;
 4094             cd->external_options |= PCRE_JCHANGED;
 4095             break;
 4096 
 4097             case 'i': *optset |= PCRE_CASELESS; break;
 4098             case 'm': *optset |= PCRE_MULTILINE; break;
 4099             case 's': *optset |= PCRE_DOTALL; break;
 4100             case 'x': *optset |= PCRE_EXTENDED; break;
 4101             case 'U': *optset |= PCRE_UNGREEDY; break;
 4102             case 'X': *optset |= PCRE_EXTRA; break;
 4103 
 4104             default:  *errorcodeptr = ERR12;
 4105                       ptr--;    /* Correct the offset */
 4106                       goto FAILED;
 4107             }
 4108           }
 4109 
 4110         /* Set up the changed option bits, but don't change anything yet. */
 4111 
 4112         newoptions = (options | set) & (~unset);
 4113 
 4114         /* If the options ended with ')' this is not the start of a nested
 4115         group with option changes, so the options change at this level. If this
 4116         item is right at the start of the pattern, the options can be
 4117         abstracted and made external in the pre-compile phase, and ignored in
 4118         the compile phase. This can be helpful when matching -- for instance in
 4119         caseless checking of required bytes.
 4120 
 4121         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
 4122         definitely *not* at the start of the pattern because something has been
 4123         compiled. In the pre-compile phase, however, the code pointer can have
 4124         that value after the start, because it gets reset as code is discarded
 4125         during the pre-compile. However, this can happen only at top level - if
 4126         we are within parentheses, the starting BRA will still be present. At
 4127         any parenthesis level, the length value can be used to test if anything
 4128         has been compiled at that level. Thus, a test for both these conditions
 4129         is necessary to ensure we correctly detect the start of the pattern in
 4130         both phases.
 4131 
 4132         If we are not at the pattern start, compile code to change the ims
 4133         options if this setting actually changes any of them. We also pass the
 4134         new setting back so that it can be put at the start of any following
 4135         branches, and when this group ends (if we are in a group), a resetting
 4136         item can be compiled. */
 4137 
 4138         if (*ptr == ')')
 4139           {
 4140           if (code == cd->start_code + 1 + LINK_SIZE &&
 4141                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
 4142             {
 4143             cd->external_options = newoptions;
 4144             options = newoptions;
 4145             }
 4146          else
 4147             {
 4148             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
 4149               {
 4150               *code++ = OP_OPT;
 4151               *code++ = newoptions & PCRE_IMS;
 4152               }
 4153 
 4154             /* Change options at this level, and pass them back for use
 4155             in subsequent branches. Reset the greedy defaults and the case
 4156             value for firstbyte and reqbyte. */
 4157 
 4158             *optionsptr = options = newoptions;
 4159             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
 4160             greedy_non_default = greedy_default ^ 1;
 4161             req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
 4162             }
 4163 
 4164           previous = NULL;       /* This item can't be repeated */
 4165           continue;              /* It is complete */
 4166           }
 4167 
 4168         /* If the options ended with ':' we are heading into a nested group
 4169         with possible change of options. Such groups are non-capturing and are
 4170         not assertions of any kind. All we need to do is skip over the ':';
 4171         the newoptions value is handled below. */
 4172 
 4173         bravalue = OP_BRA;
 4174         ptr++;
 4175         }     /* End of switch for character following (? */
 4176       }       /* End of (? handling */
 4177 
 4178     /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
 4179     all unadorned brackets become non-capturing and behave like (?:...)
 4180     brackets. */
 4181 
 4182     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
 4183       {
 4184       bravalue = OP_BRA;
 4185       }
 4186 
 4187     /* Else we have a capturing group. */
 4188 
 4189     else
 4190       {
 4191       NUMBERED_GROUP:
 4192       cd->bracount += 1;
 4193       PUT2(code, 1+LINK_SIZE, cd->bracount);
 4194       skipbytes = 2;
 4195       }
 4196 
 4197     /* Process nested bracketed regex. Assertions may not be repeated, but
 4198     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
 4199     non-register variable in order to be able to pass its address because some
 4200     compilers complain otherwise. Pass in a new setting for the ims options if
 4201     they have changed. */
 4202 
 4203     previous = (bravalue >= OP_ONCE)? code : NULL;
 4204     *code = bravalue;
 4205     tempcode = code;
 4206     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
 4207     length_prevgroup = 0;              /* Initialize for pre-compile phase */
 4208 
 4209     if (!compile_regex(
 4210          newoptions,                   /* The complete new option state */
 4211          options & PCRE_IMS,           /* The previous ims option state */
 4212          &tempcode,                    /* Where to put code (updated) */
 4213          &ptr,                         /* Input pointer (updated) */
 4214          errorcodeptr,                 /* Where to put an error message */
 4215          (bravalue == OP_ASSERTBACK ||
 4216           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
 4217          skipbytes,                    /* Skip over bracket number */
 4218          &subfirstbyte,                /* For possible first char */
 4219          &subreqbyte,                  /* For possible last char */
 4220          bcptr,                        /* Current branch chain */
 4221          cd,                           /* Tables block */
 4222          (lengthptr == NULL)? NULL :   /* Actual compile phase */
 4223            &length_prevgroup           /* Pre-compile phase */
 4224          ))
 4225       goto FAILED;
 4226 
 4227     /* At the end of compiling, code is still pointing to the start of the
 4228     group, while tempcode has been updated to point past the end of the group
 4229     and any option resetting that may follow it. The pattern pointer (ptr)
 4230     is on the bracket. */
 4231 
 4232     /* If this is a conditional bracket, check that there are no more than
 4233     two branches in the group, or just one if it's a DEFINE group. */
 4234 
 4235     if (bravalue == OP_COND)
 4236       {
 4237       uschar *tc = code;
 4238       int condcount = 0;
 4239 
 4240       do {
 4241          condcount++;
 4242          tc += GET(tc,1);
 4243          }
 4244       while (*tc != OP_KET);
 4245 
 4246       /* A DEFINE group is never obeyed inline (the "condition" is always
 4247       false). It must have only one branch. */
 4248 
 4249       if (code[LINK_SIZE+1] == OP_DEF)
 4250         {
 4251         if (condcount > 1)
 4252           {
 4253           *errorcodeptr = ERR54;
 4254           goto FAILED;
 4255           }
 4256         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
 4257         }
 4258 
 4259       /* A "normal" conditional group. If there is just one branch, we must not
 4260       make use of its firstbyte or reqbyte, because this is equivalent to an
 4261       empty second branch. */
 4262 
 4263       else
 4264         {
 4265         if (condcount > 2)
 4266           {
 4267           *errorcodeptr = ERR27;
 4268           goto FAILED;
 4269           }
 4270         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
 4271         }
 4272       }
 4273 
 4274     /* Error if hit end of pattern */
 4275 
 4276     if (*ptr != ')')
 4277       {
 4278       *errorcodeptr = ERR14;
 4279       goto FAILED;
 4280       }
 4281 
 4282     /* In the pre-compile phase, update the length by the length of the nested
 4283     group, less the brackets at either end. Then reduce the compiled code to
 4284     just the brackets so that it doesn't use much memory if it is duplicated by
 4285     a quantifier. */
 4286 
 4287     if (lengthptr != NULL)
 4288       {
 4289       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
 4290       code++;
 4291       PUTINC(code, 0, 1 + LINK_SIZE);
 4292       *code++ = OP_KET;
 4293       PUTINC(code, 0, 1 + LINK_SIZE);
 4294       }
 4295 
 4296     /* Otherwise update the main code pointer to the end of the group. */
 4297 
 4298     else code = tempcode;
 4299 
 4300     /* For a DEFINE group, required and first character settings are not
 4301     relevant. */
 4302 
 4303     if (bravalue == OP_DEF) break;
 4304 
 4305     /* Handle updating of the required and first characters for other types of
 4306     group. Update for normal brackets of all kinds, and conditions with two
 4307     branches (see code above). If the bracket is followed by a quantifier with
 4308     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
 4309     zerofirstbyte outside the main loop so that they can be accessed for the
 4310     back off. */
 4311 
 4312     zeroreqbyte = reqbyte;
 4313     zerofirstbyte = firstbyte;
 4314     groupsetfirstbyte = FALSE;
 4315 
 4316     if (bravalue >= OP_ONCE)
 4317       {
 4318       /* If we have not yet set a firstbyte in this branch, take it from the
 4319       subpattern, remembering that it was set here so that a repeat of more
 4320       than one can replicate it as reqbyte if necessary. If the subpattern has
 4321       no firstbyte, set "none" for the whole branch. In both cases, a zero
 4322       repeat forces firstbyte to "none". */
 4323 
 4324       if (firstbyte == REQ_UNSET)
 4325         {
 4326         if (subfirstbyte >= 0)
 4327           {
 4328           firstbyte = subfirstbyte;
 4329           groupsetfirstbyte = TRUE;
 4330           }
 4331         else firstbyte = REQ_NONE;
 4332         zerofirstbyte = REQ_NONE;
 4333         }
 4334 
 4335       /* If firstbyte was previously set, convert the subpattern's firstbyte
 4336       into reqbyte if there wasn't one, using the vary flag that was in
 4337       existence beforehand. */
 4338 
 4339       else if (subfirstbyte >= 0 && subreqbyte < 0)
 4340         subreqbyte = subfirstbyte | tempreqvary;
 4341 
 4342       /* If the subpattern set a required byte (or set a first byte that isn't
 4343       really the first byte - see above), set it. */
 4344 
 4345       if (subreqbyte >= 0) reqbyte = subreqbyte;
 4346       }
 4347 
 4348     /* For a forward assertion, we take the reqbyte, if set. This can be
 4349     helpful if the pattern that follows the assertion doesn't set a different
 4350     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
 4351     for an assertion, however because it leads to incorrect effect for patterns
 4352     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
 4353     of a firstbyte. This is overcome by a scan at the end if there's no
 4354     firstbyte, looking for an asserted first char. */
 4355 
 4356     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
 4357     break;     /* End of processing '(' */
 4358 
 4359 
 4360     /* ===================================================================*/
 4361     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
 4362     are arranged to be the negation of the corresponding OP_values. For the
 4363     back references, the values are ESC_REF plus the reference number. Only
 4364     back references and those types that consume a character may be repeated.
 4365     We can test for values between ESC_b and ESC_Z for the latter; this may
 4366     have to change if any new ones are ever created. */
 4367 
 4368     case '\\':
 4369     tempptr = ptr;
 4370     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
 4371     if (*errorcodeptr != 0) goto FAILED;
 4372 
 4373     if (c < 0)
 4374       {
 4375       if (-c == ESC_Q)            /* Handle start of quoted string */
 4376         {
 4377         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
 4378           else inescq = TRUE;
 4379         continue;
 4380         }
 4381 
 4382       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
 4383 
 4384       /* For metasequences that actually match a character, we disable the
 4385       setting of a first character if it hasn't already been set. */
 4386 
 4387       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
 4388         firstbyte = REQ_NONE;
 4389 
 4390       /* Set values to reset to if this is followed by a zero repeat. */
 4391 
 4392       zerofirstbyte = firstbyte;
 4393       zeroreqbyte = reqbyte;
 4394 
 4395       /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
 4396 
 4397       if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
 4398         {
 4399         is_recurse = FALSE;
 4400         terminator = (*(++ptr) == '<')? '>' : '\'';
 4401         goto NAMED_REF_OR_RECURSE;
 4402         }
 4403 
 4404       /* Back references are handled specially; must disable firstbyte if
 4405       not set to cope with cases like (?=(\w+))\1: which would otherwise set
 4406       ':' later. */
 4407 
 4408       if (-c >= ESC_REF)
 4409         {
 4410         recno = -c - ESC_REF;
 4411 
 4412         HANDLE_REFERENCE:    /* Come here from named backref handling */
 4413         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
 4414         previous = code;
 4415         *code++ = OP_REF;
 4416         PUT2INC(code, 0, recno);
 4417         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
 4418         if (recno > cd->top_backref) cd->top_backref = recno;
 4419         }
 4420 
 4421       /* So are Unicode property matches, if supported. */
 4422 
 4423 #ifdef SUPPORT_UCP
 4424       else if (-c == ESC_P || -c == ESC_p)
 4425         {
 4426         BOOL negated;
 4427         int pdata;
 4428         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
 4429         if (ptype < 0) goto FAILED;
 4430         previous = code;
 4431         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
 4432         *code++ = ptype;
 4433         *code++ = pdata;
 4434         }
 4435 #else
 4436 
 4437       /* If Unicode properties are not supported, \X, \P, and \p are not
 4438       allowed. */
 4439 
 4440       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
 4441         {
 4442         *errorcodeptr = ERR45;
 4443         goto FAILED;
 4444         }
 4445 #endif
 4446 
 4447       /* For the rest (including \X when Unicode properties are supported), we
 4448       can obtain the OP value by negating the escape value. */
 4449 
 4450       else
 4451         {
 4452         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
 4453         *code++ = -c;
 4454         }
 4455       continue;
 4456       }
 4457 
 4458     /* We have a data character whose value is in c. In UTF-8 mode it may have
 4459     a value > 127. We set its representation in the length/buffer, and then
 4460     handle it as a data character. */
 4461 
 4462 #ifdef SUPPORT_UTF8
 4463     if (utf8 && c > 127)
 4464       mclength = _pcre_ord2utf8(c, mcbuffer);
 4465     else
 4466 #endif
 4467 
 4468      {
 4469      mcbuffer[0] = c;
 4470      mclength = 1;
 4471      }
 4472     goto ONE_CHAR;
 4473 
 4474 
 4475     /* ===================================================================*/
 4476     /* Handle a literal character. It is guaranteed not to be whitespace or #
 4477     when the extended flag is set. If we are in UTF-8 mode, it may be a
 4478     multi-byte literal character. */
 4479 
 4480     default:
 4481     NORMAL_CHAR:
 4482     mclength = 1;
 4483     mcbuffer[0] = c;
 4484 
 4485 #ifdef SUPPORT_UTF8
 4486     if (utf8 && c >= 0xc0)
 4487       {
 4488       while ((ptr[1] & 0xc0) == 0x80)
 4489         mcbuffer[mclength++] = *(++ptr);
 4490       }
 4491 #endif
 4492 
 4493     /* At this point we have the character's bytes in mcbuffer, and the length
 4494     in mclength. When not in UTF-8 mode, the length is always 1. */
 4495 
 4496     ONE_CHAR:
 4497     previous = code;
 4498     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
 4499     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
 4500 
 4501     /* Set the first and required bytes appropriately. If no previous first
 4502     byte, set it from this character, but revert to none on a zero repeat.
 4503     Otherwise, leave the firstbyte value alone, and don't change it on a zero
 4504     repeat. */
 4505 
 4506     if (firstbyte == REQ_UNSET)
 4507       {
 4508       zerofirstbyte = REQ_NONE;
 4509       zeroreqbyte = reqbyte;
 4510 
 4511       /* If the character is more than one byte long, we can set firstbyte
 4512       only if it is not to be matched caselessly. */
 4513 
 4514       if (mclength == 1 || req_caseopt == 0)
 4515         {
 4516         firstbyte = mcbuffer[0] | req_caseopt;
 4517         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
 4518         }
 4519       else firstbyte = reqbyte = REQ_NONE;
 4520       }
 4521 
 4522     /* firstbyte was previously set; we can set reqbyte only if the length is
 4523     1 or the matching is caseful. */
 4524 
 4525     else
 4526       {
 4527       zerofirstbyte = firstbyte;
 4528       zeroreqbyte = reqbyte;
 4529       if (mclength == 1 || req_caseopt == 0)
 4530         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
 4531       }
 4532 
 4533     break;            /* End of literal character handling */
 4534     }
 4535   }                   /* end of big loop */
 4536 
 4537 
 4538 /* Control never reaches here by falling through, only by a goto for all the
 4539 error states. Pass back the position in the pattern so that it can be displayed
 4540 to the user for diagnosing the error. */
 4541 
 4542 FAILED:
 4543 *ptrptr = ptr;
 4544 return FALSE;
 4545 }
 4546 
 4547 
 4548 
 4549 
 4550 /*************************************************
 4551 *     Compile sequence of alternatives           *
 4552 *************************************************/
 4553 
 4554 /* On entry, ptr is pointing past the bracket character, but on return it
 4555 points to the closing bracket, or vertical bar, or end of string. The code
 4556 variable is pointing at the byte into which the BRA operator has been stored.
 4557 If the ims options are changed at the start (for a (?ims: group) or during any
 4558 branch, we need to insert an OP_OPT item at the start of every following branch
 4559 to ensure they get set correctly at run time, and also pass the new options
 4560 into every subsequent branch compile.
 4561 
 4562 This function is used during the pre-compile phase when we are trying to find
 4563 out the amount of memory needed, as well as during the real compile phase. The
 4564 value of lengthptr distinguishes the two phases.
 4565 
 4566 Arguments:
 4567   options        option bits, including any changes for this subpattern
 4568   oldims         previous settings of ims option bits
 4569   codeptr        -> the address of the current code pointer
 4570   ptrptr         -> the address of the current pattern pointer
 4571   errorcodeptr   -> pointer to error code variable
 4572   lookbehind     TRUE if this is a lookbehind assertion
 4573   skipbytes      skip this many bytes at start (for brackets and OP_COND)
 4574   firstbyteptr   place to put the first required character, or a negative number
 4575   reqbyteptr     place to put the last required character, or a negative number
 4576   bcptr          pointer to the chain of currently open branches
 4577   cd             points to the data block with tables pointers etc.
 4578   lengthptr      NULL during the real compile phase
 4579                  points to length accumulator during pre-compile phase
 4580 
 4581 Returns:         TRUE on success
 4582 */
 4583 
 4584 static BOOL
 4585 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
 4586   int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
 4587   int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
 4588 {
      /* Compile one bracketed group: a sequence of '|'-separated branches, each
      handed to compile_branch(). Returns TRUE on success; on failure returns FALSE
      with *ptrptr set to the pattern position reached. */
 4589 const uschar *ptr = *ptrptr;
 4590 uschar *code = *codeptr;
 4591 uschar *last_branch = code;
 4592 uschar *start_bracket = code;
 4593 uschar *reverse_count = NULL;
 4594 int firstbyte, reqbyte;
 4595 int branchfirstbyte, branchreqbyte;
 4596 int length;
 4597 branch_chain bc;
 4598 
      /* Push this group onto the chain of open branches; &bc is what gets passed
      down to compile_branch(), and bc.current is moved on at each OP_ALT. */
 4599 bc.outer = bcptr;
 4600 bc.current = code;
 4601 
 4602 firstbyte = reqbyte = REQ_UNSET;
 4603 
 4604 /* Accumulate the length for use in the pre-compile phase. Start with the
 4605 length of the BRA and KET and any extra bytes that are required at the
 4606 beginning. We accumulate in a local variable to save frequent testing of
 4607 lengthptr for NULL. We cannot do this by looking at the value of code at the
 4608 start and end of each alternative, because compiled items are discarded during
 4609 the pre-compile phase so that the work space is not exceeded. */
 4610 
 4611 length = 2 + 2*LINK_SIZE + skipbytes;
 4612 
 4613 /* WARNING: If the above line is changed for any reason, you must also change
 4614 the code that abstracts option settings at the start of the pattern and makes
 4615 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
 4616 pre-compile phase to find out whether anything has yet been compiled or not. */
 4617 
 4618 /* Offset is set zero to mark that this bracket is still open */
 4619 
 4620 PUT(code, 1, 0);
 4621 code += 1 + LINK_SIZE + skipbytes;
 4622 
 4623 /* Loop for each alternative branch */
 4624 
 4625 for (;;)
 4626   {
 4627   /* Handle a change of ims options at the start of the branch */
 4628 
 4629   if ((options & PCRE_IMS) != oldims)
 4630     {
 4631     *code++ = OP_OPT;
 4632     *code++ = options & PCRE_IMS;
 4633     length += 2;
 4634     }
 4635 
 4636   /* Set up dummy OP_REVERSE if lookbehind assertion */
 4637 
 4638   if (lookbehind)
 4639     {
 4640     *code++ = OP_REVERSE;
          /* Remember where the count lives; it is filled in once the branch's
          fixed length is known (real compile phase only, see below). */
 4641     reverse_count = code;
 4642     PUTINC(code, 0, 0);
 4643     length += 1 + LINK_SIZE;
 4644     }
 4645 
 4646   /* Now compile the branch; in the pre-compile phase its length gets added
 4647   into the length. */
 4648 
 4649   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
 4650         &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
 4651     {
          /* Hand the position reached in the pattern back to the caller. */
 4652     *ptrptr = ptr;
 4653     return FALSE;
 4654     }
 4655 
 4656   /* In the real compile phase, there is some post-processing to be done. */
 4657 
 4658   if (lengthptr == NULL)
 4659     {
 4660     /* If this is the first branch, the firstbyte and reqbyte values for the
 4661     branch become the values for the regex. */
 4662 
          /* last_branch still points at the group's opening opcode until the first
          OP_ALT is written at the bottom of this loop, so this test picks out the
          first branch. */
 4663     if (*last_branch != OP_ALT)
 4664       {
 4665       firstbyte = branchfirstbyte;
 4666       reqbyte = branchreqbyte;
 4667       }
 4668 
 4669     /* If this is not the first branch, the first char and reqbyte have to
 4670     match the values from all the previous branches, except that if the
 4671     previous value for reqbyte didn't have REQ_VARY set, it can still match,
 4672     and we set REQ_VARY for the regex. */
 4673 
 4674     else
 4675       {
 4676       /* If we previously had a firstbyte, but it doesn't match the new branch,
 4677       we have to abandon the firstbyte for the regex, but if there was
 4678       previously no reqbyte, it takes on the value of the old firstbyte. */
 4679 
 4680       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
 4681         {
 4682         if (reqbyte < 0) reqbyte = firstbyte;
 4683         firstbyte = REQ_NONE;
 4684         }
 4685 
 4686       /* If we (now or from before) have no firstbyte, a firstbyte from the
 4687       branch becomes a reqbyte if there isn't a branch reqbyte. */
 4688 
 4689       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
 4690           branchreqbyte = branchfirstbyte;
 4691 
 4692       /* Now ensure that the reqbytes match */
 4693 
 4694       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
 4695         reqbyte = REQ_NONE;
 4696       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
 4697       }
 4698 
 4699     /* If lookbehind, check that this branch matches a fixed-length string, and
 4700     put the length into the OP_REVERSE item. Temporarily mark the end of the
 4701     branch with OP_END. */
 4702 
 4703     if (lookbehind)
 4704       {
 4705       int fixed_length;
 4706       *code = OP_END;
 4707       fixed_length = find_fixedlength(last_branch, options);
 4708       DPRINTF(("fixed length = %d\n", fixed_length));
 4709       if (fixed_length < 0)
 4710         {
              /* A result of -2 is reported as ERR36; any other negative result
              is reported as ERR25. */
 4711         *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
 4712         *ptrptr = ptr;
 4713         return FALSE;
 4714         }
 4715       PUT(reverse_count, 0, fixed_length);
 4716       }
 4717     }
 4718 
 4719   /* Reached end of expression, either ')' or end of pattern. Go back through
 4720   the alternative branches and reverse the chain of offsets, with the field in
 4721   the BRA item now becoming an offset to the first alternative. If there are
 4722   no alternatives, it points to the end of the group. The length in the
 4723   terminating ket is always the length of the whole bracketed item. If any of
 4724   the ims options were changed inside the group, compile a resetting op-code
 4725   following, except at the very end of the pattern. Return leaving the pointer
 4726   at the terminating char. */
 4727 
 4728   if (*ptr != '|')
 4729     {
 4730     int branch_length = code - last_branch;
          /* Each link field currently holds the distance back to the previous
          branch (zero in the opening bracket, written by PUT(code, 1, 0) above).
          Walk backwards, replacing each with the forward length; the zero read
          from the opening bracket ends the loop. */
 4731     do
 4732       {
 4733       int prev_length = GET(last_branch, 1);
 4734       PUT(last_branch, 1, branch_length);
 4735       branch_length = prev_length;
 4736       last_branch -= branch_length;
 4737       }
 4738     while (branch_length > 0);
 4739 
 4740     /* Fill in the ket */
 4741 
 4742     *code = OP_KET;
 4743     PUT(code, 1, code - start_bracket);
 4744     code += 1 + LINK_SIZE;
 4745 
 4746     /* Resetting option if needed */
 4747 
 4748     if ((options & PCRE_IMS) != oldims && *ptr == ')')
 4749       {
 4750       *code++ = OP_OPT;
 4751       *code++ = oldims;
 4752       length += 2;
 4753       }
 4754 
 4755     /* Set values to pass back */
 4756 
 4757     *codeptr = code;
 4758     *ptrptr = ptr;
 4759     *firstbyteptr = firstbyte;
 4760     *reqbyteptr = reqbyte;
          /* Pre-compile phase only: add this group's accumulated length to the
          caller's running total. */
 4761     if (lengthptr != NULL) *lengthptr += length;
 4762     return TRUE;
 4763     }
 4764 
 4765   /* Another branch follows; insert an "or" node. Its length field points back
 4766   to the previous branch while the bracket remains open. At the end the chain
 4767   is reversed. It's done like this so that the start of the bracket has a
 4768   zero offset until it is closed, making it possible to detect recursion. */
 4769 
 4770   *code = OP_ALT;
 4771   PUT(code, 1, code - last_branch);
 4772   bc.current = last_branch = code;
 4773   code += 1 + LINK_SIZE;
        /* Step over the '|' in the pattern. */
 4774   ptr++;
 4775   length += 1 + LINK_SIZE;
 4776   }
 4777 /* Control never reaches here */
 4778 }
 4779 
 4780 
 4781 
 4782 
 4783 /*************************************************
 4784 *          Check for anchored expression         *
 4785 *************************************************/
 4786 
 4787 /* Try to find out if this is an anchored regular expression. Consider each
 4788 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
 4789 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
 4790 it's anchored. However, if this is a multiline pattern, then only OP_SOD
 4791 counts, since OP_CIRC can match in the middle.
 4792 
 4793 We can also consider a regex to be anchored if OP_SOM starts all its branches.
 4794 This is the code for \G, which means "match at start of match position, taking
 4795 into account the match offset".
 4796 
 4797 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
 4798 because that will try the rest of the pattern at all possible matching points,
 4799 so there is no point trying again.... er ....
 4800 
 4801 .... except when the .* appears inside capturing parentheses, and there is a
 4802 subsequent back reference to those parentheses. We haven't enough information
 4803 to catch that case precisely.
 4804 
 4805 At first, the best we could do was to detect when .* was in capturing brackets
 4806 and the highest back reference was greater than or equal to that level.
 4807 However, by keeping a bitmap of the first 31 back references, we can catch some
 4808 of the more common cases more precisely.
 4809 
 4810 Arguments:
 4811   code           points to start of expression (the bracket)
 4812   options        points to the options setting
 4813   bracket_map    a bitmap of which brackets we are inside while testing; this
 4814                   handles up to substring 31; after that we just have to take
 4815                   the less precise approach
 4816   backref_map    the back reference bitmap
 4817 
 4818 Returns:     TRUE or FALSE
 4819 */
 4820 
 4821 static BOOL
 4822 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
 4823   unsigned int backref_map)
 4824 {
      /* Examine every alternative of the bracket at "code". The group counts as
      anchored only if no alternative fails the tests below; the first failure
      returns FALSE immediately. */
 4825 do {
         /* scode is the item that first_significant_code() selects after the
         bracket (or OP_ALT) header of this alternative. */
 4826    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
 4827      options, PCRE_MULTILINE, FALSE);
 4828    register int op = *scode;
 4829 
 4830    /* Non-capturing brackets */
 4831 
 4832    if (op == OP_BRA)
 4833      {
 4834      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
 4835      }
 4836 
 4837    /* Capturing brackets */
 4838 
 4839    else if (op == OP_CBRA)
 4840      {
 4841      int n = GET2(scode, 1+LINK_SIZE);
           /* Record that we are now inside group n. The shift is done on an
           unsigned operand because n may be 31, and shifting a signed 1 into the
           sign bit is undefined behaviour. Groups numbered 32 or more all share
           bit 0, which gives the less precise check described in the function
           header. */
 4842      unsigned int new_map = bracket_map | ((n < 32)? (1U << n) : 1U);
 4843      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
 4844      }
 4845 
 4846    /* Other brackets */
 4847 
 4848    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
 4849      {
 4850      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
 4851      }
 4852 
 4853    /* .* is not anchored unless DOTALL is set and it isn't in brackets that
 4854    are or may be referenced. */
 4855 
 4856    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
 4857              op == OP_TYPEPOSSTAR) &&
 4858             (*options & PCRE_DOTALL) != 0)
 4859      {
           /* A non-zero intersection means some enclosing capturing group is the
           target of a back reference, so the .* shortcut cannot be trusted. */
 4860      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
 4861      }
 4862 
 4863    /* Check for explicit anchoring */
 4864 
 4865    else if (op != OP_SOD && op != OP_SOM &&
 4866            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
 4867      return FALSE;
         /* Advance by this alternative's link value to the next OP_ALT, if any. */
 4868    code += GET(code, 1);
 4869    }
 4870 while (*code == OP_ALT);   /* Loop for each alternative */
 4871 return TRUE;
 4872 }
 4873 
 4874 
 4875 
 4876 /*************************************************
 4877 *         Check for starting with ^ or .*        *
 4878 *************************************************/
 4879 
 4880 /* This is called to find out if every branch starts with ^ or .* so that
 4881 "first char" processing can be done to speed things up in multiline
 4882 matching and for non-DOTALL patterns that start with .* (which must start at
 4883 the beginning or after \n). As in the case of is_anchored() (see above), we
 4884 have to take account of back references to capturing brackets that contain .*
 4885 because in that case we can't make the assumption.
 4886 
 4887 Arguments:
 4888   code           points to start of expression (the bracket)
 4889   bracket_map    a bitmap of which brackets we are inside while testing; this
 4890                   handles up to substring 31; after that we just have to take
 4891                   the less precise approach
 4892   backref_map    the back reference bitmap
 4893 
 4894 Returns:         TRUE or FALSE
 4895 */
 4896 
 4897 static BOOL
 4898 is_startline(const uschar *code, unsigned int bracket_map,
 4899   unsigned int backref_map)
 4900 {
      /* Examine every alternative of the bracket at "code"; return FALSE as soon
      as one of them starts with something other than a circumflex, an acceptable
      .*, or a bracket whose own alternatives all pass the same tests. */
 4901 do {
         /* scode is the item that first_significant_code() selects after the
         bracket (or OP_ALT) header of this alternative. */
 4902    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
 4903      NULL, 0, FALSE);
 4904    register int op = *scode;
 4905 
 4906    /* Non-capturing brackets */
 4907 
 4908    if (op == OP_BRA)
 4909      {
 4910      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
 4911      }
 4912 
 4913    /* Capturing brackets */
 4914 
 4915    else if (op == OP_CBRA)
 4916      {
 4917      int n = GET2(scode, 1+LINK_SIZE);
           /* Record that we are now inside group n. The shift is done on an
           unsigned operand because n may be 31, and shifting a signed 1 into the
           sign bit is undefined behaviour. Groups numbered 32 or more all share
           bit 0, which gives the less precise check described in the function
           header. */
 4918      unsigned int new_map = bracket_map | ((n < 32)? (1U << n) : 1U);
 4919      if (!is_startline(scode, new_map, backref_map)) return FALSE;
 4920      }
 4921 
 4922    /* Other brackets */
 4923 
 4924    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
 4925      { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
 4926 
 4927    /* .* means "start at start or after \n" if it isn't in brackets that
 4928    may be referenced. */
 4929 
 4930    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
 4931      {
           /* A non-zero intersection means some enclosing capturing group is the
           target of a back reference, so the .* shortcut cannot be trusted. */
 4932      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
 4933      }
 4934 
 4935    /* Check for explicit circumflex */
 4936 
 4937    else if (op != OP_CIRC) return FALSE;
 4938 
 4939    /* Move on to the next alternative */
 4940 
 4941    code += GET(code, 1);
 4942    }
 4943 while (*code == OP_ALT);  /* Loop for each alternative */
 4944 return TRUE;
 4945 }
 4946 
 4947 
 4948 
 4949 /*************************************************
 4950 *       Check for asserted fixed first char      *
 4951 *************************************************/
 4952 
 4953 /* During compilation, the "first char" settings from forward assertions are
 4954 discarded, because they can cause conflicts with actual literals that follow.
 4955 However, if we end up without a first char setting for an unanchored pattern,
 4956 it is worth scanning the regex to see if there is an initial asserted first
 4957 char. If all branches start with the same asserted char, or with a bracket all
 4958 of whose alternatives start with the same asserted char (recurse ad lib), then
 4959 we return that char, otherwise -1.
 4960 
 4961 Arguments:
 4962   code       points to start of expression (the bracket)
 4963   options    pointer to the options (used to check casing changes)
 4964   inassert   TRUE if in an assertion
 4965 
 4966 Returns:     -1 or the fixed first char
 4967 */
 4968 
 4969 static int
 4970 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
 4971 {
      /* c accumulates the candidate first char (with REQ_CASELESS or'ed in when
      it is matched caselessly); -1 means "none seen yet". Every alternative must
      yield exactly the same value, otherwise the result is -1. */
 4972 register int c = -1;
 4973 do {
 4974    int d;
         /* first_significant_code() is given PCRE_CASELESS so that *options is
         expected to track case changes on the way to scode - see that function
         for the exact rules. */
 4975    const uschar *scode =
 4976      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
 4977    register int op = *scode;
 4978 
 4979    switch(op)
 4980      {
 4981      default:
 4982      return -1;
 4983 
           /* Any kind of group: recurse, noting whether we are entering a
           forward assertion, and require its result to agree with what earlier
           alternatives produced. */
 4984      case OP_BRA:
 4985      case OP_CBRA:
 4986      case OP_ASSERT:
 4987      case OP_ONCE:
 4988      case OP_COND:
 4989      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
 4990        return -1;
 4991      if (c < 0) c = d; else if (c != d) return -1;
 4992      break;
 4993 
 4994      case OP_EXACT:       /* Fall through */
 4995      scode += 2;
           /* The adjustment above leaves the character at scode[1], the same
           place as for the opcodes below, before falling through. */
 4996 
 4997      case OP_CHAR:
 4998      case OP_CHARNC:
 4999      case OP_PLUS:
 5000      case OP_MINPLUS:
 5001      case OP_POSPLUS:
 5002      if (!inassert) return -1;
           /* Build the flagged value for this alternative before comparing, just
           as the bracket case above compares complete values. Comparing c with
           the bare byte would accept a caseful alternative followed by a caseless
           one of the same character, and would reject two identical caseless
           alternatives. */
 5005      d = scode[1];
 5006      if ((*options & PCRE_CASELESS) != 0) d |= REQ_CASELESS;
 5008      if (c < 0) c = d; else if (c != d) return -1;
 5009      break;
 5010      }
 5011 
         /* Advance by this alternative's link value to the next OP_ALT, if any. */
 5012    code += GET(code, 1);
 5013    }
 5014 while (*code == OP_ALT);
 5015 return c;
 5016 }
 5017 
 5018 
 5019 
 5020 /*************************************************
 5021 *        Compile a Regular Expression            *
 5022 *************************************************/
 5023 
 5024 /* This function takes a string and returns a pointer to a block of store
 5025 holding a compiled version of the expression. The original API for this
 5026 function had no error code return variable; it is retained for backwards
 5027 compatibility. The new function is given a new name.
 5028 
 5029 Arguments:
 5030   pattern       the regular expression
 5031   options       various option bits
 5032   errorcodeptr  pointer to error code variable (pcre_compile2() only)
 5033                   can be NULL if you don't want a code value
 5034   errorptr      pointer to pointer to error text
 5035   erroroffset   ptr offset in pattern where error was detected
 5036   tables        pointer to character tables or NULL
 5037 
 5038 Returns:        pointer to compiled data block, or NULL on error,
 5039                 with errorptr and erroroffset set
 5040 */
 5041 
 5042 PCRE_DATA_SCOPE pcre *
 5043 pcre_compile(const char *pattern, int options, const char **errorptr,
 5044   int *erroroffset, const unsigned char *tables)
 5045 {
      /* This older entry point has no error-code variable, so NULL is supplied in
      that position and the rest of the arguments are forwarded unchanged. */
      pcre *compiled_re;
      compiled_re = pcre_compile2(pattern, options, NULL, errorptr, erroroffset,
        tables);
 5046 return compiled_re;
 5047 }
 5048 
 5049 
 5050 PCRE_DATA_SCOPE pcre *
 5051 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
 5052   const char **errorptr, int *erroroffset, const unsigned char *tables)
 5053 {
 5054 real_pcre *re;
 5055 int length = 1;  /* For final END opcode */
 5056 int firstbyte, reqbyte, newline;
 5057 int errorcode = 0;
 5058 #ifdef SUPPORT_UTF8
 5059 BOOL utf8;
 5060 #endif
 5061 size_t size;
 5062 uschar *code;
 5063 const uschar *codestart;
 5064 const uschar *ptr;
 5065 compile_data compile_block;
 5066 compile_data *cd = &compile_block;
 5067 
 5068 /* This space is used for "compiling" into during the first phase, when we are
 5069 computing the amount of memory that is needed. Compiled items are thrown away
 5070 as soon as possible, so that a fairly large buffer should be sufficient for
 5071 this purpose. The same space is used in the second phase for remembering where
 5072 to fill in forward references to subpatterns. */
 5073 
 5074 uschar cworkspace[COMPILE_WORK_SIZE];
 5075 
 5076 
 5077 /* Set this early so that early errors get offset 0. */
 5078 
 5079 ptr = (const uschar *)pattern;
 5080 
 5081 /* We can't pass back an error message if errorptr is NULL; I guess the best we
 5082 can do is just return NULL, but we can set a code value if there is a code
 5083 pointer. */
 5084 
 5085 if (errorptr == NULL)
 5086   {
 5087   if (errorcodeptr != NULL) *errorcodeptr = 99;
 5088   return NULL;
 5089   }
 5090 
 5091 *errorptr = NULL;
 5092 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
 5093 
 5094 /* However, we can give a message for this error */
 5095 
 5096 if (erroroffset == NULL)
 5097   {
 5098   errorcode = ERR16;
 5099   goto PCRE_EARLY_ERROR_RETURN;
 5100   }
 5101 
 5102 *erroroffset = 0;
 5103 
 5104 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
 5105 
 5106 #ifdef SUPPORT_UTF8
 5107 utf8 = (options & PCRE_UTF8) != 0;
 5108 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
 5109      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
 5110   {
 5111   errorcode = ERR44;
 5112   goto PCRE_UTF8_ERROR_RETURN;
 5113   }
 5114 #else
 5115 if ((options & PCRE_UTF8) != 0)
 5116   {
 5117   errorcode = ERR32;
 5118   goto PCRE_EARLY_ERROR_RETURN;
 5119   }
 5120 #endif
 5121 
 5122 if ((options & ~PUBLIC_OPTIONS) != 0)
 5123   {
 5124   errorcode = ERR17;
 5125   goto PCRE_EARLY_ERROR_RETURN;
 5126   }
 5127 
 5128 /* Set up pointers to the individual character tables */
 5129 
 5130 if (tables == NULL) tables = _pcre_default_tables;
 5131 cd->lcc = tables + lcc_offset;
 5132 cd->fcc = tables + fcc_offset;
 5133 cd->cbits = tables + cbits_offset;
 5134 cd->ctypes = tables + ctypes_offset;
 5135 
 5136 /* Handle different types of newline. The three bits give seven cases. The
 5137 current code allows for fixed one- or two-byte sequences, plus "any". */
 5138 
 5139 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
 5140   {
 5141   case 0: newline = NEWLINE; break;   /* Compile-time default */
 5142   case PCRE_NEWLINE_CR: newline = '\r'; break;
 5143   case PCRE_NEWLINE_LF: newline = '\n'; break;
 5144   case PCRE_NEWLINE_CR+
 5145        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
 5146   case PCRE_NEWLINE_ANY: newline = -1; break;
 5147   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
 5148   }
 5149 
 5150 if (newline < 0)
 5151   {
 5152   cd->nltype = NLTYPE_ANY;
 5153   }
 5154 else
 5155   {
 5156   cd->nltype = NLTYPE_FIXED;
 5157   if (newline > 255)
 5158     {
 5159     cd->nllen = 2;
 5160     cd->nl[0] = (newline >> 8) & 255;
 5161     cd->nl[1] = newline & 255;
 5162     }
 5163   else
 5164     {
 5165     cd->nllen = 1;
 5166     cd->nl[0] = newline;
 5167     }
 5168   }
 5169 
 5170 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
 5171 references to help in deciding whether (.*) can be treated as anchored or not.
 5172 */
 5173 
 5174 cd->top_backref = 0;
 5175 cd->backref_map = 0;
 5176 
 5177 /* Reflect pattern for debugging output */
 5178 
 5179 DPRINTF(("------------------------------------------------------------------\n"));
 5180 DPRINTF(("%s\n", pattern));
 5181 
 5182 /* Pretend to compile the pattern while actually just accumulating the length
 5183 of memory required. This behaviour is triggered by passing a non-NULL final
 5184 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
 5185 to compile parts of the pattern into; the compiled code is discarded when it is
 5186 no longer needed, so hopefully this workspace will never overflow, though there
 5187 is a test for its doing so. */
 5188 
 5189 cd->bracount = 0;
 5190 cd->names_found = 0;
 5191 cd->name_entry_size = 0;
 5192 cd->name_table = NULL;
 5193 cd->start_workspace = cworkspace;
 5194 cd->start_code = cworkspace;
 5195 cd->hwm = cworkspace;
 5196 cd->start_pattern = (const uschar *)pattern;
 5197 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
 5198 cd->req_varyopt = 0;
 5199 cd->nopartial = FALSE;
 5200 cd->external_options = options;
 5201 
 5202 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
 5203 don't need to look at the result of the function here. The initial options have
 5204 been put into the cd block so that they can be changed if an option setting is
 5205 found within the regex right at the beginning. Bringing initial option settings
 5206 outside can help speed up starting point checks. */
 5207 
 5208 code = cworkspace;
 5209 *code = OP_BRA;
 5210 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
 5211   &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
 5212 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
 5213 
 5214 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
 5215   cd->hwm - cworkspace));
 5216 
 5217 if (length > MAX_PATTERN_SIZE)
 5218   {
 5219   errorcode = ERR20;
 5220   goto PCRE_EARLY_ERROR_RETURN;
 5221   }
 5222 
 5223 /* Compute the size of data block needed and get it, either from malloc or
 5224 externally provided function. Integer overflow should no longer be possible
 5225 because nowadays we limit the maximum value of cd->names_found and
 5226 cd->name_entry_size. */
 5227 
 5228 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
 5229 re = (real_pcre *)(pcre_malloc)(size);
 5230 
 5231 if (re == NULL)
 5232   {
 5233   errorcode = ERR21;
 5234   goto PCRE_EARLY_ERROR_RETURN;
 5235   }
 5236 
 5237 /* Put in the magic number, and save the sizes, initial options, and character
 5238 table pointer. NULL is used for the default character tables. The nullpad field
 5239 is at the end; it's there to help in the case when a regex compiled on a system
 5240 with 4-byte pointers is run on another with 8-byte pointers. */
 5241 
 5242 re->magic_number = MAGIC_NUMBER;
 5243 re->size = size;
 5244 re->options = cd->external_options;
 5245 re->dummy1 = 0;
 5246 re->first_byte = 0;
 5247 re->req_byte = 0;
 5248 re->name_table_offset = sizeof(real_pcre);
 5249 re->name_entry_size = cd->name_entry_size;
 5250 re->name_count = cd->names_found;
 5251 re->ref_count = 0;
 5252 re->tables = (tables == _pcre_default_tables)? NULL : tables;
 5253 re->nullpad = NULL;
 5254 
 5255 /* The starting points of the name/number translation table and of the code are
 5256 passed around in the compile data block. The start/end pattern and initial
 5257 options are already set from the pre-compile phase, as is the name_entry_size
 5258 field. Reset the bracket count and the names_found field. Also reset the hwm
 5259 field; this time it's used for remembering forward references to subpatterns.
 5260 */
 5261 
 5262 cd->bracount = 0;
 5263 cd->names_found = 0;
 5264 cd->name_table = (uschar *)re + re->name_table_offset;
 5265 codestart = cd->name_table + re->name_entry_size * re->name_count;
 5266 cd->start_code = codestart;
 5267 cd->hwm = cworkspace;
 5268 cd->req_varyopt = 0;
 5269 cd->nopartial = FALSE;
 5270 
 5271 /* Set up a starting, non-extracting bracket, then compile the expression. On
 5272 error, errorcode will be set non-zero, so we don't need to look at the result
 5273 of the function here. */
 5274 
 5275 ptr = (const uschar *)pattern;
 5276 code = (uschar *)codestart;
 5277 *code = OP_BRA;
 5278 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
 5279   &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
 5280 re->top_bracket = cd->bracount;
 5281 re->top_backref = cd->top_backref;
 5282 
 5283 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
 5284 
 5285 /* If not reached end of pattern on success, there's an excess bracket. */
 5286 
 5287 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
 5288 
 5289 /* Fill in the terminating state and check for disastrous overflow, but
 5290 if debugging, leave the test till after things are printed out. */
 5291 
 5292 *code++ = OP_END;
 5293 
 5294 #ifndef DEBUG
 5295 if (code - codestart > length) errorcode = ERR23;
 5296 #endif
 5297 
 5298 /* Fill in any forward references that are required. */
 5299 
 5300 while (errorcode == 0 && cd->hwm > cworkspace)
 5301   {
 5302   int offset, recno;
 5303   const uschar *groupptr;
 5304   cd->hwm -= LINK_SIZE;
 5305   offset = GET(cd->hwm, 0);
 5306   recno = GET(codestart, offset);
 5307   groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
 5308   if (groupptr == NULL) errorcode = ERR53;
 5309     else PUT(((uschar *)codestart), offset, groupptr - codestart);
 5310   }
 5311 
 5312 /* Give an error if there's back reference to a non-existent capturing
 5313 subpattern. */
 5314 
 5315 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
 5316 
 5317 /* Failed to compile, or error while post-processing */
 5318 
 5319 if (errorcode != 0)
 5320   {
 5321   (pcre_free)(re);
 5322   PCRE_EARLY_ERROR_RETURN:
 5323   *erroroffset = ptr - (const uschar *)pattern;
 5324 #ifdef SUPPORT_UTF8
 5325   PCRE_UTF8_ERROR_RETURN:
 5326 #endif
 5327   *errorptr = error_texts[errorcode];
 5328   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
 5329   return NULL;
 5330   }
 5331 
 5332 /* If the anchored option was not passed, set the flag if we can determine that
 5333 the pattern is anchored by virtue of ^ characters or \A or anything else (such
 5334 as starting with .* when DOTALL is set).
 5335 
 5336 Otherwise, if we know what the first byte has to be, save it, because that
 5337 speeds up unanchored matches no end. If not, see if we can set the
 5338 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
 5339 start with ^. and also when all branches start with .* for non-DOTALL matches.
 5340 */
 5341 
 5342 if ((re->options & PCRE_ANCHORED) == 0)
 5343   {
 5344   int temp_options = re->options;   /* May get changed during these scans */
 5345   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
 5346     re->options |= PCRE_ANCHORED;
 5347   else
 5348     {
 5349     if (firstbyte < 0)
 5350       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
 5351     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
 5352       {
 5353       int ch = firstbyte & 255;
 5354       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
 5355          cd->fcc[ch] == ch)? ch : firstbyte;
 5356       re->options |= PCRE_FIRSTSET;
 5357       }
 5358     else if (is_startline(codestart, 0, cd->backref_map))
 5359       re->options |= PCRE_STARTLINE;
 5360     }
 5361   }
 5362 
 5363 /* For an anchored pattern, we use the "required byte" only if it follows a
 5364 variable length item in the regex. Remove the caseless flag for non-caseable
 5365 bytes. */
 5366 
 5367 if (reqbyte >= 0 &&
 5368      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
 5369   {
 5370   int ch = reqbyte & 255;
 5371   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
 5372     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
 5373   re->options |= PCRE_REQCHSET;
 5374   }
 5375 
 5376 /* Print out the compiled data if debugging is enabled. This is never the
 5377 case when building a production library. */
 5378 
 5379 #ifdef DEBUG
 5380 
 5381 printf("Length = %d top_bracket = %d top_backref = %d\n",
 5382   length, re->top_bracket, re->top_backref);
 5383 
 5384 if (re->options != 0)
 5385   {
 5386   printf("%s%s%s%s%s%s%s%s%s\n",
 5387     ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
 5388     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
 5389     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
 5390     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
 5391     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
 5392     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
 5393     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
 5394     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
 5395     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
 5396   }
 5397 
 5398 if ((re->options & PCRE_FIRSTSET) != 0)
 5399   {
 5400   int ch = re->first_byte & 255;
 5401   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
 5402     "" : " (caseless)";
 5403   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
 5404     else printf("First char = \\x%02x%s\n", ch, caseless);
 5405   }
 5406 
 5407 if ((re->options & PCRE_REQCHSET) != 0)
 5408   {
 5409   int ch = re->req_byte & 255;
 5410   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
 5411     "" : " (caseless)";
 5412   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
 5413     else printf("Req char = \\x%02x%s\n", ch, caseless);
 5414   }
 5415 
 5416 pcre_printint(re, stdout);
 5417 
 5418 /* This check is done here in the debugging case so that the code that
 5419 was compiled can be seen. */
 5420 
 5421 if (code - codestart > length)
 5422   {
 5423   (pcre_free)(re);
 5424   *errorptr = error_texts[ERR23];
 5425   *erroroffset = ptr - (uschar *)pattern;
 5426   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
 5427   return NULL;
 5428   }
 5429 #endif   /* DEBUG */
 5430 
 5431 return (pcre *)re;
 5432 }
 5433 
 5434 /* End of pcre_compile.c */