tin  2.4.5
About: TIN is a threaded NNTP and spool based UseNet newsreader.
  Fossies Dox: tin-2.4.5.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

pcre_compile.c
Go to the documentation of this file.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8  Written by Philip Hazel
9  Copyright (c) 1997-2006 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15  * Redistributions of source code must retain the above copyright notice,
16  this list of conditions and the following disclaimer.
17 
18  * Redistributions in binary form must reproduce the above copyright
19  notice, this list of conditions and the following disclaimer in the
20  documentation and/or other materials provided with the distribution.
21 
22  * Neither the name of the University of Cambridge nor the names of its
23  contributors may be used to endorse or promote products derived from
24  this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43 
44 
/* Names that must be defined before pcre_internal.h is included below.
(Presumably they are consumed by macros defined in that header -- confirm
there.) */

/* Block containing newline information */
#define NLBLOCK cd

/* Field containing processed string start */
#define PSSTART start_pattern

/* Field containing processed string end */
#define PSEND end_pattern
48 
49 
50 #include "pcre_internal.h"
51 
52 
53 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54 used by pcretest. DEBUG is not defined when building a production library. */
55 
56 #ifdef DEBUG
57 #include "pcre_printint.src"
58 #endif
59 
60 
61 /*************************************************
62 * Code parameters and static tables *
63 *************************************************/
64 
/* Size of the stack workspace used by the compiler.

First (pre-compile) phase: the pattern is partly compiled into this space in
order to work out how much memory is required. Compiled parts are discarded as
soon as possible, so an overrun is not expected, but the code checks for one
anyway. The largest amount observed in use is 218, so the figure below is very
generous.

Second (real compile) phase: the same workspace remembers forward references to
groups so that they can be filled in at the end. Each entry in that list takes
LINK_SIZE bytes, so there is plenty of room even when LINK_SIZE is 4. */

#define COMPILE_WORK_SIZE (4096)
78 
79 
80 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
81 are simple data values; negative values are for special things like \d and so
82 on. Zero means further processing is needed (for things like \x), or the escape
83 is invalid. */
84 
85 #if !EBCDIC /* This is the "normal" table for ASCII systems */
86 static const short int escapes[] = {
87  0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88  0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89  '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
90  0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
91 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93  '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
94  0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
95 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
96  0, 0, -ESC_z /* x - z */
97 };
98 
99 #else /* This is the "abnormal" table for EBCDIC systems */
100 static const short int escapes[] = {
101 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
102 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
103 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
104 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
105 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
106 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
107 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
108 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
109 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
110 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
111 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
112 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
113 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
114 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
115 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
116 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
117 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
118 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
119 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
120 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
121 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
122 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
124 };
125 #endif
126 
127 
/* Names of the POSIX character classes. The matching lengths live in
posix_name_lengths, whose list ends with a zero-length entry. The order
matters: alpha, lower and upper must be the first three, because the handling
of case independence assumes it. */

static const char *const posix_names[] = {
  "alpha",
  "lower",
  "upper",
  "alnum",
  "ascii",
  "blank",
  "cntrl",
  "digit",
  "graph",
  "print",
  "punct",
  "space",
  "word",
  "xdigit"
};
136 
137 static const uschar posix_name_lengths[] = {
138  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139 
140 /* Table of class bit maps for each POSIX class. Each class is formed from a
141 base map, with an optional addition or removal of another map. Then, for some
142 classes, there is some additional tweaking: for [:blank:] the vertical space
143 characters are removed, and for [:alpha:] and [:alnum:] the underscore
144 character is removed. The triples in the table consist of the base map offset,
145 second map offset or -1 if no second map, and a non-negative value for map
146 addition or a negative value for map subtraction (if there are two maps). The
147 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148 remove vertical space characters, 2 => remove underscore. */
149 
150 static const int posix_class_maps[] = {
151  cbit_word, cbit_digit, -2, /* alpha */
152  cbit_lower, -1, 0, /* lower */
153  cbit_upper, -1, 0, /* upper */
154  cbit_word, -1, 2, /* alnum - word without underscore */
155  cbit_print, cbit_cntrl, 0, /* ascii */
156  cbit_space, -1, 1, /* blank - a GNU extension */
157  cbit_cntrl, -1, 0, /* cntrl */
158  cbit_digit, -1, 0, /* digit */
159  cbit_graph, -1, 0, /* graph */
160  cbit_print, -1, 0, /* print */
161  cbit_punct, -1, 0, /* punct */
162  cbit_space, -1, 0, /* space */
163  cbit_word, -1, 0, /* word - a Perl extension */
164  cbit_xdigit,-1, 0 /* xdigit */
165 };
166 
167 
/* Two-level stringification: XSTRING(x) expands x first and then turns the
result into a string literal, so numeric limits can be embedded in messages. */

#define STRING(a) # a
#define XSTRING(s) STRING(s)

/* Compile-time error messages, indexed by error number. They are "char *"
because they are handed to the outside world. Error numbers are documented, so
a number must never be re-used: always append a new message instead. Entries
marked DEAD are no longer used but keep their slot. */

static const char *error_texts[] = {
  /*  0 */ "no error",
  /*  1 */ "\\ at end of pattern",
  /*  2 */ "\\c at end of pattern",
  /*  3 */ "unrecognized character follows \\",
  /*  4 */ "numbers out of order in {} quantifier",
  /*  5 */ "number too big in {} quantifier",
  /*  6 */ "missing terminating ] for character class",
  /*  7 */ "invalid escape sequence in character class",
  /*  8 */ "range out of order in character class",
  /*  9 */ "nothing to repeat",
  /* 10 */ "operand of unlimited repeat could match the empty string",  /** DEAD **/
  /* 11 */ "internal error: unexpected repeat",
  /* 12 */ "unrecognized character after (?",
  /* 13 */ "POSIX named classes are supported only within a class",
  /* 14 */ "missing )",
  /* 15 */ "reference to non-existent subpattern",
  /* 16 */ "erroffset passed as NULL",
  /* 17 */ "unknown option bit(s) set",
  /* 18 */ "missing ) after comment",
  /* 19 */ "parentheses nested too deeply",  /** DEAD **/
  /* 20 */ "regular expression too large",
  /* 21 */ "failed to get memory",
  /* 22 */ "unmatched parentheses",
  /* 23 */ "internal error: code overflow",
  /* 24 */ "unrecognized character after (?<",
  /* 25 */ "lookbehind assertion is not fixed length",
  /* 26 */ "malformed number or name after (?(",
  /* 27 */ "conditional group contains more than two branches",
  /* 28 */ "assertion expected after (?(",
  /* 29 */ "(?R or (?digits must be followed by )",
  /* 30 */ "unknown POSIX class name",
  /* 31 */ "POSIX collating elements are not supported",
  /* 32 */ "this version of PCRE is not compiled with PCRE_UTF8 support",
  /* 33 */ "spare error",  /** DEAD **/
  /* 34 */ "character value in \\x{...} sequence is too large",
  /* 35 */ "invalid condition (?(0)",
  /* 36 */ "\\C not allowed in lookbehind assertion",
  /* 37 */ "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  /* 38 */ "number after (?C is > 255",
  /* 39 */ "closing ) for (?C expected",
  /* 40 */ "recursive call could loop indefinitely",
  /* 41 */ "unrecognized character after (?P",
  /* 42 */ "syntax error in subpattern name (missing terminator)",
  /* 43 */ "two named subpatterns have the same name",
  /* 44 */ "invalid UTF-8 string",
  /* 45 */ "support for \\P, \\p, and \\X has not been compiled",
  /* 46 */ "malformed \\P or \\p sequence",
  /* 47 */ "unknown property name after \\P or \\p",
  /* 48 */ "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  /* 49 */ "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */ "repeated subpattern is too long",
  /* 51 */ "octal value is greater than \\377 (not in UTF-8 mode)",
  /* 52 */ "internal error: overran compiling workspace",
  /* 53 */ "internal error: previously-checked referenced subpattern not found",
  /* 54 */ "DEFINE group contains more than one branch",
  /* 55 */ "repeating a DEFINE group is not allowed",
  /* 56 */ "inconsistent NEWLINE options",
  /* 57 */ "\\g is not followed by an (optionally braced) non-zero number"
};
247 
248 
/* Private table that identifies decimal and hexadecimal digits while a
pattern is being compiled. The tables in chartables depend on the locale and
may mark arbitrary characters as digits, whereas the compiling code expects
only 0-9, a-z and A-Z to be handled as digits; hence this separate table. It
costs 256 bytes, but was measured to be a lot faster than character value
tests (at least in some simple cases), and some applications care about
compile speed as well as match speed.

The bit values are the same as in chartables, so ctype_digit and ctype_xdigit
can be used with it:

  0x04    decimal digit
  0x08    hexadecimal digit

Each row below holds eight entries; the label is the code of the first one. */

#if !EBCDIC

/* ASCII systems: the "normal" case */

static const unsigned char digitab[] =
  {
  /* 0x00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x08 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* '0' to '7' */
  /* 0x38 */ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,   /* '8', '9' */
  /* 0x40 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 'A' to 'F' */
  /* 0x48 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x58 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x60 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 'a' to 'f' */
  /* 0x68 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x88 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x98 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xC8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };

#else

/* EBCDIC systems: the "abnormal" case */

static const unsigned char digitab[] =
  {
  /* 0x00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x08 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x38 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x48 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x58 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x68 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,
  /* 0x88 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x98 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,
  /* 0xC8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,
  /* 0xF8 */ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00
  };

/* Partial duplicate of the character table, for EBCDIC only */

static const unsigned char ebcdic_chartab[] =
  {
  /* 0x00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 0x08 */ 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 0x18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 0x28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x38 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x48 */ 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x58 */ 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 0x60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x68 */ 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,
  /* 0x88 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 0x98 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 0xA8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB8 */ 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,
  /* 0xC8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 0xD8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 0xE8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,
  /* 0xF8 */ 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
  };

#endif
371 
372 
373 /* Definition to allow mutual recursion */
374 
375 static BOOL
376  compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
377  int *, branch_chain *, compile_data *, int *);
378 
379 
380 
381 /*************************************************
382 * Handle escapes *
383 *************************************************/
384 
385 /* This function is called when a \ has been encountered. It either returns a
386 positive value for a simple escape such as \n, or a negative value which
387 encodes one of the more complicated things such as \d. A backreference to group
388 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
389 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
390 ptr is pointing at the \. On exit, it is on the final character of the escape
391 sequence.
392 
393 Arguments:
394  ptrptr points to the pattern position pointer
395  errorcodeptr points to the errorcode variable
396  bracount number of previous extracting brackets
397  options the options bits
398  isclass TRUE if inside a character class
399 
400 Returns: zero or positive => a data character
401  negative => a special escape sequence
402  on error, errorptr is set
403 */
404 
405 static int
406 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
407  int options, BOOL isclass)
408 {
409 BOOL utf8 = (options & PCRE_UTF8) != 0;
410 const uschar *ptr = *ptrptr + 1;
411 int c, i;
412 
413 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
414 ptr--; /* Set pointer back to the last byte */
415 
416 /* If backslash is at the end of the pattern, it's an error. */
417 
418 if (c == 0) *errorcodeptr = ERR1;
419 
420 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
421 a table. A non-zero result is something that can be returned immediately.
422 Otherwise further processing may be required. */
423 
424 #if !EBCDIC /* ASCII coding */
425 else if (c < '0' || c > 'z') {} /* Not alphameric */
426 else if ((i = escapes[c - '0']) != 0) c = i;
427 
428 #else /* EBCDIC coding */
429 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
430 else if ((i = escapes[c - 0x48]) != 0) c = i;
431 #endif
432 
433 /* Escapes that need further processing, or are illegal. */
434 
435 else
436  {
437  const uschar *oldptr;
438  BOOL braced, negated;
439 
440  switch (c)
441  {
442  /* A number of Perl escapes are not handled by PCRE. We give an explicit
443  error. */
444 
445  case 'l':
446  case 'L':
447  case 'N':
448  case 'u':
449  case 'U':
450  *errorcodeptr = ERR37;
451  break;
452 
453  /* \g must be followed by a number, either plain or braced. If positive, it
454  is an absolute backreference. If negative, it is a relative backreference.
455  This is a Perl 5.10 feature. */
456 
457  case 'g':
458  if (ptr[1] == '{')
459  {
460  braced = TRUE;
461  ptr++;
462  }
463  else braced = FALSE;
464 
465  if (ptr[1] == '-')
466  {
467  negated = TRUE;
468  ptr++;
469  }
470  else negated = FALSE;
471 
472  c = 0;
473  while ((digitab[ptr[1]] & ctype_digit) != 0)
474  c = c * 10 + *(++ptr) - '0';
475 
476  if (c == 0 || (braced && *(++ptr) != '}'))
477  {
478  *errorcodeptr = ERR57;
479  return 0;
480  }
481 
482  if (negated)
483  {
484  if (c > bracount)
485  {
486  *errorcodeptr = ERR15;
487  return 0;
488  }
489  c = bracount - (c - 1);
490  }
491 
492  c = -(ESC_REF + c);
493  break;
494 
495  /* The handling of escape sequences consisting of a string of digits
496  starting with one that is not zero is not straightforward. By experiment,
497  the way Perl works seems to be as follows:
498 
499  Outside a character class, the digits are read as a decimal number. If the
500  number is less than 10, or if there are that many previous extracting
501  left brackets, then it is a back reference. Otherwise, up to three octal
502  digits are read to form an escaped byte. Thus \123 is likely to be octal
503  123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
504  value is greater than 377, the least significant 8 bits are taken. Inside a
505  character class, \ followed by a digit is always an octal number. */
506 
507  case '1': case '2': case '3': case '4': case '5':
508  case '6': case '7': case '8': case '9':
509 
510  if (!isclass)
511  {
512  oldptr = ptr;
513  c -= '0';
514  while ((digitab[ptr[1]] & ctype_digit) != 0)
515  c = c * 10 + *(++ptr) - '0';
516  if (c < 10 || c <= bracount)
517  {
518  c = -(ESC_REF + c);
519  break;
520  }
521  ptr = oldptr; /* Put the pointer back and fall through */
522  }
523 
524  /* Handle an octal number following \. If the first digit is 8 or 9, Perl
525  generates a binary zero byte and treats the digit as a following literal.
526  Thus we have to pull back the pointer by one. */
527 
528  if ((c = *ptr) >= '8')
529  {
530  ptr--;
531  c = 0;
532  break;
533  }
534 
535  /* \0 always starts an octal number, but we may drop through to here with a
536  larger first octal digit. The original code used just to take the least
537  significant 8 bits of octal numbers (I think this is what early Perls used
538  to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
539  than 3 octal digits. */
540 
541  case '0':
542  c -= '0';
543  while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
544  c = c * 8 + *(++ptr) - '0';
545  if (!utf8 && c > 255) *errorcodeptr = ERR51;
546  break;
547 
548  /* \x is complicated. \x{ddd} is a character number which can be greater
549  than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
550  treated as a data character. */
551 
552  case 'x':
553  if (ptr[1] == '{')
554  {
555  const uschar *pt = ptr + 2;
556  int count = 0;
557 
558  c = 0;
559  while ((digitab[*pt] & ctype_xdigit) != 0)
560  {
561  register int cc = *pt++;
562  if (c == 0 && cc == '0') continue; /* Leading zeroes */
563  count++;
564 
565 #if !EBCDIC /* ASCII coding */
566  if (cc >= 'a') cc -= 32; /* Convert to upper case */
567  c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
568 #else /* EBCDIC coding */
569  if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
570  c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
571 #endif
572  }
573 
574  if (*pt == '}')
575  {
576  if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
577  ptr = pt;
578  break;
579  }
580 
581  /* If the sequence of hex digits does not end with '}', then we don't
582  recognize this construct; fall through to the normal \x handling. */
583  }
584 
585  /* Read just a single-byte hex-defined char */
586 
587  c = 0;
588  while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
589  {
590  int cc; /* Some compilers don't like ++ */
591  cc = *(++ptr); /* in initializers */
592 #if !EBCDIC /* ASCII coding */
593  if (cc >= 'a') cc -= 32; /* Convert to upper case */
594  c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
595 #else /* EBCDIC coding */
596  if (cc <= 'z') cc += 64; /* Convert to upper case */
597  c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
598 #endif
599  }
600  break;
601 
602  /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
603  This coding is ASCII-specific, but then the whole concept of \cx is
604  ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
605 
606  case 'c':
607  c = *(++ptr);
608  if (c == 0)
609  {
610  *errorcodeptr = ERR2;
611  return 0;
612  }
613 
614 #if !EBCDIC /* ASCII coding */
615  if (c >= 'a' && c <= 'z') c -= 32;
616  c ^= 0x40;
617 #else /* EBCDIC coding */
618  if (c >= 'a' && c <= 'z') c += 64;
619  c ^= 0xC0;
620 #endif
621  break;
622 
623  /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
624  other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
625  for Perl compatibility, it is a literal. This code looks a bit odd, but
626  there used to be some cases other than the default, and there may be again
627  in future, so I haven't "optimized" it. */
628 
629  default:
630  if ((options & PCRE_EXTRA) != 0) switch(c)
631  {
632  default:
633  *errorcodeptr = ERR3;
634  break;
635  }
636  break;
637  }
638  }
639 
640 *ptrptr = ptr;
641 return c;
642 }
643 
644 
645 
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up. This is
compiled only when PCRE has Unicode property support. The name is either a
single character, or a string in braces that may start with ^ to request
negation.

On entry *ptrptr points at the P or p. On exit it points at the final
character of the escape sequence (or at the place where parsing stopped, when
the sequence is malformed).

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from ucp_type_table, or -1 for an invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;
BOOL malformed = FALSE;

ch = *(++ptr);

if (ch == 0) malformed = TRUE;      /* Pattern ends straight after \P or \p */
else
  {
  *negptr = FALSE;

  if (ch != '{')
    {
    /* Just one following character */
    name[0] = ch;
    name[1] = 0;
    }
  else
    {
    /* A name in braces, optionally preceded by ^ for negation */
    if (ptr[1] == '^')
      {
      *negptr = TRUE;
      ptr++;
      }

    len = 0;
    for (;;)
      {
      /* No closing brace before the buffer filled up */
      if (len >= (int)sizeof(name) - 1)
        {
        malformed = TRUE;
        break;
        }
      ch = *(++ptr);
      if (ch == 0)
        {
        malformed = TRUE;
        break;
        }
      if (ch == '}') break;
      name[len++] = ch;
      }

    if (!malformed) name[len] = 0;
    }
  }

if (malformed)
  {
  *errorcodeptr = ERR46;
  *ptrptr = ptr;
  return -1;
  }

*ptrptr = ptr;

/* Binary chop over the table of recognized property names */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt[mid].name);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The name is well formed, but not a known property */

*errorcodeptr = ERR47;
return -1;
}
#endif
735 
736 
737 
738 
739 /*************************************************
740 * Check for counted repeat *
741 *************************************************/
742 
743 /* This function is called when a '{' is encountered in a place where it might
744 start a quantifier. It looks ahead to see if it really is a quantifier or not.
745 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
746 where the ddds are digits.
747 
748 Arguments:
749  p pointer to the first char after '{'
750 
751 Returns: TRUE or FALSE
752 */
753 
754 static BOOL
756 {
757 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
758 while ((digitab[*p] & ctype_digit) != 0) p++;
759 if (*p == '}') return TRUE;
760 
761 if (*p++ != ',') return FALSE;
762 if (*p == '}') return TRUE;
763 
764 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
765 while ((digitab[*p] & ctype_digit) != 0) p++;
766 
767 return (*p == '}');
768 }
769 
770 
771 
772 /*************************************************
773 * Read repeat counts *
774 *************************************************/
775 
776 /* Read an item of the form {n,m} and return the values. This is called only
777 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
778 so the syntax is guaranteed to be correct, but we need to check the values.
779 
780 Arguments:
781  p pointer to first char after '{'
782  minp pointer to int for min
783  maxp pointer to int for max
784  returned as -1 if no max
785  errorcodeptr points to error code variable
786 
787 Returns: pointer to '}' on success;
788  current ptr on error, with errorcodeptr set non-zero
789 */
790 
791 static const uschar *
792 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
793 {
794 int min = 0;
795 int max = -1;
796 
797 /* Read the minimum value and do a paranoid check: a negative value indicates
798 an integer overflow. */
799 
800 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
801 if (min < 0 || min > 65535)
802  {
803  *errorcodeptr = ERR5;
804  return p;
805  }
806 
807 /* Read the maximum value if there is one, and again do a paranoid on its size.
808 Also, max must not be less than min. */
809 
810 if (*p == '}') max = min; else
811  {
812  if (*(++p) != '}')
813  {
814  max = 0;
815  while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
816  if (max < 0 || max > 65535)
817  {
818  *errorcodeptr = ERR5;
819  return p;
820  }
821  if (max < min)
822  {
823  *errorcodeptr = ERR4;
824  return p;
825  }
826  }
827  }
828 
829 /* Fill in the required variables, and pass back the pointer to the terminating
830 '}'. */
831 
832 *minp = min;
833 *maxp = max;
834 return p;
835 }
836 
837 
838 
839 /*************************************************
840 * Find forward referenced subpattern *
841 *************************************************/
842 
843 /* This function scans along a pattern's text looking for capturing
844 subpatterns, and counting them. If it finds a named pattern that matches the
845 name it is given, it returns its number. Alternatively, if the name is NULL, it
846 returns when it reaches a given numbered subpattern. This is used for forward
847 references to subpatterns. We know that if (?P< is encountered, the name will
848 be terminated by '>' because that is checked in the first pass.
849 
850 Arguments:
851  ptr current position in the pattern
852  count current count of capturing parens so far encountered
853  name name to seek, or NULL if seeking a numbered subpattern
854  lorn name length, or subpattern number if name is NULL
855  xmode TRUE if we are in /x mode
856 
857 Returns: the number of the named subpattern, or -1 if not found
858 */
859 
860 static int
861 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
862  BOOL xmode)
863 {
864 const uschar *thisname;
865 
866 for (; *ptr != 0; ptr++)
867  {
868  int term;
869 
870  /* Skip over backslashed characters and also entire \Q...\E */
871 
872  if (*ptr == '\\')
873  {
874  if (*(++ptr) == 0) return -1;
875  if (*ptr == 'Q') for (;;)
876  {
877  while (*(++ptr) != 0 && *ptr != '\\');
878  if (*ptr == 0) return -1;
879  if (*(++ptr) == 'E') break;
880  }
881  continue;
882  }
883 
884  /* Skip over character classes */
885 
886  if (*ptr == '[')
887  {
888  while (*(++ptr) != ']')
889  {
890  if (*ptr == '\\')
891  {
892  if (*(++ptr) == 0) return -1;
893  if (*ptr == 'Q') for (;;)
894  {
895  while (*(++ptr) != 0 && *ptr != '\\');
896  if (*ptr == 0) return -1;
897  if (*(++ptr) == 'E') break;
898  }
899  continue;
900  }
901  }
902  continue;
903  }
904 
905  /* Skip comments in /x mode */
906 
907  if (xmode && *ptr == '#')
908  {
909  while (*(++ptr) != 0 && *ptr != '\n');
910  if (*ptr == 0) return -1;
911  continue;
912  }
913 
914  /* An opening parens must now be a real metacharacter */
915 
916  if (*ptr != '(') continue;
917  if (ptr[1] != '?')
918  {
919  count++;
920  if (name == NULL && count == lorn) return count;
921  continue;
922  }
923 
924  ptr += 2;
925  if (*ptr == 'P') ptr++; /* Allow optional P */
926 
927  /* We have to disambiguate (?<! and (?<= from (?<name> */
928 
929  if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
930  *ptr != '\'')
931  continue;
932 
933  count++;
934 
935  if (name == NULL && count == lorn) return count;
936  term = *ptr++;
937  if (term == '<') term = '>';
938  thisname = ptr;
939  while (*ptr != term) ptr++;
940  if (name != NULL && lorn == ptr - thisname &&
941  strncmp((const char *)name, (const char *)thisname, lorn) == 0)
942  return count;
943  }
944 
945 return -1;
946 }
947 
948 
949 
950 /*************************************************
951 * Find first significant op code *
952 *************************************************/
953 
954 /* This is called by several functions that scan a compiled expression looking
955 for a fixed first character, or an anchoring op code etc. It skips over things
956 that do not influence this. For some calls, a change of option is important.
957 For some calls, it makes sense to skip negative forward and all backward
958 assertions, and also the \b assertion; for others it does not.
959 
960 Arguments:
961  code pointer to the start of the group
962  options pointer to external options
963  optbit the option bit whose changing is significant, or
964  zero if none are
965  skipassert TRUE if certain assertions are to be skipped
966 
967 Returns: pointer to the first significant opcode
968 */
969 
970 static const uschar*
971 first_significant_code(const uschar *code, int *options, int optbit,
972  BOOL skipassert)
973 {
974 for (;;)
975  {
976  switch ((int)*code)
977  {
978  case OP_OPT:
979  if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
980  *options = (int)code[1];
981  code += 2;
982  break;
983 
984  case OP_ASSERT_NOT:
985  case OP_ASSERTBACK:
986  case OP_ASSERTBACK_NOT:
987  if (!skipassert) return code;
988  do code += GET(code, 1); while (*code == OP_ALT);
990  break;
991 
992  case OP_WORD_BOUNDARY:
994  if (!skipassert) return code;
995  /* Fall through */
996 
997  case OP_CALLOUT:
998  case OP_CREF:
999  case OP_RREF:
1000  case OP_DEF:
1001  code += _pcre_OP_lengths[*code];
1002  break;
1003 
1004  default:
1005  return code;
1006  }
1007  }
1008 /* Control never reaches here */
1009 }
1010 
1011 
1012 
1013 
1014 /*************************************************
1015 * Find the fixed length of a pattern *
1016 *************************************************/
1017 
1018 /* Scan a pattern and compute the fixed length of subject that will match it,
1019 if the length is fixed. This is needed for dealing with backward assertions.
1020 In UTF8 mode, the result is in characters rather than bytes.
1021 
1022 Arguments:
1023  code points to the start of the pattern (the bracket)
1024  options the compiling options
1025 
1026 Returns: the fixed length, or -1 if there is no fixed length,
1027  or -2 if \C was encountered
1028 */
1029 
1030 static int
1032 {
1033 int length = -1;
1034 
1035 register int branchlength = 0;
1036 register uschar *cc = code + 1 + LINK_SIZE;
1037 
1038 /* Scan along the opcodes for this branch. If we get to the end of the
1039 branch, check the length against that of the other branches. */
1040 
1041 for (;;)
1042  {
1043  int d;
1044  register int op = *cc;
1045 
1046  switch (op)
1047  {
1048  case OP_CBRA:
1049  case OP_BRA:
1050  case OP_ONCE:
1051  case OP_COND:
1052  d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1053  if (d < 0) return d;
1054  branchlength += d;
1055  do cc += GET(cc, 1); while (*cc == OP_ALT);
1056  cc += 1 + LINK_SIZE;
1057  break;
1058 
1059  /* Reached end of a branch; if it's a ket it is the end of a nested
1060  call. If it's ALT it is an alternation in a nested call. If it is
1061  END it's the end of the outer call. All can be handled by the same code. */
1062 
1063  case OP_ALT:
1064  case OP_KET:
1065  case OP_KETRMAX:
1066  case OP_KETRMIN:
1067  case OP_END:
1068  if (length < 0) length = branchlength;
1069  else if (length != branchlength) return -1;
1070  if (*cc != OP_ALT) return length;
1071  cc += 1 + LINK_SIZE;
1072  branchlength = 0;
1073  break;
1074 
1075  /* Skip over assertive subpatterns */
1076 
1077  case OP_ASSERT:
1078  case OP_ASSERT_NOT:
1079  case OP_ASSERTBACK:
1080  case OP_ASSERTBACK_NOT:
1081  do cc += GET(cc, 1); while (*cc == OP_ALT);
1082  /* Fall through */
1083 
1084  /* Skip over things that don't match chars */
1085 
1086  case OP_REVERSE:
1087  case OP_CREF:
1088  case OP_RREF:
1089  case OP_DEF:
1090  case OP_OPT:
1091  case OP_CALLOUT:
1092  case OP_SOD:
1093  case OP_SOM:
1094  case OP_EOD:
1095  case OP_EODN:
1096  case OP_CIRC:
1097  case OP_DOLL:
1098  case OP_NOT_WORD_BOUNDARY:
1099  case OP_WORD_BOUNDARY:
1100  cc += _pcre_OP_lengths[*cc];
1101  break;
1102 
1103  /* Handle literal characters */
1104 
1105  case OP_CHAR:
1106  case OP_CHARNC:
1107  case OP_NOT:
1108  branchlength++;
1109  cc += 2;
1110 #ifdef SUPPORT_UTF8
1111  if ((options & PCRE_UTF8) != 0)
1112  {
1113  while ((*cc & 0xc0) == 0x80) cc++;
1114  }
1115 #endif
1116  break;
1117 
1118  /* Handle exact repetitions. The count is already in characters, but we
1119  need to skip over a multibyte character in UTF8 mode. */
1120 
1121  case OP_EXACT:
1122  branchlength += GET2(cc,1);
1123  cc += 4;
1124 #ifdef SUPPORT_UTF8
1125  if ((options & PCRE_UTF8) != 0)
1126  {
1127  while((*cc & 0x80) == 0x80) cc++;
1128  }
1129 #endif
1130  break;
1131 
1132  case OP_TYPEEXACT:
1133  branchlength += GET2(cc,1);
1134  cc += 4;
1135  break;
1136 
1137  /* Handle single-char matchers */
1138 
1139  case OP_PROP:
1140  case OP_NOTPROP:
1141  cc += 2;
1142  /* Fall through */
1143 
1144  case OP_NOT_DIGIT:
1145  case OP_DIGIT:
1146  case OP_NOT_WHITESPACE:
1147  case OP_WHITESPACE:
1148  case OP_NOT_WORDCHAR:
1149  case OP_WORDCHAR:
1150  case OP_ANY:
1151  branchlength++;
1152  cc++;
1153  break;
1154 
1155  /* The single-byte matcher isn't allowed */
1156 
1157  case OP_ANYBYTE:
1158  return -2;
1159 
1160  /* Check a class for variable quantification */
1161 
1162 #ifdef SUPPORT_UTF8
1163  case OP_XCLASS:
1164  cc += GET(cc, 1) - 33;
1165  /* Fall through */
1166 #endif
1167 
1168  case OP_CLASS:
1169  case OP_NCLASS:
1170  cc += 33;
1171 
1172  switch (*cc)
1173  {
1174  case OP_CRSTAR:
1175  case OP_CRMINSTAR:
1176  case OP_CRQUERY:
1177  case OP_CRMINQUERY:
1178  return -1;
1179 
1180  case OP_CRRANGE:
1181  case OP_CRMINRANGE:
1182  if (GET2(cc,1) != GET2(cc,3)) return -1;
1183  branchlength += GET2(cc,1);
1184  cc += 5;
1185  break;
1186 
1187  default:
1188  branchlength++;
1189  }
1190  break;
1191 
1192  /* Anything else is variable length */
1193 
1194  default:
1195  return -1;
1196  }
1197  }
1198 /* Control never gets here */
1199 }
1200 
1201 
1202 
1203 
1204 /*************************************************
1205 * Scan compiled regex for numbered bracket *
1206 *************************************************/
1207 
1208 /* This little function scans through a compiled pattern until it finds a
1209 capturing bracket with the given number.
1210 
1211 Arguments:
1212  code points to start of expression
1213  utf8 TRUE in UTF-8 mode
1214  number the required bracket number
1215 
1216 Returns: pointer to the opcode for the bracket, or NULL if not found
1217 */
1218 
1219 static const uschar *
1221 {
1222 for (;;)
1223  {
1224  register int c = *code;
1225  if (c == OP_END) return NULL;
1226 
1227  /* XCLASS is used for classes that cannot be represented just by a bit
1228  map. This includes negated single high-valued characters. The length in
1229  the table is zero; the actual length is stored in the compiled code. */
1230 
1231  if (c == OP_XCLASS) code += GET(code, 1);
1232 
1233  /* Handle capturing bracket */
1234 
1235  else if (c == OP_CBRA)
1236  {
1237  int n = GET2(code, 1+LINK_SIZE);
1238  if (n == number) return (uschar *)code;
1239  code += _pcre_OP_lengths[c];
1240  }
1241 
1242  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1243  a multi-byte character. The length in the table is a minimum, so we have to
1244  arrange to skip the extra bytes. */
1245 
1246  else
1247  {
1248  code += _pcre_OP_lengths[c];
1249  if (utf8) switch(c)
1250  {
1251  case OP_CHAR:
1252  case OP_CHARNC:
1253  case OP_EXACT:
1254  case OP_UPTO:
1255  case OP_MINUPTO:
1256  case OP_POSUPTO:
1257  case OP_STAR:
1258  case OP_MINSTAR:
1259  case OP_POSSTAR:
1260  case OP_PLUS:
1261  case OP_MINPLUS:
1262  case OP_POSPLUS:
1263  case OP_QUERY:
1264  case OP_MINQUERY:
1265  case OP_POSQUERY:
1266  if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1267  break;
1268  }
1269  }
1270  }
1271 }
1272 
1273 
1274 
1275 /*************************************************
1276 * Scan compiled regex for recursion reference *
1277 *************************************************/
1278 
1279 /* This little function scans through a compiled pattern until it finds an
1280 instance of OP_RECURSE.
1281 
1282 Arguments:
1283  code points to start of expression
1284  utf8 TRUE in UTF-8 mode
1285 
1286 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1287 */
1288 
1289 static const uschar *
1291 {
1292 for (;;)
1293  {
1294  register int c = *code;
1295  if (c == OP_END) return NULL;
1296  if (c == OP_RECURSE) return code;
1297 
1298  /* XCLASS is used for classes that cannot be represented just by a bit
1299  map. This includes negated single high-valued characters. The length in
1300  the table is zero; the actual length is stored in the compiled code. */
1301 
1302  if (c == OP_XCLASS) code += GET(code, 1);
1303 
1304  /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1305  that are followed by a character may be followed by a multi-byte character.
1306  The length in the table is a minimum, so we have to arrange to skip the extra
1307  bytes. */
1308 
1309  else
1310  {
1311  code += _pcre_OP_lengths[c];
1312  if (utf8) switch(c)
1313  {
1314  case OP_CHAR:
1315  case OP_CHARNC:
1316  case OP_EXACT:
1317  case OP_UPTO:
1318  case OP_MINUPTO:
1319  case OP_POSUPTO:
1320  case OP_STAR:
1321  case OP_MINSTAR:
1322  case OP_POSSTAR:
1323  case OP_PLUS:
1324  case OP_MINPLUS:
1325  case OP_POSPLUS:
1326  case OP_QUERY:
1327  case OP_MINQUERY:
1328  case OP_POSQUERY:
1329  if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1330  break;
1331  }
1332  }
1333  }
1334 }
1335 
1336 
1337 
1338 /*************************************************
1339 * Scan compiled branch for non-emptiness *
1340 *************************************************/
1341 
1342 /* This function scans through a branch of a compiled pattern to see whether it
1343 can match the empty string or not. It is called from could_be_empty()
1344 below and from compile_branch() when checking for an unlimited repeat of a
1345 group that can match nothing. Note that first_significant_code() skips over
1346 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1347 struck an inner bracket whose current branch will already have been scanned.
1348 
1349 Arguments:
1350  code points to start of search
1351  endcode points to where to stop
1352  utf8 TRUE if in UTF8 mode
1353 
1354 Returns: TRUE if what is matched could be empty
1355 */
1356 
1357 static BOOL
1359 {
1360 register int c;
1362  code < endcode;
1364  {
1365  const uschar *ccode;
1366 
1367  c = *code;
1368 
1369  if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1370  {
1371  BOOL empty_branch;
1372  if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1373 
1374  /* Scan a closed bracket */
1375 
1376  empty_branch = FALSE;
1377  do
1378  {
1379  if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1380  empty_branch = TRUE;
1381  code += GET(code, 1);
1382  }
1383  while (*code == OP_ALT);
1384  if (!empty_branch) return FALSE; /* All branches are non-empty */
1385 
1386  /* Move past the KET and fudge things so that the increment in the "for"
1387  above has no effect. */
1388 
1389  c = OP_END;
1390  code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
1391  continue;
1392  }
1393 
1394  /* Handle the other opcodes */
1395 
1396  switch (c)
1397  {
1398  /* Check for quantifiers after a class */
1399 
1400 #ifdef SUPPORT_UTF8
1401  case OP_XCLASS:
1402  ccode = code + GET(code, 1);
1403  goto CHECK_CLASS_REPEAT;
1404 #endif
1405 
1406  case OP_CLASS:
1407  case OP_NCLASS:
1408  ccode = code + 33;
1409 
1410 #ifdef SUPPORT_UTF8
1411  CHECK_CLASS_REPEAT:
1412 #endif
1413 
1414  switch (*ccode)
1415  {
1416  case OP_CRSTAR: /* These could be empty; continue */
1417  case OP_CRMINSTAR:
1418  case OP_CRQUERY:
1419  case OP_CRMINQUERY:
1420  break;
1421 
1422  default: /* Non-repeat => class must match */
1423  case OP_CRPLUS: /* These repeats aren't empty */
1424  case OP_CRMINPLUS:
1425  return FALSE;
1426 
1427  case OP_CRRANGE:
1428  case OP_CRMINRANGE:
1429  if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1430  break;
1431  }
1432  break;
1433 
1434  /* Opcodes that must match a character */
1435 
1436  case OP_PROP:
1437  case OP_NOTPROP:
1438  case OP_EXTUNI:
1439  case OP_NOT_DIGIT:
1440  case OP_DIGIT:
1441  case OP_NOT_WHITESPACE:
1442  case OP_WHITESPACE:
1443  case OP_NOT_WORDCHAR:
1444  case OP_WORDCHAR:
1445  case OP_ANY:
1446  case OP_ANYBYTE:
1447  case OP_CHAR:
1448  case OP_CHARNC:
1449  case OP_NOT:
1450  case OP_PLUS:
1451  case OP_MINPLUS:
1452  case OP_POSPLUS:
1453  case OP_EXACT:
1454  case OP_NOTPLUS:
1455  case OP_NOTMINPLUS:
1456  case OP_NOTPOSPLUS:
1457  case OP_NOTEXACT:
1458  case OP_TYPEPLUS:
1459  case OP_TYPEMINPLUS:
1460  case OP_TYPEPOSPLUS:
1461  case OP_TYPEEXACT:
1462  return FALSE;
1463 
1464  /* End of branch */
1465 
1466  case OP_KET:
1467  case OP_KETRMAX:
1468  case OP_KETRMIN:
1469  case OP_ALT:
1470  return TRUE;
1471 
1472  /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1473  MINUPTO, and POSUPTO may be followed by a multibyte character */
1474 
1475 #ifdef SUPPORT_UTF8
1476  case OP_STAR:
1477  case OP_MINSTAR:
1478  case OP_POSSTAR:
1479  case OP_QUERY:
1480  case OP_MINQUERY:
1481  case OP_POSQUERY:
1482  case OP_UPTO:
1483  case OP_MINUPTO:
1484  case OP_POSUPTO:
1485  if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1486  break;
1487 #endif
1488  }
1489  }
1490 
1491 return TRUE;
1492 }
1493 
1494 
1495 
1496 /*************************************************
1497 * Scan compiled regex for non-emptiness *
1498 *************************************************/
1499 
1500 /* This function is called to check for left recursive calls. We want to check
1501 the current branch of the current pattern to see if it could match the empty
1502 string. If it could, we must look outwards for branches at other levels,
1503 stopping when we pass beyond the bracket which is the subject of the recursion.
1504 
1505 Arguments:
1506  code points to start of the recursion
1507  endcode points to where to stop (current RECURSE item)
1508  bcptr points to the chain of current (unclosed) branch starts
1509  utf8 TRUE if in UTF-8 mode
1510 
1511 Returns: TRUE if what is matched could be empty
1512 */
1513 
1514 static BOOL
1515 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1516  BOOL utf8)
1517 {
1518 while (bcptr != NULL && bcptr->current >= code)
1519  {
1520  if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1521  bcptr = bcptr->outer;
1522  }
1523 return TRUE;
1524 }
1525 
1526 
1527 
1528 /*************************************************
1529 * Check for POSIX class syntax *
1530 *************************************************/
1531 
1532 /* This function is called when the sequence "[:" or "[." or "[=" is
1533 encountered in a character class. It checks whether this is followed by an
1534 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1535 ".]" or "=]".
1536 
1537 Argument:
1538  ptr pointer to the initial [
1539  endptr where to return the end pointer
1540  cd pointer to compile data
1541 
1542 Returns: TRUE or FALSE
1543 */
1544 
1545 static BOOL
1546 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1547 {
1548 int terminator; /* Don't combine these lines; the Solaris cc */
1549 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1550 if (*(++ptr) == '^') ptr++;
1551 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1552 if (*ptr == terminator && ptr[1] == ']')
1553  {
1554  *endptr = ptr;
1555  return TRUE;
1556  }
1557 return FALSE;
1558 }
1559 
1560 
1561 
1562 
1563 /*************************************************
1564 * Check POSIX class name *
1565 *************************************************/
1566 
1567 /* This function is called to check the name given in a POSIX-style class entry
1568 such as [:alnum:].
1569 
1570 Arguments:
1571  ptr points to the first letter
1572  len the length of the name
1573 
1574 Returns: a value representing the name, or -1 if unknown
1575 */
1576 
1577 static int
1578 check_posix_name(const uschar *ptr, int len)
1579 {
1580 register int yield = 0;
1581 while (posix_name_lengths[yield] != 0)
1582  {
1583  if (len == posix_name_lengths[yield] &&
1584  strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1585  yield++;
1586  }
1587 return -1;
1588 }
1589 
1590 
1591 /*************************************************
1592 * Adjust OP_RECURSE items in repeated group *
1593 *************************************************/
1594 
1595 /* OP_RECURSE items contain an offset from the start of the regex to the group
1596 that is referenced. This means that groups can be replicated for fixed
1597 repetition simply by copying (because the recursion is allowed to refer to
1598 earlier groups that are outside the current group). However, when a group is
1599 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1600 it, after it has been compiled. This means that any OP_RECURSE items within it
1601 that refer to the group itself or any contained groups have to have their
1602 offsets adjusted. That one of the jobs of this function. Before it is called,
1603 the partially compiled regex must be temporarily terminated with OP_END.
1604 
1605 This function has been extended with the possibility of forward references for
1606 recursions and subroutine calls. It must also check the list of such references
1607 for the group we are dealing with. If it finds that one of the recursions in
1608 the current group is on this list, it adjusts the offset in the list, not the
1609 value in the reference (which is a group number).
1610 
1611 Arguments:
1612  group points to the start of the group
1613  adjust the amount by which the group is to be moved
1614  utf8 TRUE in UTF-8 mode
1615  cd contains pointers to tables etc.
1616  save_hwm the hwm forward reference pointer at the start of the group
1617 
1618 Returns: nothing
1619 */
1620 
1621 static void
1622 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1623  uschar *save_hwm)
1624 {
1625 uschar *ptr = group;
1626 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1627  {
1628  int offset;
1629  uschar *hc;
1630 
1631  /* See if this recursion is on the forward reference list. If so, adjust the
1632  reference. */
1633 
1634  for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1635  {
1636  offset = GET(hc, 0);
1637  if (cd->start_code + offset == ptr + 1)
1638  {
1639  PUT(hc, 0, offset + adjust);
1640  break;
1641  }
1642  }
1643 
1644  /* Otherwise, adjust the recursion offset if it's after the start of this
1645  group. */
1646 
1647  if (hc >= cd->hwm)
1648  {
1649  offset = GET(ptr, 1);
1650  if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1651  }
1652 
1653  ptr += 1 + LINK_SIZE;
1654  }
1655 }
1656 
1657 
1658 
1659 /*************************************************
1660 * Insert an automatic callout point *
1661 *************************************************/
1662 
1663 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1664 callout points before each pattern item.
1665 
1666 Arguments:
1667  code current code pointer
1668  ptr current pattern pointer
1669  cd pointers to tables etc
1670 
1671 Returns: new code pointer
1672 */
1673 
1674 static uschar *
1676 {
1677 *code++ = OP_CALLOUT;
1678 *code++ = 255;
1679 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1680 PUT(code, LINK_SIZE, 0); /* Default length */
1681 return code + 2*LINK_SIZE;
1682 }
1683 
1684 
1685 
1686 /*************************************************
1687 * Complete a callout item *
1688 *************************************************/
1689 
1690 /* A callout item contains the length of the next item in the pattern, which
1691 we can't fill in till after we have reached the relevant point. This is used
1692 for both automatic and manual callouts.
1693 
1694 Arguments:
1695  previous_callout points to previous callout item
1696  ptr current pattern pointer
1697  cd pointers to tables etc
1698 
1699 Returns: nothing
1700 */
1701 
1702 static void
1703 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1704 {
1705 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1706 PUT(previous_callout, 2 + LINK_SIZE, length);
1707 }
1708 
1709 
1710 
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expect;

/* Find the first character in the range that has an other case. */

while (ch <= d && (first_other = _pcre_ucp_othercase(ch)) == NOTACHAR) ch++;
if (ch > d) return FALSE;

*ocptr = first_other;
expect = first_other + 1;

/* Extend the run for as long as the other cases of successive characters
are themselves consecutive. */

for (ch++; ch <= d && _pcre_ucp_othercase(ch) == expect; ch++) expect++;

*odptr = expect - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1756 
1757 
1758 
1759 /*************************************************
1760 * Check if auto-possessifying is possible *
1761 *************************************************/
1762 
1763 /* This function is called for unlimited repeats of certain items, to see
1764 whether the next thing could possibly match the repeated item. If not, it makes
1765 sense to automatically possessify the repeated item.
1766 
1767 Arguments:
1768  op_code the repeated op code
1769  this data for this item, depends on the opcode
1770  utf8 TRUE in UTF-8 mode
1771  utf8_char used for utf8 character bytes, NULL if not relevant
1772  ptr next character in pattern
1773  options options bits
1774  cd contains pointers to tables etc.
1775 
1776 Returns: TRUE if possessifying is wanted
1777 */
1778 
1779 static BOOL
1780 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1781  const uschar *ptr, int options, compile_data *cd)
1782 {
1783 int next;
1784 
1785 /* Skip whitespace and comments in extended mode */
1786 
1787 if ((options & PCRE_EXTENDED) != 0)
1788  {
1789  for (;;)
1790  {
1791  while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1792  if (*ptr == '#')
1793  {
1794  while (*(++ptr) != 0)
1795  if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1796  }
1797  else break;
1798  }
1799  }
1800 
1801 /* If the next item is one that we can handle, get its value. A non-negative
1802 value is a character, a negative value is an escape value. */
1803 
1804 if (*ptr == '\\')
1805  {
1806  int temperrorcode = 0;
1807  next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1808  if (temperrorcode != 0) return FALSE;
1809  ptr++; /* Point after the escape sequence */
1810  }
1811 
1812 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1813  {
1814 #ifdef SUPPORT_UTF8
1815  if (utf8) { GETCHARINC(next, ptr); } else
1816 #endif
1817  next = *ptr++;
1818  }
1819 
1820 else return FALSE;
1821 
1822 /* Skip whitespace and comments in extended mode */
1823 
1824 if ((options & PCRE_EXTENDED) != 0)
1825  {
1826  for (;;)
1827  {
1828  while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1829  if (*ptr == '#')
1830  {
1831  while (*(++ptr) != 0)
1832  if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1833  }
1834  else break;
1835  }
1836  }
1837 
1838 /* If the next thing is itself optional, we have to give up. */
1839 
1840 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1841  return FALSE;
1842 
1843 /* Now compare the next item with the previous opcode. If the previous is a
1844 positive single character match, "item" either contains the character or, if
1845 "item" is greater than 127 in utf8 mode, the character's bytes are in
1846 utf8_char. */
1847 
1848 
1849 /* Handle cases when the next item is a character. */
1850 
1851 if (next >= 0) switch(op_code)
1852  {
1853  case OP_CHAR:
1854 #ifdef SUPPORT_UTF8
1855  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1856 #endif
1857  return item != next;
1858 
1859  /* For CHARNC (caseless character) we must check the other case. If we have
1860  Unicode property support, we can use it to test the other case of
1861  high-valued characters. */
1862 
1863  case OP_CHARNC:
1864 #ifdef SUPPORT_UTF8
1865  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1866 #endif
1867  if (item == next) return FALSE;
1868 #ifdef SUPPORT_UTF8
1869  if (utf8)
1870  {
1871  unsigned int othercase;
1872  if (next < 128) othercase = cd->fcc[next]; else
1873 #ifdef SUPPORT_UCP
1874  othercase = _pcre_ucp_othercase((unsigned int)next);
1875 #else
1876  othercase = NOTACHAR;
1877 #endif
1878  return (unsigned int)item != othercase;
1879  }
1880  else
1881 #endif /* SUPPORT_UTF8 */
1882  return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1883 
1884  /* For OP_NOT, "item" must be a single-byte character. */
1885 
1886  case OP_NOT:
1887  if (next < 0) return FALSE; /* Not a character */
1888  if (item == next) return TRUE;
1889  if ((options & PCRE_CASELESS) == 0) return FALSE;
1890 #ifdef SUPPORT_UTF8
1891  if (utf8)
1892  {
1893  unsigned int othercase;
1894  if (next < 128) othercase = cd->fcc[next]; else
1895 #ifdef SUPPORT_UCP
1896  othercase = _pcre_ucp_othercase(next);
1897 #else
1898  othercase = NOTACHAR;
1899 #endif
1900  return (unsigned int)item == othercase;
1901  }
1902  else
1903 #endif /* SUPPORT_UTF8 */
1904  return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1905 
1906  case OP_DIGIT:
1907  return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1908 
1909  case OP_NOT_DIGIT:
1910  return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1911 
1912  case OP_WHITESPACE:
1913  return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1914 
1915  case OP_NOT_WHITESPACE:
1916  return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1917 
1918  case OP_WORDCHAR:
1919  return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1920 
1921  case OP_NOT_WORDCHAR:
1922  return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1923 
1924  default:
1925  return FALSE;
1926  }
1927 
1928 
1929 /* Handle the case when the next item is \d, \s, etc. */
1930 
1931 switch(op_code)
1932  {
1933  case OP_CHAR:
1934  case OP_CHARNC:
1935 #ifdef SUPPORT_UTF8
1936  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1937 #endif
1938  switch(-next)
1939  {
1940  case ESC_d:
1941  return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1942 
1943  case ESC_D:
1944  return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1945 
1946  case ESC_s:
1947  return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1948 
1949  case ESC_S:
1950  return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1951 
1952  case ESC_w:
1953  return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1954 
1955  case ESC_W:
1956  return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1957 
1958  default:
1959  return FALSE;
1960  }
1961 
1962  case OP_DIGIT:
1963  return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1964 
1965  case OP_NOT_DIGIT:
1966  return next == -ESC_d;
1967 
1968  case OP_WHITESPACE:
1969  return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1970 
1971  case OP_NOT_WHITESPACE:
1972  return next == -ESC_s;
1973 
1974  case OP_WORDCHAR:
1975  return next == -ESC_W || next == -ESC_s;
1976 
1977  case OP_NOT_WORDCHAR:
1978  return next == -ESC_w || next == -ESC_d;
1979 
1980  default:
1981  return FALSE;
1982  }
1983 
1984 /* Control does not reach here */
1985 }
1986 
1987 
1988 
1989 /*************************************************
1990 * Compile one branch *
1991 *************************************************/
1992 
1993 /* Scan the pattern, compiling it into a vector. If the options are
1994 changed during the branch, the pointer is used to change the external options
1995 bits. This function is used during the pre-compile phase when we are trying
1996 to find out the amount of memory needed, as well as during the real compile
1997 phase. The value of lengthptr distinguishes the two phases.
1998 
1999 Arguments:
2000  optionsptr pointer to the option bits
2001  codeptr points to the pointer to the current code point
2002  ptrptr points to the current pattern pointer
2003  errorcodeptr points to error code variable
2004  firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2005  reqbyteptr set to the last literal character required, else < 0
2006  bcptr points to current branch chain
2007  cd contains pointers to tables etc.
2008  lengthptr NULL during the real compile phase
2009  points to length accumulator during pre-compile phase
2010 
2011 Returns: TRUE on success
2012  FALSE, with *errorcodeptr set non-zero on error
2013 */
2014 
2015 static BOOL
2016 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2017  int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2018  compile_data *cd, int *lengthptr)
2019 {
2020 int repeat_type, op_type;
2021 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2022 int bravalue = 0;
2023 int greedy_default, greedy_non_default;
2024 int firstbyte, reqbyte;
2025 int zeroreqbyte, zerofirstbyte;
2026 int req_caseopt, reqvary, tempreqvary;
2027 int options = *optionsptr;
2028 int after_manual_callout = 0;
2029 int length_prevgroup = 0;
2030 register int c;
2031 register uschar *code = *codeptr;
2032 uschar *last_code = code;
2033 uschar *orig_code = code;
2034 uschar *tempcode;
2035 BOOL inescq = FALSE;
2036 BOOL groupsetfirstbyte = FALSE;
2037 const uschar *ptr = *ptrptr;
2038 const uschar *tempptr;
2039 uschar *previous = NULL;
2040 uschar *previous_callout = NULL;
2041 uschar *save_hwm = NULL;
2042 uschar classbits[32];
2043 
2044 #ifdef SUPPORT_UTF8
2045 BOOL class_utf8;
2046 BOOL utf8 = (options & PCRE_UTF8) != 0;
2047 uschar *class_utf8data;
2048 uschar utf8_char[6];
2049 #else
2050 BOOL utf8 = FALSE;
2051 uschar *utf8_char = NULL;
2052 #endif
2053 
2054 #ifdef DEBUG
2055 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2056 #endif
2057 
2058 /* Set up the default and non-default settings for greediness */
2059 
2060 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2061 greedy_non_default = greedy_default ^ 1;
2062 
2063 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2064 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2065 matches a non-fixed char first char; reqbyte just remains unset if we never
2066 find one.
2067 
2068 When we hit a repeat whose minimum is zero, we may have to adjust these values
2069 to take the zero repeat into account. This is implemented by setting them to
2070 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2071 item types that can be repeated set these backoff variables appropriately. */
2072 
2073 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2074 
2075 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2076 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2077 value > 255. It is added into the firstbyte or reqbyte variables to record the
2078 case status of the value. This is used only for ASCII characters. */
2079 
2080 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2081 
2082 /* Switch on next character until the end of the branch */
2083 
2084 for (;; ptr++)
2085  {
2086  BOOL negate_class;
2087  BOOL possessive_quantifier;
2088  BOOL is_quantifier;
2089  BOOL is_recurse;
2090  int class_charcount;
2091  int class_lastchar;
2092  int newoptions;
2093  int recno;
2094  int skipbytes;
2095  int subreqbyte;
2096  int subfirstbyte;
2097  int terminator;
2098  int mclength;
2099  uschar mcbuffer[8];
2100 
2101  /* Get next byte in the pattern */
2102 
2103  c = *ptr;
2104 
2105  /* If we are in the pre-compile phase, accumulate the length used for the
2106  previous cycle of this loop. */
2107 
2108  if (lengthptr != NULL)
2109  {
2110 #ifdef DEBUG
2111  if (code > cd->hwm) cd->hwm = code; /* High water info */
2112 #endif
2113  if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2114  {
2115  *errorcodeptr = ERR52;
2116  goto FAILED;
2117  }
2118 
2119  /* There is at least one situation where code goes backwards: this is the
2120  case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2121  the class is simply eliminated. However, it is created first, so we have to
2122  allow memory for it. Therefore, don't ever reduce the length at this point.
2123  */
2124 
2125  if (code < last_code) code = last_code;
2126  *lengthptr += code - last_code;
2127  DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2128 
2129  /* If "previous" is set and it is not at the start of the work space, move
2130  it back to there, in order to avoid filling up the work space. Otherwise,
2131  if "previous" is NULL, reset the current code pointer to the start. */
2132 
2133  if (previous != NULL)
2134  {
2135  if (previous > orig_code)
2136  {
2137  memmove(orig_code, previous, code - previous);
2138  code -= previous - orig_code;
2139  previous = orig_code;
2140  }
2141  }
2142  else code = orig_code;
2143 
2144  /* Remember where this code item starts so we can pick up the length
2145  next time round. */
2146 
2147  last_code = code;
2148  }
2149 
2150  /* In the real compile phase, just check the workspace used by the forward
2151  reference list. */
2152 
2153  else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2154  {
2155  *errorcodeptr = ERR52;
2156  goto FAILED;
2157  }
2158 
2159  /* If in \Q...\E, check for the end; if not, we have a literal */
2160 
2161  if (inescq && c != 0)
2162  {
2163  if (c == '\\' && ptr[1] == 'E')
2164  {
2165  inescq = FALSE;
2166  ptr++;
2167  continue;
2168  }
2169  else
2170  {
2171  if (previous_callout != NULL)
2172  {
2173  if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2174  complete_callout(previous_callout, ptr, cd);
2175  previous_callout = NULL;
2176  }
2177  if ((options & PCRE_AUTO_CALLOUT) != 0)
2178  {
2179  previous_callout = code;
2180  code = auto_callout(code, ptr, cd);
2181  }
2182  goto NORMAL_CHAR;
2183  }
2184  }
2185 
2186  /* Fill in length of a previous callout, except when the next thing is
2187  a quantifier. */
2188 
2189  is_quantifier = c == '*' || c == '+' || c == '?' ||
2190  (c == '{' && is_counted_repeat(ptr+1));
2191 
2192  if (!is_quantifier && previous_callout != NULL &&
2193  after_manual_callout-- <= 0)
2194  {
2195  if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2196  complete_callout(previous_callout, ptr, cd);
2197  previous_callout = NULL;
2198  }
2199 
2200  /* In extended mode, skip white space and comments */
2201 
2202  if ((options & PCRE_EXTENDED) != 0)
2203  {
2204  if ((cd->ctypes[c] & ctype_space) != 0) continue;
2205  if (c == '#')
2206  {
2207  while (*(++ptr) != 0)
2208  {
2209  if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2210  }
2211  if (*ptr != 0) continue;
2212 
2213  /* Else fall through to handle end of string */
2214  c = 0;
2215  }
2216  }
2217 
2218  /* No auto callout for quantifiers. */
2219 
2220  if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2221  {
2222  previous_callout = code;
2223  code = auto_callout(code, ptr, cd);
2224  }
2225 
2226  switch(c)
2227  {
2228  /* ===================================================================*/
2229  case 0: /* The branch terminates at string end */
2230  case '|': /* or | or ) */
2231  case ')':
2232  *firstbyteptr = firstbyte;
2233  *reqbyteptr = reqbyte;
2234  *codeptr = code;
2235  *ptrptr = ptr;
2236  if (lengthptr != NULL)
2237  {
2238  *lengthptr += code - last_code; /* To include callout length */
2239  DPRINTF((">> end branch\n"));
2240  }
2241  return TRUE;
2242 
2243 
2244  /* ===================================================================*/
2245  /* Handle single-character metacharacters. In multiline mode, ^ disables
2246  the setting of any following char as a first character. */
2247 
2248  case '^':
2249  if ((options & PCRE_MULTILINE) != 0)
2250  {
2251  if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2252  }
2253  previous = NULL;
2254  *code++ = OP_CIRC;
2255  break;
2256 
2257  case '$':
2258  previous = NULL;
2259  *code++ = OP_DOLL;
2260  break;
2261 
2262  /* There can never be a first char if '.' is first, whatever happens about
2263  repeats. The value of reqbyte doesn't change either. */
2264 
2265  case '.':
2266  if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2267  zerofirstbyte = firstbyte;
2268  zeroreqbyte = reqbyte;
2269  previous = code;
2270  *code++ = OP_ANY;
2271  break;
2272 
2273 
2274  /* ===================================================================*/
2275  /* Character classes. If the included characters are all < 256, we build a
2276  32-byte bitmap of the permitted characters, except in the special case
2277  where there is only one such character. For negated classes, we build the
2278  map as usual, then invert it at the end. However, we use a different opcode
2279  so that data characters > 255 can be handled correctly.
2280 
2281  If the class contains characters outside the 0-255 range, a different
2282  opcode is compiled. It may optionally have a bit map for characters < 256,
2283  but those above are explicitly listed afterwards. A flag byte tells
2284  whether the bitmap is present, and whether this is a negated class or not.
2285  */
2286 
2287  case '[':
2288  previous = code;
2289 
2290  /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2291  they are encountered at the top level, so we'll do that too. */
2292 
2293  if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2294  check_posix_syntax(ptr, &tempptr, cd))
2295  {
2296  *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2297  goto FAILED;
2298  }
2299 
2300  /* If the first character is '^', set the negation flag and skip it. */
2301 
2302  if ((c = *(++ptr)) == '^')
2303  {
2304  negate_class = TRUE;
2305  c = *(++ptr);
2306  }
2307  else
2308  {
2309  negate_class = FALSE;
2310  }
2311 
2312  /* Keep a count of chars with values < 256 so that we can optimize the case
2313  of just a single character (as long as it's < 256). However, For higher
2314  valued UTF-8 characters, we don't yet do any optimization. */
2315 
2316  class_charcount = 0;
2317  class_lastchar = -1;
2318 
2319  /* Initialize the 32-char bit map to all zeros. We build the map in a
2320  temporary bit of memory, in case the class contains only 1 character (less
2321  than 256), because in that case the compiled code doesn't use the bit map.
2322  */
2323 
2324  memset(classbits, 0, 32 * sizeof(uschar));
2325 
2326 #ifdef SUPPORT_UTF8
2327  class_utf8 = FALSE; /* No chars >= 256 */
2328  class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2329 #endif
2330 
2331  /* Process characters until ] is reached. By writing this as a "do" it
2332  means that an initial ] is taken as a data character. At the start of the
2333  loop, c contains the first byte of the character. */
2334 
2335  if (c != 0) do
2336  {
2337  const uschar *oldptr;
2338 
2339 #ifdef SUPPORT_UTF8
2340  if (utf8 && c > 127)
2341  { /* Braces are required because the */
2342  GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2343  }
2344 #endif
2345 
2346  /* Inside \Q...\E everything is literal except \E */
2347 
2348  if (inescq)
2349  {
2350  if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2351  {
2352  inescq = FALSE; /* Reset literal state */
2353  ptr++; /* Skip the 'E' */
2354  continue; /* Carry on with next */
2355  }
2356  goto CHECK_RANGE; /* Could be range if \E follows */
2357  }
2358 
2359  /* Handle POSIX class names. Perl allows a negation extension of the
2360  form [:^name:]. A square bracket that doesn't match the syntax is
2361  treated as a literal. We also recognize the POSIX constructions
2362  [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2363  5.6 and 5.8 do. */
2364 
2365  if (c == '[' &&
2366  (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2367  check_posix_syntax(ptr, &tempptr, cd))
2368  {
2369  BOOL local_negate = FALSE;
2370  int posix_class, taboffset, tabopt;
2371  register const uschar *cbits = cd->cbits;
2372  uschar pbits[32];
2373 
2374  if (ptr[1] != ':')
2375  {
2376  *errorcodeptr = ERR31;
2377  goto FAILED;
2378  }
2379 
2380  ptr += 2;
2381  if (*ptr == '^')
2382  {
2383  local_negate = TRUE;
2384  ptr++;
2385  }
2386 
2387  posix_class = check_posix_name(ptr, tempptr - ptr);
2388  if (posix_class < 0)
2389  {
2390  *errorcodeptr = ERR30;
2391  goto FAILED;
2392  }
2393 
2394  /* If matching is caseless, upper and lower are converted to
2395  alpha. This relies on the fact that the class table starts with
2396  alpha, lower, upper as the first 3 entries. */
2397 
2398  if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2399  posix_class = 0;
2400 
2401  /* We build the bit map for the POSIX class in a chunk of local store
2402  because we may be adding and subtracting from it, and we don't want to
2403  subtract bits that may be in the main map already. At the end we or the
2404  result into the bit map that is being built. */
2405 
2406  posix_class *= 3;
2407 
2408  /* Copy in the first table (always present) */
2409 
2410  memcpy(pbits, cbits + posix_class_maps[posix_class],
2411  32 * sizeof(uschar));
2412 
2413  /* If there is a second table, add or remove it as required. */
2414 
2415  taboffset = posix_class_maps[posix_class + 1];
2416  tabopt = posix_class_maps[posix_class + 2];
2417 
2418  if (taboffset >= 0)
2419  {
2420  if (tabopt >= 0)
2421  for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2422  else
2423  for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2424  }
2425 
2426  /* Now see if we need to remove any special characters. An option
2427  value of 1 removes vertical space and 2 removes underscore. */
2428 
2429  if (tabopt < 0) tabopt = -tabopt;
2430  if (tabopt == 1) pbits[1] &= ~0x3c;
2431  else if (tabopt == 2) pbits[11] &= 0x7f;
2432 
2433  /* Add the POSIX table or its complement into the main table that is
2434  being built and we are done. */
2435 
2436  if (local_negate)
2437  for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2438  else
2439  for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2440 
2441  ptr = tempptr + 1;
2442  class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2443  continue; /* End of POSIX syntax handling */
2444  }
2445 
2446  /* Backslash may introduce a single character, or it may introduce one
2447  of the specials, which just set a flag. The sequence \b is a special
2448  case. Inside a class (and only there) it is treated as backspace.
2449  Elsewhere it marks a word boundary. Other escapes have preset maps ready
2450  to or into the one we are building. We assume they have more than one
2451  character in them, so set class_charcount bigger than one. */
2452 
2453  if (c == '\\')
2454  {
2455  c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2456  if (*errorcodeptr != 0) goto FAILED;
2457 
2458  if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2459  else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2460  else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2461  else if (-c == ESC_Q) /* Handle start of quoted string */
2462  {
2463  if (ptr[1] == '\\' && ptr[2] == 'E')
2464  {
2465  ptr += 2; /* avoid empty string */
2466  }
2467  else inescq = TRUE;
2468  continue;
2469  }
2470 
2471  if (c < 0)
2472  {
2473  register const uschar *cbits = cd->cbits;
2474  class_charcount += 2; /* Greater than 1 is what matters */
2475 
2476  /* Save time by not doing this in the pre-compile phase. */
2477 
2478  if (lengthptr == NULL) switch (-c)
2479  {
2480  case ESC_d:
2481  for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2482  continue;
2483 
2484  case ESC_D:
2485  for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2486  continue;
2487 
2488  case ESC_w:
2489  for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2490  continue;
2491 
2492  case ESC_W:
2493  for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2494  continue;
2495 
2496  case ESC_s:
2497  for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2498  classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2499  continue;
2500 
2501  case ESC_S:
2502  for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2503  classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2504  continue;
2505 
2506  case ESC_E: /* Perl ignores an orphan \E */
2507  continue;
2508 
2509  default: /* Not recognized; fall through */
2510  break; /* Need "default" setting to stop compiler warning. */
2511  }
2512 
2513  /* In the pre-compile phase, just do the recognition. */
2514 
2515  else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2516  c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2517 
2518  /* We need to deal with \P and \p in both phases. */
2519 
2520 #ifdef SUPPORT_UCP
2521  if (-c == ESC_p || -c == ESC_P)
2522  {
2523  BOOL negated;
2524  int pdata;
2525  int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2526  if (ptype < 0) goto FAILED;
2527  class_utf8 = TRUE;
2528  *class_utf8data++ = ((-c == ESC_p) != negated)?
2530  *class_utf8data++ = ptype;
2531  *class_utf8data++ = pdata;
2532  class_charcount -= 2; /* Not a < 256 character */
2533  continue;
2534  }
2535 #endif
2536  /* Unrecognized escapes are faulted if PCRE is running in its
2537  strict mode. By default, for compatibility with Perl, they are
2538  treated as literals. */
2539 
2540  if ((options & PCRE_EXTRA) != 0)
2541  {
2542  *errorcodeptr = ERR7;
2543  goto FAILED;
2544  }
2545 
2546  class_charcount -= 2; /* Undo the default count from above */
2547  c = *ptr; /* Get the final character and fall through */
2548  }
2549 
2550  /* Fall through if we have a single character (c >= 0). This may be
2551  greater than 256 in UTF-8 mode. */
2552 
2553  } /* End of backslash handling */
2554 
2555  /* A single character may be followed by '-' to form a range. However,
2556  Perl does not permit ']' to be the end of the range. A '-' character
2557  at the end is treated as a literal. Perl ignores orphaned \E sequences
2558  entirely. The code for handling \Q and \E is messy. */
2559 
2560  CHECK_RANGE:
2561  while (ptr[1] == '\\' && ptr[2] == 'E')
2562  {
2563  inescq = FALSE;
2564  ptr += 2;
2565  }
2566 
2567  oldptr = ptr;
2568 
2569  if (!inescq && ptr[1] == '-')
2570  {
2571  int d;
2572  ptr += 2;
2573  while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2574 
2575  /* If we hit \Q (not followed by \E) at this point, go into escaped
2576  mode. */
2577 
2578  while (*ptr == '\\' && ptr[1] == 'Q')
2579  {
2580  ptr += 2;
2581  if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2582  inescq = TRUE;
2583  break;
2584  }
2585 
2586  if (*ptr == 0 || (!inescq && *ptr == ']'))
2587  {
2588  ptr = oldptr;
2589  goto LONE_SINGLE_CHARACTER;
2590  }
2591 
2592 #ifdef SUPPORT_UTF8
2593  if (utf8)
2594  { /* Braces are required because the */
2595  GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2596  }
2597  else
2598 #endif
2599  d = *ptr; /* Not UTF-8 mode */
2600 
2601  /* The second part of a range can be a single-character escape, but
2602  not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2603  in such circumstances. */
2604 
2605  if (!inescq && d == '\\')
2606  {
2607  d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2608  if (*errorcodeptr != 0) goto FAILED;
2609 
2610  /* \b is backspace; \X is literal X; \R is literal R; any other
2611  special means the '-' was literal */
2612 
2613  if (d < 0)
2614  {
2615  if (d == -ESC_b) d = '\b';
2616  else if (d == -ESC_X) d = 'X';
2617  else if (d == -ESC_R) d = 'R'; else
2618  {
2619  ptr = oldptr;
2620  goto LONE_SINGLE_CHARACTER; /* A few lines below */
2621  }
2622  }
2623  }
2624 
2625  /* Check that the two values are in the correct order. Optimize
2626  one-character ranges */
2627 
2628  if (d < c)
2629  {
2630  *errorcodeptr = ERR8;
2631  goto FAILED;
2632  }
2633 
2634  if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2635 
2636  /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2637  matching, we have to use an XCLASS with extra data items. Caseless
2638  matching for characters > 127 is available only if UCP support is
2639  available. */
2640 
2641 #ifdef SUPPORT_UTF8
2642  if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2643  {
2644  class_utf8 = TRUE;
2645 
2646  /* With UCP support, we can find the other case equivalents of
2647  the relevant characters. There may be several ranges. Optimize how
2648  they fit with the basic range. */
2649 
2650 #ifdef SUPPORT_UCP
2651  if ((options & PCRE_CASELESS) != 0)
2652  {
2653  unsigned int occ, ocd;
2654  unsigned int cc = c;
2655  unsigned int origd = d;
2656  while (get_othercase_range(&cc, origd, &occ, &ocd))
2657  {
2658  if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2659 
2660  if (occ < c && ocd >= c - 1) /* Extend the basic range */
2661  { /* if there is overlap, */
2662  c = occ; /* noting that if occ < c */
2663  continue; /* we can't have ocd > d */
2664  } /* because a subrange is */
2665  if (ocd > d && occ <= d + 1) /* always shorter than */
2666  { /* the basic range. */
2667  d = ocd;
2668  continue;
2669  }
2670 
2671  if (occ == ocd)
2672  {
2673  *class_utf8data++ = XCL_SINGLE;
2674  }
2675  else
2676  {
2677  *class_utf8data++ = XCL_RANGE;
2678  class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2679  }
2680  class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2681  }
2682  }
2683 #endif /* SUPPORT_UCP */
2684 
2685  /* Now record the original range, possibly modified for UCP caseless
2686  overlapping ranges. */
2687 
2688  *class_utf8data++ = XCL_RANGE;
2689  class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2690  class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2691 
2692  /* With UCP support, we are done. Without UCP support, there is no
2693  caseless matching for UTF-8 characters > 127; we can use the bit map
2694  for the smaller ones. */
2695 
2696 #ifdef SUPPORT_UCP
2697  continue; /* With next character in the class */
2698 #else
2699  if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2700 
2701  /* Adjust upper limit and fall through to set up the map */
2702 
2703  d = 127;
2704 
2705 #endif /* SUPPORT_UCP */
2706  }
2707 #endif /* SUPPORT_UTF8 */
2708 
2709  /* We use the bit map for all cases when not in UTF-8 mode; else
2710  ranges that lie entirely within 0-127 when there is UCP support; else
2711  for partial ranges without UCP support. */
2712 
2713  class_charcount += d - c + 1;
2714  class_lastchar = d;
2715 
2716  /* We can save a bit of time by skipping this in the pre-compile. */
2717 
2718  if (lengthptr == NULL) for (; c <= d; c++)
2719  {
2720  classbits[c/8] |= (1 << (c&7));
2721  if ((options & PCRE_CASELESS) != 0)
2722  {
2723  int uc = cd->fcc[c]; /* flip case */
2724  classbits[uc/8] |= (1 << (uc&7));
2725  }
2726  }
2727 
2728  continue; /* Go get the next char in the class */
2729  }
2730 
2731  /* Handle a lone single character - we can get here for a normal
2732  non-escape char, or after \ that introduces a single character or for an
2733  apparent range that isn't. */
2734 
2735  LONE_SINGLE_CHARACTER:
2736 
2737  /* Handle a character that cannot go in the bit map */
2738 
2739 #ifdef SUPPORT_UTF8
2740  if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2741  {
2742  class_utf8 = TRUE;
2743  *class_utf8data++ = XCL_SINGLE;
2744  class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2745 
2746 #ifdef SUPPORT_UCP
2747  if ((options & PCRE_CASELESS) != 0)
2748  {
2749  unsigned int othercase;
2750  if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2751  {
2752  *class_utf8data++ = XCL_SINGLE;
2753  class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2754  }
2755  }
2756 #endif /* SUPPORT_UCP */
2757 
2758  }
2759  else
2760 #endif /* SUPPORT_UTF8 */
2761 
2762  /* Handle a single-byte character */
2763  {
2764  classbits[c/8] |= (1 << (c&7));
2765  if ((options & PCRE_CASELESS) != 0)
2766  {
2767  c = cd->fcc[c]; /* flip case */
2768  classbits[c/8] |= (1 << (c&7));
2769  }
2770  class_charcount++;
2771  class_lastchar = c;
2772  }
2773  }
2774 
2775  /* Loop until ']' reached. This "while" is the end of the "do" above. */
2776 
2777  while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2778 
2779  if (c == 0) /* Missing terminating ']' */
2780  {
2781  *errorcodeptr = ERR6;
2782  goto FAILED;
2783  }
2784 
2785  /* If class_charcount is 1, we saw precisely one character whose value is
2786  less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2787  can optimize the negative case only if there were no characters >= 128
2788  because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2789  single-bytes only. This is an historical hangover. Maybe one day we can
2790  tidy these opcodes to handle multi-byte characters.
2791 
2792  The optimization throws away the bit map. We turn the item into a
2793  1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2794  that OP_NOT does not support multibyte characters. In the positive case, it
2795  can cause firstbyte to be set. Otherwise, there can be no first char if
2796  this item is first, whatever repeat count may follow. In the case of
2797  reqbyte, save the previous value for reinstating. */
2798 
2799 #ifdef SUPPORT_UTF8
2800  if (class_charcount == 1 &&
2801  (!utf8 ||
2802  (!class_utf8 && (!negate_class || class_lastchar < 128))))
2803 
2804 #else
2805  if (class_charcount == 1)
2806 #endif
2807  {
2808  zeroreqbyte = reqbyte;
2809 
2810  /* The OP_NOT opcode works on one-byte characters only. */
2811 
2812  if (negate_class)
2813  {
2814  if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2815  zerofirstbyte = firstbyte;
2816  *code++ = OP_NOT;
2817  *code++ = class_lastchar;
2818  break;
2819  }
2820 
2821  /* For a single, positive character, get the value into mcbuffer, and
2822  then we can handle this with the normal one-character code. */
2823 
2824 #ifdef SUPPORT_UTF8
2825  if (utf8 && class_lastchar > 127)
2826  mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2827  else
2828 #endif
2829  {
2830  mcbuffer[0] = class_lastchar;
2831  mclength = 1;
2832  }
2833  goto ONE_CHAR;
2834  } /* End of 1-char optimization */
2835 
2836  /* The general case - not the one-char optimization. If this is the first
2837  thing in the branch, there can be no first char setting, whatever the
2838  repeat count. Any reqbyte setting must remain unchanged after any kind of
2839  repeat. */
2840 
2841  if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2842  zerofirstbyte = firstbyte;
2843  zeroreqbyte = reqbyte;
2844 
2845  /* If there are characters with values > 255, we have to compile an
2846  extended class, with its own opcode. If there are no characters < 256,
2847  we can omit the bitmap in the actual compiled code. */
2848 
2849 #ifdef SUPPORT_UTF8
2850  if (class_utf8)
2851  {
2852  *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2853  *code++ = OP_XCLASS;
2854  code += LINK_SIZE;
2855  *code = negate_class? XCL_NOT : 0;
2856 
2857  /* If the map is required, move up the extra data to make room for it;
2858  otherwise just move the code pointer to the end of the extra data. */
2859 
2860  if (class_charcount > 0)
2861  {
2862  *code++ |= XCL_MAP;
2863  memmove(code + 32, code, class_utf8data - code);
2864  memcpy(code, classbits, 32);
2865  code = class_utf8data + 32;
2866  }
2867  else code = class_utf8data;
2868 
2869  /* Now fill in the complete length of the item */
2870 
2871  PUT(previous, 1, code - previous);
2872  break; /* End of class handling */
2873  }
2874 #endif
2875 
2876  /* If there are no characters > 255, negate the 32-byte map if necessary,
2877  and copy it into the code vector. If this is the first thing in the branch,
2878  there can be no first char setting, whatever the repeat count. Any reqbyte
2879  setting must remain unchanged after any kind of repeat. */
2880 
2881  if (negate_class)
2882  {
2883  *code++ = OP_NCLASS;
2884  if (lengthptr == NULL) /* Save time in the pre-compile phase */
2885  for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2886  }
2887  else
2888  {
2889  *code++ = OP_CLASS;
2890  memcpy(code, classbits, 32);
2891  }
2892  code += 32;
2893  break;
2894 
2895 
2896  /* ===================================================================*/
2897  /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2898  has been tested above. */
2899 
2900  case '{':
2901  if (!is_quantifier) goto NORMAL_CHAR;
2902  ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2903  if (*errorcodeptr != 0) goto FAILED;
2904  goto REPEAT;
2905 
2906  case '*':
2907  repeat_min = 0;
2908  repeat_max = -1;
2909  goto REPEAT;
2910 
2911  case '+':
2912  repeat_min = 1;
2913  repeat_max = -1;
2914  goto REPEAT;
2915 
2916  case '?':
2917  repeat_min = 0;
2918  repeat_max = 1;
2919 
2920  REPEAT:
2921  if (previous == NULL)
2922  {
2923  *errorcodeptr = ERR9;
2924  goto FAILED;
2925  }
2926 
2927  if (repeat_min == 0)
2928  {
2929  firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2930  reqbyte = zeroreqbyte; /* Ditto */
2931  }
2932 
2933  /* Remember whether this is a variable length repeat */
2934 
2935  reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2936 
2937  op_type = 0; /* Default single-char op codes */
2938  possessive_quantifier = FALSE; /* Default not possessive quantifier */
2939 
2940  /* Save start of previous item, in case we have to move it up to make space
2941  for an inserted OP_ONCE for the additional '+' extension. */
2942 
2943  tempcode = previous;
2944 
2945  /* If the next character is '+', we have a possessive quantifier. This
2946  implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2947  If the next character is '?' this is a minimizing repeat, by default,
2948  but if PCRE_UNGREEDY is set, it works the other way round. We change the
2949  repeat type to the non-default. */
2950 
2951  if (ptr[1] == '+')
2952  {
2953  repeat_type = 0; /* Force greedy */
2954  possessive_quantifier = TRUE;
2955  ptr++;
2956  }
2957  else if (ptr[1] == '?')
2958  {
2959  repeat_type = greedy_non_default;
2960  ptr++;
2961  }
2962  else repeat_type = greedy_default;
2963 
2964  /* If previous was a character match, abolish the item and generate a
2965  repeat item instead. If a char item has a minimum of more than one, ensure
2966  that it is set in reqbyte - it might not be if a sequence such as x{3} is
2967  the first thing in a branch because the x will have gone into firstbyte
2968  instead. */
2969 
2970  if (*previous == OP_CHAR || *previous == OP_CHARNC)
2971  {
2972  /* Deal with UTF-8 characters that take up more than one byte. It's
2973  easier to write this out separately than try to macrify it. Use c to
2974  hold the length of the character in bytes, plus 0x80 to flag that it's a
2975  length rather than a small character. */
2976 
2977 #ifdef SUPPORT_UTF8
2978  if (utf8 && (code[-1] & 0x80) != 0)
2979  {
2980  uschar *lastchar = code - 1;
2981  while((*lastchar & 0xc0) == 0x80) lastchar--;
2982  c = code - lastchar; /* Length of UTF-8 character */
2983  memcpy(utf8_char, lastchar, c); /* Save the char */
2984  c |= 0x80; /* Flag c as a length */
2985  }
2986  else
2987 #endif
2988 
2989  /* Handle the case of a single byte - either with no UTF8 support, or
2990  with UTF-8 disabled, or for a UTF-8 character < 128. */
2991 
2992  {
2993  c = code[-1];
2994  if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2995  }
2996 
2997  /* If the repetition is unlimited, it pays to see if the next thing on
2998  the line is something that cannot possibly match this character. If so,
2999  automatically possessifying this item gains some performance in the case
3000  where the match fails. */
3001 
3002  if (!possessive_quantifier &&
3003  repeat_max < 0 &&
3004  check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3005  options, cd))
3006  {
3007  repeat_type = 0; /* Force greedy */
3008  possessive_quantifier = TRUE;
3009  }
3010 
3011  goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3012  }
3013 
3014  /* If previous was a single negated character ([^a] or similar), we use
3015  one of the special opcodes, replacing it. The code is shared with single-
3016  character repeats by setting op_type to add a suitable offset into
3017  repeat_type. We can also test for auto-possessification. OP_NOT is
3018  currently used only for single-byte chars. */
3019 
3020  else if (*previous == OP_NOT)
3021  {
3022  op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3023  c = previous[1];
3024  if (!possessive_quantifier &&
3025  repeat_max < 0 &&
3026  check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3027  {
3028  repeat_type = 0; /* Force greedy */
3029  possessive_quantifier = TRUE;
3030  }
3031  goto OUTPUT_SINGLE_REPEAT;
3032  }
3033 
3034  /* If previous was a character type match (\d or similar), abolish it and
3035  create a suitable repeat item. The code is shared with single-character
3036  repeats by setting op_type to add a suitable offset into repeat_type. Note
3037  that the Unicode property types will be present only when SUPPORT_UCP is
3038  defined, but we don't wrap the little bits of code here because it just
3039  makes it horribly messy. */
3040 
3041  else if (*previous < OP_EODN)
3042  {
3043  uschar *oldcode;
3044  int prop_type, prop_value;
3045  op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3046  c = *previous;
3047 
3048  if (!possessive_quantifier &&
3049  repeat_max < 0 &&
3050  check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3051  {
3052  repeat_type = 0; /* Force greedy */
3053  possessive_quantifier = TRUE;
3054  }
3055 
3056  OUTPUT_SINGLE_REPEAT:
3057  if (*previous == OP_PROP || *previous == OP_NOTPROP)
3058  {
3059  prop_type = previous[1];
3060  prop_value = previous[2];
3061  }
3062  else prop_type = prop_value = -1;
3063 
3064  oldcode = code;
3065  code = previous; /* Usually overwrite previous item */
3066 
3067  /* If the maximum is zero then the minimum must also be zero; Perl allows
3068  this case, so we do too - by simply omitting the item altogether. */
3069 
3070  if (repeat_max == 0) goto END_REPEAT;
3071 
3072  /* All real repeats make it impossible to handle partial matching (maybe
3073  one day we will be able to remove this restriction). */
3074 
3075  if (repeat_max != 1) cd->nopartial = TRUE;
3076 
3077  /* Combine the op_type with the repeat_type */
3078 
3079  repeat_type += op_type;
3080 
3081  /* A minimum of zero is handled either as the special case * or ?, or as
3082  an UPTO, with the maximum given. */
3083 
3084  if (repeat_min == 0)
3085  {
3086  if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3087  else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3088  else
3089  {
3090  *code++ = OP_UPTO + repeat_type;
3091  PUT2INC(code, 0, repeat_max);
3092  }
3093  }
3094 
3095  /* A repeat minimum of 1 is optimized into some special cases. If the
3096  maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3097  left in place and, if the maximum is greater than 1, we use OP_UPTO with
3098  one less than the maximum. */
3099 
3100  else if (repeat_min == 1)
3101  {
3102  if (repeat_max == -1)
3103  *code++ = OP_PLUS + repeat_type;
3104  else
3105  {
3106  code = oldcode; /* leave previous item in place */
3107  if (repeat_max == 1) goto END_REPEAT;
3108  *code++ = OP_UPTO + repeat_type;
3109  PUT2INC(code, 0, repeat_max - 1);
3110  }
3111  }
3112 
3113  /* The case {n,n} is just an EXACT, while the general case {n,m} is
3114  handled as an EXACT followed by an UPTO. */
3115 
3116  else
3117  {
3118  *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3119  PUT2INC(code, 0, repeat_min);
3120 
3121  /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3122  we have to insert the character for the previous code. For a repeated
3123  Unicode property match, there are two extra bytes that define the
3124  required property. In UTF-8 mode, long characters have their length in
3125  c, with the 0x80 bit as a flag. */
3126 
3127  if (repeat_max < 0)
3128  {
3129 #ifdef SUPPORT_UTF8
3130  if (utf8 && c >= 128)
3131  {
3132  memcpy(code, utf8_char, c & 7);
3133  code += c & 7;
3134  }
3135  else
3136 #endif
3137  {
3138  *code++ = c;
3139  if (prop_type >= 0)
3140  {
3141  *code++ = prop_type;
3142  *code++ = prop_value;
3143  }
3144  }
3145  *code++ = OP_STAR + repeat_type;
3146  }
3147 
3148  /* Else insert an UPTO if the max is greater than the min, again
3149  preceded by the character, for the previously inserted code. If the
3150  UPTO is just for 1 instance, we can use QUERY instead. */
3151 
3152  else if (repeat_max != repeat_min)
3153  {
3154 #ifdef SUPPORT_UTF8
3155  if (utf8 && c >= 128)
3156  {
3157  memcpy(code, utf8_char, c & 7);
3158  code += c & 7;
3159  }
3160  else
3161 #endif
3162  *code++ = c;
3163  if (prop_type >= 0)
3164  {
3165  *code++ = prop_type;
3166  *code++ = prop_value;
3167  }
3168  repeat_max -= repeat_min;
3169 
3170  if (repeat_max == 1)
3171  {
3172  *code++ = OP_QUERY + repeat_type;
3173  }
3174  else
3175  {
3176  *code++ = OP_UPTO + repeat_type;
3177  PUT2INC(code, 0, repeat_max);
3178  }
3179  }
3180  }
3181 
3182  /* The character or character type itself comes last in all cases. */
3183 
3184 #ifdef SUPPORT_UTF8
3185  if (utf8 && c >= 128)
3186  {
3187  memcpy(code, utf8_char, c & 7);
3188  code += c & 7;
3189  }
3190  else
3191 #endif
3192  *code++ = c;
3193 
3194  /* For a repeated Unicode property match, there are two extra bytes that
3195  define the required property. */
3196 
3197 #ifdef SUPPORT_UCP
3198  if (prop_type >= 0)
3199  {
3200  *code++ = prop_type;
3201  *code++ = prop_value;
3202  }
3203 #endif
3204  }
3205 
3206  /* If previous was a character class or a back reference, we put the repeat
3207  stuff after it, but just skip the item if the repeat was {0,0}. */
3208 
3209  else if (*previous == OP_CLASS ||
3210  *previous == OP_NCLASS ||
3211 #ifdef SUPPORT_UTF8
3212  *previous == OP_XCLASS ||
3213 #endif
3214  *previous == OP_REF)
3215  {
3216  if (repeat_max == 0)
3217  {
3218  code = previous;
3219  goto END_REPEAT;
3220  }
3221 
3222  /* All real repeats make it impossible to handle partial matching (maybe
3223  one day we will be able to remove this restriction). */
3224 
3225  if (repeat_max != 1) cd->nopartial = TRUE;
3226 
3227  if (repeat_min == 0 && repeat_max == -1)
3228  *code++ = OP_CRSTAR + repeat_type;
3229  else if (repeat_min == 1 && repeat_max == -1)
3230  *code++ = OP_CRPLUS + repeat_type;
3231  else if (repeat_min == 0 && repeat_max == 1)
3232  *code++ = OP_CRQUERY + repeat_type;
3233  else
3234  {
3235  *code++ = OP_CRRANGE + repeat_type;
3236  PUT2INC(code, 0, repeat_min);
3237  if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3238  PUT2INC(code, 0, repeat_max);
3239  }
3240  }
3241 
3242  /* If previous was a bracket group, we may have to replicate it in certain
3243  cases. */
3244 
3245  else if (*previous == OP_BRA || *previous == OP_CBRA ||
3246  *previous == OP_ONCE || *previous == OP_COND)
3247  {
3248  register int i;
3249  int ketoffset = 0;
3250  int len = code - previous;
3251  uschar *bralink = NULL;
3252 
3253  /* Repeating a DEFINE group is pointless */
3254 
3255  if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3256  {
3257  *errorcodeptr = ERR55;
3258  goto FAILED;
3259  }
3260 
3261  /* This is a paranoid check to stop integer overflow later on */
3262 
3263  if (len > MAX_DUPLENGTH)
3264  {
3265  *errorcodeptr = ERR50;
3266  goto FAILED;
3267  }
3268 
3269  /* If the maximum repeat count is unlimited, find the end of the bracket
3270  by scanning through from the start, and compute the offset back to it
3271  from the current code pointer. There may be an OP_OPT setting following
3272  the final KET, so we can't find the end just by going back from the code
3273  pointer. */
3274 
3275  if (repeat_max == -1)
3276  {
3277  register uschar *ket = previous;
3278  do ket += GET(ket, 1); while (*ket != OP_KET);
3279  ketoffset = code - ket;
3280  }
3281 
3282  /* The case of a zero minimum is special because of the need to stick
3283  OP_BRAZERO in front of it, and because the group appears once in the
3284  data, whereas in other cases it appears the minimum number of times. For
3285  this reason, it is simplest to treat this case separately, as otherwise
3286  the code gets far too messy. There are several special subcases when the
3287  minimum is zero. */
3288 
3289  if (repeat_min == 0)
3290  {
3291  /* If the maximum is also zero, we just omit the group from the output
3292  altogether. */
3293 
3294  if (repeat_max == 0)
3295  {
3296  code = previous;
3297  goto END_REPEAT;
3298  }
3299 
3300  /* If the maximum is 1 or unlimited, we just have to stick in the
3301  BRAZERO and do no more at this point. However, we do need to adjust
3302  any OP_RECURSE calls inside the group that refer to the group itself or
3303  any internal or forward referenced group, because the offset is from
3304  the start of the whole regex. Temporarily terminate the pattern while
3305  doing this. */
3306 
3307  if (repeat_max <= 1)
3308  {
3309  *code = OP_END;
3310  adjust_recurse(previous, 1, utf8, cd, save_hwm);
3311  memmove(previous+1, previous, len);
3312  code++;
3313  *previous++ = OP_BRAZERO + repeat_type;
3314  }
3315 
3316  /* If the maximum is greater than 1 and limited, we have to replicate
3317  in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3318  The first one has to be handled carefully because it's the original
3319  copy, which has to be moved up. The remainder can be handled by code
3320  that is common with the non-zero minimum case below. We have to
3321  adjust the value of repeat_max, since one less copy is required. Once
3322  again, we may have to adjust any OP_RECURSE calls inside the group. */
3323 
3324  else
3325  {
3326  int offset;
3327  *code = OP_END;
3328  adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3329  memmove(previous + 2 + LINK_SIZE, previous, len);
3330  code += 2 + LINK_SIZE;
3331  *previous++ = OP_BRAZERO + repeat_type;
3332  *previous++ = OP_BRA;
3333 
3334  /* We chain together the bracket offset fields that have to be
3335  filled in later when the ends of the brackets are reached. */
3336 
3337  offset = (bralink == NULL)? 0 : previous - bralink;
3338  bralink = previous;
3339  PUTINC(previous, 0, offset);
3340  }
3341 
3342  repeat_max--;
3343  }
3344 
3345  /* If the minimum is greater than zero, replicate the group as many
3346  times as necessary, and adjust the maximum to the number of subsequent
3347  copies that we need. If we set a first char from the group, and didn't
3348  set a required char, copy the latter from the former. If there are any
3349  forward reference subroutine calls in the group, there will be entries on
3350  the workspace list; replicate these with an appropriate increment. */
3351 
3352  else
3353  {
3354  if (repeat_min > 1)
3355  {
3356  /* In the pre-compile phase, we don't actually do the replication. We
3357  just adjust the length as if we had. */
3358 
3359  if (lengthptr != NULL)
3360  *lengthptr += (repeat_min - 1)*length_prevgroup;
3361 
3362  /* This is compiling for real */
3363 
3364  else
3365  {
3366  if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3367  for (i = 1; i < repeat_min; i++)
3368  {
3369  uschar *hc;
3370  uschar *this_hwm = cd->hwm;
3371  memcpy(code, previous, len);
3372  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3373  {
3374  PUT(cd->hwm, 0, GET(hc, 0) + len);
3375  cd->hwm += LINK_SIZE;
3376  }
3377  save_hwm = this_hwm;
3378  code += len;
3379  }
3380  }
3381  }
3382 
3383  if (repeat_max > 0) repeat_max -= repeat_min;
3384  }
3385 
3386  /* This code is common to both the zero and non-zero minimum cases. If
3387  the maximum is limited, it replicates the group in a nested fashion,
3388  remembering the bracket starts on a stack. In the case of a zero minimum,
3389  the first one was set up above. In all cases the repeat_max now specifies
3390  the number of additional copies needed. Again, we must remember to
3391  replicate entries on the forward reference list. */
3392 
3393  if (repeat_max >= 0)
3394  {
3395  /* In the pre-compile phase, we don't actually do the replication. We
3396  just adjust the length as if we had. For each repetition we must add 1
3397  to the length for BRAZERO and for all but the last repetition we must
3398  add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3399 
3400  if (lengthptr != NULL && repeat_max > 0)
3401  *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3402  2 - 2*LINK_SIZE; /* Last one doesn't nest */
3403 
3404  /* This is compiling for real */
3405 
3406  else for (i = repeat_max - 1; i >= 0; i--)
3407  {
3408  uschar *hc;
3409  uschar *this_hwm = cd->hwm;
3410 
3411  *code++ = OP_BRAZERO + repeat_type;
3412 
3413  /* All but the final copy start a new nesting, maintaining the
3414  chain of brackets outstanding. */
3415 
3416  if (i != 0)
3417  {
3418  int offset;
3419  *code++ = OP_BRA;
3420  offset = (bralink == NULL)? 0 : code - bralink;
3421  bralink = code;
3422  PUTINC(code, 0, offset);
3423  }
3424 
3425  memcpy(code, previous, len);
3426  for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3427  {
3428  PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3429  cd->hwm += LINK_SIZE;
3430  }
3431  save_hwm = this_hwm;
3432  code += len;
3433  }
3434 
3435  /* Now chain through the pending brackets, and fill in their length
3436  fields (which are holding the chain links pro tem). */
3437 
3438  while (bralink != NULL)
3439  {
3440  int oldlinkoffset;
3441  int offset = code - bralink + 1;
3442  uschar *bra = code - offset;
3443  oldlinkoffset = GET(bra, 1);
3444  bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3445  *code++ = OP_KET;
3446  PUTINC(code, 0, offset);
3447  PUT(bra, 1, offset);
3448  }
3449  }
3450 
3451  /* If the maximum is unlimited, set a repeater in the final copy. We
3452  can't just offset backwards from the current code point, because we
3453  don't know if there's been an options resetting after the ket. The
3454  correct offset was computed above.
3455 
3456  Then, when we are doing the actual compile phase, check to see whether
3457  this group is a non-atomic one that could match an empty string. If so,
3458  convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3459  that runtime checking can be done. [This check is also applied to
3460  atomic groups at runtime, but in a different way.] */
3461 
3462  else
3463  {
3464  uschar *ketcode = code - ketoffset;
3465  uschar *bracode = ketcode - GET(ketcode, 1);
3466  *ketcode = OP_KETRMAX + repeat_type;
3467  if (lengthptr == NULL && *bracode != OP_ONCE)
3468  {
3469  uschar *scode = bracode;
3470  do
3471  {
3472  if (could_be_empty_branch(scode, ketcode, utf8))
3473  {
3474  *bracode += OP_SBRA - OP_BRA;
3475  break;
3476  }
3477  scode += GET(scode, 1);
3478  }
3479  while (*scode == OP_ALT);
3480  }
3481  }
3482  }
3483 
3484  /* Else there's some kind of shambles */
3485 
3486  else
3487  {
3488  *errorcodeptr = ERR11;
3489  goto FAILED;
3490  }
3491 
3492  /* If the character following a repeat is '+', or if certain optimization
3493  tests above succeeded, possessive_quantifier is TRUE. For some of the
3494  simpler opcodes, there is a special alternative opcode for this. For
3495  anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3496  The '+' notation is just syntactic sugar, taken from Sun's Java package,
3497  but the special opcodes can optimize it a bit. The repeated item starts at
3498  tempcode, not at previous, which might be the first part of a string whose
3499  (former) last char we repeated.
3500 
3501  Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3502  an 'upto' may follow. We skip over an 'exact' item, and then test the
3503  length of what remains before proceeding. */
3504 
3505  if (possessive_quantifier)
3506  {
3507  int len;
3508  if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3509  *tempcode == OP_NOTEXACT)
3510  tempcode += _pcre_OP_lengths[*tempcode];
3511  len = code - tempcode;
3512  if (len > 0) switch (*tempcode)
3513  {
3514  case OP_STAR: *tempcode = OP_POSSTAR; break;
3515  case OP_PLUS: *tempcode = OP_POSPLUS; break;
3516  case OP_QUERY: *tempcode = OP_POSQUERY; break;
3517  case OP_UPTO: *tempcode = OP_POSUPTO; break;
3518 
3519  case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3520  case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3521  case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3522  case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3523 
3524  case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3525  case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3526  case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3527  case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3528 
3529  default:
3530  memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3531  code += 1 + LINK_SIZE;
3532  len += 1 + LINK_SIZE;
3533  tempcode[0] = OP_ONCE;
3534  *code++ = OP_KET;
3535  PUTINC(code, 0, len);
3536  PUT(tempcode, 1, len);
3537  break;
3538  }
3539  }
3540 
3541  /* In all cases we no longer have a previous item. We also set the
3542  "follows varying string" flag for subsequently encountered reqbytes if
3543  it isn't already set and we have just passed a varying length item. */
3544 
3545  END_REPEAT:
3546  previous = NULL;
3547  cd->req_varyopt |= reqvary;
3548  break;
3549 
3550 
3551  /* ===================================================================*/
3552  /* Start of nested parenthesized sub-expression, or comment or lookahead or
3553  lookbehind or option setting or condition or all the other extended
3554  parenthesis forms. First deal with the specials; all are introduced by ?,
3555  and the appearance of any of them means that this is not a capturing
3556  group. */
3557 
3558  case '(':
3559  newoptions = options;
3560  skipbytes = 0;
3561  bravalue = OP_CBRA;
3562  save_hwm = cd->hwm;
3563 
3564  if (*(++ptr) == '?')
3565  {
3566  int i, set, unset, namelen;
3567  int *optset;
3568  const uschar *name;
3569  uschar *slot;
3570 
3571  switch (*(++ptr))
3572  {
3573  case '#': /* Comment; skip to ket */
3574  ptr++;
3575  while (*ptr != 0 && *ptr != ')') ptr++;
3576  if (*ptr == 0)
3577  {
3578  *errorcodeptr = ERR18;
3579  goto FAILED;
3580  }
3581  continue;
3582 
3583 
3584  /* ------------------------------------------------------------ */
3585  case ':': /* Non-capturing bracket */
3586  bravalue = OP_BRA;
3587  ptr++;
3588  break;
3589 
3590 
3591  /* ------------------------------------------------------------ */
3592  case '(':
3593  bravalue = OP_COND; /* Conditional group */
3594 
3595  /* A condition can be an assertion, a number (referring to a numbered
3596  group), a name (referring to a named group), or 'R', referring to
3597  recursion. R<digits> and R&name are also permitted for recursion tests.
3598 
3599  There are several syntaxes for testing a named group: (?(name)) is used
3600  by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3601 
3602  There are two unfortunate ambiguities, caused by history. (a) 'R' can
3603  be the recursive thing or the name 'R' (and similarly for 'R' followed
3604  by digits), and (b) a number could be a name that consists of digits.
3605  In both cases, we look for a name first; if not found, we try the other
3606  cases. */
3607 
3608  /* For conditions that are assertions, check the syntax, and then exit
3609  the switch. This will take control down to where bracketed groups,
3610  including assertions, are processed. */
3611 
3612  if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3613  break;
3614 
3615  /* Most other conditions use OP_CREF (a couple change to OP_RREF
3616  below), and all need to skip 3 bytes at the start of the group. */
3617 
3618  code[1+LINK_SIZE] = OP_CREF;
3619  skipbytes = 3;
3620 
3621  /* Check for a test for recursion in a named group. */
3622 
3623  if (ptr[1] == 'R' && ptr[2] == '&')
3624  {
3625  terminator = -1;
3626  ptr += 2;
3627  code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3628  }
3629 
3630  /* Check for a test for a named group's having been set, using the Perl
3631  syntax (?(<name>) or (?('name') */
3632 
3633  else if (ptr[1] == '<')
3634  {
3635  terminator = '>';
3636  ptr++;
3637  }
3638  else if (ptr[1] == '\'')
3639  {
3640  terminator = '\'';
3641  ptr++;
3642  }
3643  else terminator = 0;
3644 
3645  /* We now expect to read a name; anything else is an error */
3646 
3647  if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3648  {
3649  ptr += 1; /* To get the right offset */
3650  *errorcodeptr = ERR28;
3651  goto FAILED;
3652  }
3653 
3654  /* Read the name, but also get it as a number if it's all digits */
3655 
3656  recno = 0;
3657  name = ++ptr;
3658  while ((cd->ctypes[*ptr] & ctype_word) != 0)
3659  {
3660  if (recno >= 0)
3661  recno = ((digitab[*ptr] & ctype_digit) != 0)?
3662  recno * 10 + *ptr - '0' : -1;
3663  ptr++;
3664  }
3665  namelen = ptr - name;
3666 
3667  if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3668  {
3669  ptr--; /* Error offset */
3670  *errorcodeptr = ERR26;
3671  goto FAILED;
3672  }
3673 
3674  /* Do no further checking in the pre-compile phase. */
3675 
3676  if (lengthptr != NULL) break;
3677 
3678  /* In the real compile we do the work of looking for the actual
3679  reference. */
3680 
3681  slot = cd->name_table;
3682  for (i = 0; i < cd->names_found; i++)
3683  {
3684  if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3685  slot += cd->name_entry_size;
3686  }
3687 
3688  /* Found a previous named subpattern */
3689 
3690  if (i < cd->names_found)
3691  {
3692  recno = GET2(slot, 0);
3693  PUT2(code, 2+LINK_SIZE, recno);
3694  }
3695 
3696  /* Search the pattern for a forward reference */
3697 
3698  else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3699  (options & PCRE_EXTENDED) != 0)) > 0)
3700  {
3701  PUT2(code, 2+LINK_SIZE, i);
3702  }
3703 
3704  /* If terminator == 0 it means that the name followed directly after
3705  the opening parenthesis [e.g. (?(abc)...] and in this case there are
3706  some further alternatives to try. For the cases where terminator != 0
3707  [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3708  now checked all the possibilities, so give an error. */
3709 
3710  else if (terminator != 0)
3711  {
3712  *errorcodeptr = ERR15;
3713  goto FAILED;
3714  }
3715 
3716  /* Check for (?(R) for recursion. Allow digits after R to specify a
3717  specific group number. */
3718 
3719  else if (*name == 'R')
3720  {
3721  recno = 0;
3722  for (i = 1; i < namelen; i++)
3723  {
3724  if ((digitab[name[i]] & ctype_digit) == 0)
3725  {
3726  *errorcodeptr = ERR15;
3727  goto FAILED;
3728  }
3729  recno = recno * 10 + name[i] - '0';
3730  }
3731  if (recno == 0) recno = RREF_ANY;
3732  code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3733  PUT2(code, 2+LINK_SIZE, recno);
3734  }
3735 
3736  /* Similarly, check for the (?(DEFINE) "condition", which is always
3737  false. */
3738 
3739  else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3740  {
3741  code[1+LINK_SIZE] = OP_DEF;
3742  skipbytes = 1;
3743  }
3744 
3745  /* Check for the "name" actually being a subpattern number. */
3746 
3747  else if (recno > 0)
3748  {
3749  PUT2(code, 2+LINK_SIZE, recno);
3750  }
3751 
3752  /* Either an unidentified subpattern, or a reference to (?(0) */
3753 
3754  else
3755  {
3756  *errorcodeptr = (recno == 0)? ERR35: ERR15;
3757  goto FAILED;
3758  }
3759  break;
3760 
3761 
3762  /* ------------------------------------------------------------ */
3763  case '=': /* Positive lookahead */
3764  bravalue = OP_ASSERT;
3765  ptr++;
3766  break;
3767 
3768 
3769  /* ------------------------------------------------------------ */
3770  case '!': /* Negative lookahead */
3771  bravalue = OP_ASSERT_NOT;
3772  ptr++;
3773  break;
3774 
3775 
3776  /* ------------------------------------------------------------ */
3777  case '<': /* Lookbehind or named define */
3778  switch (ptr[1])
3779  {
3780  case '=': /* Positive lookbehind */
3781  bravalue = OP_ASSERTBACK;
3782  ptr += 2;
3783  break;
3784 
3785  case '!': /* Negative lookbehind */
3786  bravalue = OP_ASSERTBACK_NOT;
3787  ptr += 2;
3788  break;
3789 
3790  default: /* Could be name define, else bad */
3791  if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3792  ptr++; /* Correct offset for error */
3793  *errorcodeptr = ERR24;
3794  goto FAILED;
3795  }
3796  break;
3797 
3798 
3799  /* ------------------------------------------------------------ */
3800  case '>': /* One-time brackets */
3801  bravalue = OP_ONCE;
3802  ptr++;
3803  break;
3804 
3805 
3806  /* ------------------------------------------------------------ */
3807  case 'C': /* Callout - may be followed by digits; */
3808  previous_callout = code; /* Save for later completion */
3809  after_manual_callout = 1; /* Skip one item before completing */
3810  *code++ = OP_CALLOUT;
3811  {
3812  int n = 0;
3813  while ((digitab[*(++ptr)] & ctype_digit) != 0)
3814  n = n * 10 + *ptr - '0';
3815  if (*ptr != ')')
3816  {
3817  *errorcodeptr = ERR39;
3818  goto FAILED;
3819  }
3820  if (n > 255)
3821  {
3822  *errorcodeptr = ERR38;
3823  goto FAILED;
3824  }
3825  *code++ = n;
3826  PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3827  PUT(code, LINK_SIZE, 0); /* Default length */
3828  code += 2 * LINK_SIZE;
3829  }
3830  previous = NULL;
3831  continue;
3832 
3833 
3834  /* ------------------------------------------------------------ */
3835  case 'P': /* Python-style named subpattern handling */
3836  if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3837  {
3838  is_recurse = *ptr == '>';
3839  terminator = ')';
3840  goto NAMED_REF_OR_RECURSE;
3841  }
3842  else if (*ptr != '<') /* Test for Python-style definition */
3843  {
3844  *errorcodeptr = ERR41;
3845  goto FAILED;
3846  }
3847  /* Fall through to handle (?P< as (?< is handled */
3848 
3849 
3850  /* ------------------------------------------------------------ */
3851  DEFINE_NAME: /* Come here from (?< handling */
3852  case '\'':
3853  {
3854  terminator = (*ptr == '<')? '>' : '\'';
3855  name = ++ptr;
3856 
3857  while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3858  namelen = ptr - name;
3859 
3860  /* In the pre-compile phase, just do a syntax check. */
3861 
3862  if (lengthptr != NULL)
3863  {
3864  if (*ptr != terminator)
3865  {
3866  *errorcodeptr = ERR42;
3867  goto FAILED;
3868  }
3869  if (cd->names_found >= MAX_NAME_COUNT)
3870  {
3871  *errorcodeptr = ERR49;
3872  goto FAILED;
3873  }
3874  if (namelen + 3 > cd->name_entry_size)
3875  {
3876  cd->name_entry_size = namelen + 3;
3877  if (namelen > MAX_NAME_SIZE)
3878  {
3879  *errorcodeptr = ERR48;
3880  goto FAILED;
3881  }
3882  }
3883  }
3884 
3885  /* In the real compile, create the entry in the table */
3886 
3887  else
3888  {
3889  slot = cd->name_table;
3890  for (i = 0; i < cd->names_found; i++)
3891  {
3892  int crc = memcmp(name, slot+2, namelen);
3893  if (crc == 0)
3894  {
3895  if (slot[2+namelen] == 0)
3896  {
3897  if ((options & PCRE_DUPNAMES) == 0)
3898  {
3899  *errorcodeptr = ERR43;
3900  goto FAILED;
3901  }
3902  }
3903  else crc = -1; /* Current name is substring */
3904  }
3905  if (crc < 0)
3906  {
3907  memmove(slot + cd->name_entry_size, slot,
3908  (cd->names_found - i) * cd->name_entry_size);
3909  break;
3910  }
3911  slot += cd->name_entry_size;
3912  }
3913 
3914  PUT2(slot, 0, cd->bracount + 1);
3915  memcpy(slot + 2, name, namelen);
3916  slot[2+namelen] = 0;
3917  }
3918  }
3919 
3920  /* In both cases, count the number of names we've encountered. */
3921 
3922  ptr++; /* Move past > or ' */
3923  cd->names_found++;
3924  goto NUMBERED_GROUP;
3925 
3926 
3927  /* ------------------------------------------------------------ */
3928  case '&': /* Perl recursion/subroutine syntax */
3929  terminator = ')';
3930  is_recurse = TRUE;
3931  /* Fall through */
3932 
3933  /* We come here from the Python syntax above that handles both
3934  references (?P=name) and recursion (?P>name), as well as falling
3935  through from the Perl recursion syntax (?&name). */
3936 
3937  NAMED_REF_OR_RECURSE:
3938  name = ++ptr;
3939  while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3940  namelen = ptr - name;
3941 
3942  /* In the pre-compile phase, do a syntax check and set a dummy
3943  reference number. */
3944 
3945  if (lengthptr != NULL)
3946  {
3947  if (*ptr != terminator)
3948  {
3949  *errorcodeptr = ERR42;
3950  goto FAILED;
3951  }
3952  if (namelen > MAX_NAME_SIZE)
3953  {
3954  *errorcodeptr = ERR48;
3955  goto FAILED;
3956  }
3957  recno = 0;
3958  }
3959 
3960  /* In the real compile, seek the name in the table */
3961 
3962  else
3963  {
3964  slot = cd->name_table;
3965  for (i = 0; i < cd->names_found; i++)
3966  {
3967  if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3968  slot += cd->name_entry_size;
3969  }
3970 
3971  if (i < cd->names_found) /* Back reference */
3972  {
3973  recno = GET2(slot, 0);
3974  }
3975  else if ((recno = /* Forward back reference */
3976  find_parens(ptr, cd->bracount, name, namelen,
3977  (options & PCRE_EXTENDED) != 0)) <= 0)
3978  {
3979  *errorcodeptr = ERR15;
3980  goto FAILED;
3981  }
3982  }
3983 
3984  /* In both phases, we can now go to the code that handles numerical
3985  recursion or backreferences. */
3986 
3987  if (is_recurse) goto HANDLE_RECURSION;
3988  else goto HANDLE_REFERENCE;
3989 
3990 
3991  /* ------------------------------------------------------------ */
3992  case 'R': /* Recursion */
3993  ptr++; /* Same as (?0) */
3994  /* Fall through */
3995 
3996 
3997  /* ------------------------------------------------------------ */
3998  case '0': case '1': case '2': case '3': case '4': /* Recursion or */
3999  case '5': case '6': case '7': case '8': case '9': /* subroutine */
4000  {
4001  const uschar *called;
4002  recno = 0;
4003  while((digitab[*ptr] & ctype_digit) != 0)
4004  recno = recno * 10 + *ptr++ - '0';
4005  if (*ptr != ')')
4006  {
4007  *errorcodeptr = ERR29;
4008  goto FAILED;
4009  }
4010 
4011  /* Come here from code above that handles a named recursion */
4012 
4013  HANDLE_RECURSION:
4014 
4015  previous = code;
4016  called = cd->start_code;
4017 
4018  /* When we are actually compiling, find the bracket that is being
4019  referenced. Temporarily end the regex in case it doesn't exist before
4020  this point. If we end up with a forward reference, first check that
4021  the bracket does occur later so we can give the error (and position)
4022  now. Then remember this forward reference in the workspace so it can
4023  be filled in at the end. */
4024 
4025  if (lengthptr == NULL)
4026  {
4027  *code = OP_END;
4028  if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4029 
4030  /* Forward reference */
4031 
4032  if (called == NULL)
4033  {
4034  if (find_parens(ptr, cd->bracount, NULL, recno,
4035  (options & PCRE_EXTENDED) != 0) < 0)
4036  {
4037  *errorcodeptr = ERR15;
4038  goto FAILED;
4039  }
4040  called = cd->start_code + recno;
4041  PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4042  }
4043 
4044  /* If not a forward reference, and the subpattern is still open,
4045  this is a recursive call. We check to see if this is a left
4046  recursion that could loop for ever, and diagnose that case. */
4047 
4048  else if (GET(called, 1) == 0 &&
4049  could_be_empty(called, code, bcptr, utf8))
4050  {
4051  *errorcodeptr = ERR40;
4052  goto FAILED;
4053  }
4054  }
4055 
4056  /* Insert the recursion/subroutine item, automatically wrapped inside
4057  "once" brackets. Set up a "previous group" length so that a
4058  subsequent quantifier will work. */
4059 
4060  *code = OP_ONCE;
4061  PUT(code, 1, 2 + 2*LINK_SIZE);
4062  code += 1 + LINK_SIZE;
4063 
4064  *code = OP_RECURSE;
4065  PUT(code, 1, called - cd->start_code);
4066  code += 1 + LINK_SIZE;
4067 
4068  *code = OP_KET;
4069  PUT(code, 1, 2 + 2*LINK_SIZE);
4070  code += 1 + LINK_SIZE;
4071 
4072  length_prevgroup = 3 + 3*LINK_SIZE;
4073  }
4074 
4075  /* Can't determine a first byte now */
4076 
4077  if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4078  continue;
4079 
4080 
4081  /* ------------------------------------------------------------ */
4082  default: /* Other characters: check option setting */
4083  set = unset = 0;
4084  optset = &set;
4085 
4086  while (*ptr != ')' && *ptr != ':')
4087  {
4088  switch (*ptr++)
4089  {
4090  case '-': optset = &unset; break;
4091 
4092  case 'J': /* Record that it changed in the external options */
4093  *optset |= PCRE_DUPNAMES;
4095  break;
4096 
4097  case 'i': *optset |= PCRE_CASELESS; break;
4098  case 'm': *optset |= PCRE_MULTILINE; break;
4099  case 's': *optset |= PCRE_DOTALL; break;
4100  case 'x': *optset |= PCRE_EXTENDED; break;
4101  case 'U': *optset |= PCRE_UNGREEDY; break;
4102  case 'X': *optset |= PCRE_EXTRA; break;
4103 
4104  default: *errorcodeptr = ERR12;
4105  ptr--; /* Correct the offset */
4106  goto FAILED;
4107  }
4108  }
4109 
4110  /* Set up the changed option bits, but don't change anything yet. */
4111 
4112  newoptions = (options | set) & (~unset);
4113 
4114  /* If the options ended with ')' this is not the start of a nested
4115  group with option changes, so the options change at this level. If this
4116  item is right at the start of the pattern, the options can be
4117  abstracted and made external in the pre-compile phase, and ignored in
4118  the compile phase. This can be helpful when matching -- for instance in
4119  caseless checking of required bytes.
4120 
4121  If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4122  definitely *not* at the start of the pattern because something has been
4123  compiled. In the pre-compile phase, however, the code pointer can have
4124  that value after the start, because it gets reset as code is discarded
4125  during the pre-compile. However, this can happen only at top level - if
4126  we are within parentheses, the starting BRA will still be present. At
4127  any parenthesis level, the length value can be used to test if anything
4128  has been compiled at that level. Thus, a test for both these conditions
4129  is necessary to ensure we correctly detect the start of the pattern in
4130  both phases.
4131 
4132  If we are not at the pattern start, compile code to change the ims
4133  options if this setting actually changes any of them. We also pass the
4134  new setting back so that it can be put at the start of any following
4135  branches, and when this group ends (if we are in a group), a resetting
4136  item can be compiled. */
4137 
4138  if (*ptr == ')')
4139  {
4140  if (code == cd->start_code + 1 + LINK_SIZE &&
4141  (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4142  {
4143  cd->external_options = newoptions;
4144  options = newoptions;
4145  }
4146  else
4147  {
4148  if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4149  {
4150  *code++ = OP_OPT;
4151  *code++ = newoptions & PCRE_IMS;
4152  }
4153 
4154  /* Change options at this level, and pass them back for use
4155  in subsequent branches. Reset the greedy defaults and the case
4156  value for firstbyte and reqbyte. */
4157 
4158  *optionsptr = options = newoptions;
4159  greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4160  greedy_non_default = greedy_default ^ 1;
4161  req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4162  }
4163 
4164  previous = NULL; /* This item can't be repeated */
4165  continue; /* It is complete */
4166  }
4167 
4168  /* If the options ended with ':' we are heading into a nested group
4169  with possible change of options. Such groups are non-capturing and are
4170  not assertions of any kind. All we need to do is skip over the ':';
4171  the newoptions value is handled below. */
4172 
4173  bravalue = OP_BRA;
4174  ptr++;
4175  } /* End of switch for character following (? */
4176  } /* End of (? handling */
4177 
4178  /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4179  all unadorned brackets become non-capturing and behave like (?:...)
4180  brackets. */
4181 
4182  else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4183  {
4184  bravalue = OP_BRA;
4185  }
4186 
4187  /* Else we have a capturing group. */
4188 
4189  else
4190  {
4191  NUMBERED_GROUP:
4192  cd->bracount += 1;
4193  PUT2(code, 1+LINK_SIZE, cd->bracount);
4194  skipbytes = 2;
4195  }
4196 
4197  /* Process nested bracketed regex. Assertions may not be repeated, but
4198  other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4199  non-register variable in order to be able to pass its address because some
4200  compilers complain otherwise. Pass in a new setting for the ims options if
4201  they have changed. */
4202 
4203  previous = (bravalue >= OP_ONCE)? code : NULL;
4204  *code = bravalue;
4205  tempcode = code;
4206  tempreqvary = cd->req_varyopt; /* Save value before bracket */
4207  length_prevgroup = 0; /* Initialize for pre-compile phase */
4208 
4209  if (!compile_regex(
4210  newoptions, /* The complete new option state */
4211  options & PCRE_IMS, /* The previous ims option state */
4212  &tempcode, /* Where to put code (updated) */
4213  &ptr, /* Input pointer (updated) */
4214  errorcodeptr, /* Where to put an error message */
4215  (bravalue == OP_ASSERTBACK ||
4216  bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4217  skipbytes, /* Skip over bracket number */
4218  &subfirstbyte, /* For possible first char */
4219  &subreqbyte, /* For possible last char */
4220  bcptr, /* Current branch chain */
4221  cd, /* Tables block */
4222  (lengthptr == NULL)? NULL : /* Actual compile phase */
4223  &length_prevgroup /* Pre-compile phase */
4224  ))
4225  goto FAILED;
4226 
4227  /* At the end of compiling, code is still pointing to the start of the
4228  group, while tempcode has been updated to point past the end of the group
4229  and any option resetting that may follow it. The pattern pointer (ptr)
4230  is on the bracket. */
4231 
4232  /* If this is a conditional bracket, check that there are no more than
4233  two branches in the group, or just one if it's a DEFINE group. */
4234 
4235  if (bravalue == OP_COND)
4236  {
4237  uschar *tc = code;
4238  int condcount = 0;
4239 
4240  do {
4241  condcount++;
4242  tc += GET(tc,1);
4243  }
4244  while (*tc != OP_KET);
4245 
4246  /* A DEFINE group is never obeyed inline (the "condition" is always
4247  false). It must have only one branch. */
4248 
4249  if (code[LINK_SIZE+1] == OP_DEF)
4250  {
4251  if (condcount > 1)
4252  {
4253  *errorcodeptr = ERR54;
4254  goto FAILED;
4255  }
4256  bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4257  }
4258 
4259  /* A "normal" conditional group. If there is just one branch, we must not
4260  make use of its firstbyte or reqbyte, because this is equivalent to an
4261  empty second branch. */
4262 
4263  else
4264  {
4265  if (condcount > 2)
4266  {
4267  *errorcodeptr = ERR27;
4268  goto FAILED;
4269  }
4270  if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4271  }
4272  }
4273 
4274  /* Error if hit end of pattern */
4275 
4276  if (*ptr != ')')
4277  {
4278  *errorcodeptr = ERR14;
4279  goto FAILED;
4280  }
4281 
4282  /* In the pre-compile phase, update the length by the length of the nested
4283  group, less the brackets at either end. Then reduce the compiled code to
4284  just the brackets so that it doesn't use much memory if it is duplicated by
4285  a quantifier. */
4286 
4287  if (lengthptr != NULL)
4288  {
4289  *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4290  code++;
4291  PUTINC(code, 0, 1 + LINK_SIZE);
4292  *code++ = OP_KET;
4293  PUTINC(code, 0, 1 + LINK_SIZE);
4294  }
4295 
4296  /* Otherwise update the main code pointer to the end of the group. */
4297 
4298  else code = tempcode;
4299 
4300  /* For a DEFINE group, required and first character settings are not
4301  relevant. */
4302 
4303  if (bravalue == OP_DEF) break;
4304 
4305  /* Handle updating of the required and first characters for other types of
4306  group. Update for normal brackets of all kinds, and conditions with two
4307  branches (see code above). If the bracket is followed by a quantifier with
4308  zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4309  zerofirstbyte outside the main loop so that they can be accessed for the
4310  back off. */
4311 
4312  zeroreqbyte = reqbyte;
4313  zerofirstbyte = firstbyte;
4314  groupsetfirstbyte = FALSE;
4315 
4316  if (bravalue >= OP_ONCE)
4317  {
4318  /* If we have not yet set a firstbyte in this branch, take it from the
4319  subpattern, remembering that it was set here so that a repeat of more
4320  than one can replicate it as reqbyte if necessary. If the subpattern has
4321  no firstbyte, set "none" for the whole branch. In both cases, a zero
4322  repeat forces firstbyte to "none". */
4323 
4324  if (firstbyte == REQ_UNSET)
4325  {
4326  if (subfirstbyte >= 0)
4327  {
4328  firstbyte = subfirstbyte;
4329  groupsetfirstbyte = TRUE;
4330  }
4331  else firstbyte = REQ_NONE;
4332  zerofirstbyte = REQ_NONE;
4333  }
4334 
4335  /* If firstbyte was previously set, convert the subpattern's firstbyte
4336  into reqbyte if there wasn't one, using the vary flag that was in
4337  existence beforehand. */
4338 
4339  else if (subfirstbyte >= 0 && subreqbyte < 0)
4340  subreqbyte = subfirstbyte | tempreqvary;
4341 
4342  /* If the subpattern set a required byte (or set a first byte that isn't
4343  really the first byte - see above), set it. */
4344 
4345  if (subreqbyte >= 0) reqbyte = subreqbyte;
4346  }
4347 
4348  /* For a forward assertion, we take the reqbyte, if set. This can be
4349  helpful if the pattern that follows the assertion doesn't set a different
4350  char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4351  for an assertion, however because it leads to incorrect effect for patterns
4352  such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4353  of a firstbyte. This is overcome by a scan at the end if there's no
4354  firstbyte, looking for an asserted first char. */
4355 
4356  else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4357  break; /* End of processing '(' */
4358 
4359 
4360  /* ===================================================================*/
4361  /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4362  are arranged to be the negation of the corresponding OP_values. For the
4363  back references, the values are ESC_REF plus the reference number. Only
4364  back references and those types that consume a character may be repeated.
4365  We can test for values between ESC_b and ESC_Z for the latter; this may
4366  have to change if any new ones are ever created. */
4367 
4368  case '\\':
4369  tempptr = ptr;
4370  c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4371  if (*errorcodeptr != 0) goto FAILED;
4372 
4373  if (c < 0)
4374  {
4375  if (-c == ESC_Q) /* Handle start of quoted string */
4376  {
4377  if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4378  else inescq = TRUE;
4379  continue;
4380  }
4381 
4382  if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4383 
4384  /* For metasequences that actually match a character, we disable the
4385  setting of a first character if it hasn't already been set. */
4386 
4387  if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4388  firstbyte = REQ_NONE;
4389 
4390  /* Set values to reset to if this is followed by a zero repeat. */
4391 
4392  zerofirstbyte = firstbyte;
4393  zeroreqbyte = reqbyte;
4394 
4395  /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
4396 
4397  if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
4398  {
4399  is_recurse = FALSE;
4400  terminator = (*(++ptr) == '<')? '>' : '\'';
4401  goto NAMED_REF_OR_RECURSE;
4402  }
4403 
4404  /* Back references are handled specially; must disable firstbyte if
4405  not set to cope with cases like (?=(\w+))\1: which would otherwise set
4406  ':' later. */
4407 
4408  if (-c >= ESC_REF)
4409  {
4410  recno = -c - ESC_REF;
4411 
4412  HANDLE_REFERENCE: /* Come here from named backref handling */
4413  if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4414  previous = code;
4415  *code++ = OP_REF;
4416  PUT2INC(code, 0, recno);
4417  cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4418  if (recno > cd->top_backref) cd->top_backref = recno;
4419  }
4420 
4421  /* So are Unicode property matches, if supported. */
4422 
4423 #ifdef SUPPORT_UCP
4424  else if (-c == ESC_P || -c == ESC_p)
4425  {
4426  BOOL negated;
4427  int pdata;
4428  int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4429  if (ptype < 0) goto FAILED;
4430  previous = code;
4431  *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4432  *code++ = ptype;
4433  *code++ = pdata;
4434  }
4435 #else
4436 
4437  /* If Unicode properties are not supported, \X, \P, and \p are not
4438  allowed. */
4439 
4440  else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4441  {
4442  *errorcodeptr = ERR45;
4443  goto FAILED;
4444  }
4445 #endif
4446 
4447  /* For the rest (including \X when Unicode properties are supported), we
4448  can obtain the OP value by negating the escape value. */
4449 
4450  else
4451  {
4452  previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4453  *code++ = -c;
4454  }
4455  continue;
4456  }
4457 
4458  /* We have a data character whose value is in c. In UTF-8 mode it may have
4459  a value > 127. We set its representation in the length/buffer, and then
4460  handle it as a data character. */
4461 
4462 #ifdef SUPPORT_UTF8
4463  if (utf8 && c > 127)
4464  mclength = _pcre_ord2utf8(c, mcbuffer);
4465  else
4466 #endif
4467 
4468  {
4469  mcbuffer[0] = c;
4470  mclength = 1;
4471  }
4472  goto ONE_CHAR;
4473 
4474 
4475  /* ===================================================================*/
4476  /* Handle a literal character. It is guaranteed not to be whitespace or #
4477  when the extended flag is set. If we are in UTF-8 mode, it may be a
4478  multi-byte literal character. */
4479 
4480  default:
4481  NORMAL_CHAR:
4482  mclength = 1;
4483  mcbuffer[0] = c;
4484 
4485 #ifdef SUPPORT_UTF8
4486  if (utf8 && c >= 0xc0)
4487  {
4488  while ((ptr[1] & 0xc0) == 0x80)
4489  mcbuffer[mclength++] = *(++ptr);
4490  }
4491 #endif
4492 
4493  /* At this point we have the character's bytes in mcbuffer, and the length
4494  in mclength. When not in UTF-8 mode, the length is always 1. */
4495 
4496  ONE_CHAR:
4497  previous = code;
4498  *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4499  for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4500 
4501  /* Set the first and required bytes appropriately. If no previous first
4502  byte, set it from this character, but revert to none on a zero repeat.
4503  Otherwise, leave the firstbyte value alone, and don't change it on a zero
4504  repeat. */
4505 
4506  if (firstbyte == REQ_UNSET)
4507  {
4508  zerofirstbyte = REQ_NONE;
4509  zeroreqbyte = reqbyte;
4510 
4511  /* If the character is more than one byte long, we can set firstbyte
4512  only if it is not to be matched caselessly. */
4513 
4514  if (mclength == 1 || req_caseopt == 0)
4515  {
4516  firstbyte = mcbuffer[0] | req_caseopt;
4517  if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4518  }
4519  else firstbyte = reqbyte = REQ_NONE;
4520  }
4521 
4522  /* firstbyte was previously set; we can set reqbyte only the length is
4523  1 or the matching is caseful. */
4524 
4525  else
4526  {
4527  zerofirstbyte = firstbyte;
4528  zeroreqbyte = reqbyte;
4529  if (mclength == 1 || req_caseopt == 0)
4530  reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4531  }
4532 
4533  break; /* End of literal character handling */
4534  }
4535  } /* end of big loop */
4536 
4537 
4538 /* Control never reaches here by falling through, only by a goto for all the
4539 error states. Pass back the position in the pattern so that it can be displayed
4540 to the user for diagnosing the error. */
4541 
4542 FAILED:
4543 *ptrptr = ptr;
4544 return FALSE;
4545 }
4546 
4547 
4548 
4549 
4550 /*************************************************
4551 * Compile sequence of alternatives *
4552 *************************************************/
4553 
4554 /* On entry, ptr is pointing past the bracket character, but on return it
4555 points to the closing bracket, or vertical bar, or end of string. The code
4556 variable is pointing at the byte into which the BRA operator has been stored.
4557 If the ims options are changed at the start (for a (?ims: group) or during any
4558 branch, we need to insert an OP_OPT item at the start of every following branch
4559 to ensure they get set correctly at run time, and also pass the new options
4560 into every subsequent branch compile.
4561 
4562 This function is used during the pre-compile phase when we are trying to find
4563 out the amount of memory needed, as well as during the real compile phase. The
4564 value of lengthptr distinguishes the two phases.
4565 
4566 Arguments:
4567  options option bits, including any changes for this subpattern
4568  oldims previous settings of ims option bits
4569  codeptr -> the address of the current code pointer
4570  ptrptr -> the address of the current pattern pointer
4571  errorcodeptr -> pointer to error code variable
4572  lookbehind TRUE if this is a lookbehind assertion
4573  skipbytes skip this many bytes at start (for brackets and OP_COND)
4574  firstbyteptr place to put the first required character, or a negative number
4575  reqbyteptr place to put the last required character, or a negative number
4576  bcptr pointer to the chain of currently open branches
4577  cd points to the data block with tables pointers etc.
4578  lengthptr NULL during the real compile phase
4579  points to length accumulator during pre-compile phase
4580 
4581 Returns: TRUE on success
4582 */
4583 
4584 static BOOL
4585 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4586  int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4587  int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4588 {
4589 const uschar *ptr = *ptrptr;
4590 uschar *code = *codeptr;
4591 uschar *last_branch = code;
4592 uschar *start_bracket = code;
4593 uschar *reverse_count = NULL;
4594 int firstbyte, reqbyte;
4595 int branchfirstbyte, branchreqbyte;
4596 int length;
4597 branch_chain bc;
4598 
4599 bc.outer = bcptr;
4600 bc.current = code;
4601 
4602 firstbyte = reqbyte = REQ_UNSET;
4603 
4604 /* Accumulate the length for use in the pre-compile phase. Start with the
4605 length of the BRA and KET and any extra bytes that are required at the
4606 beginning. We accumulate in a local variable to save frequent testing of
4607 lenthptr for NULL. We cannot do this by looking at the value of code at the
4608 start and end of each alternative, because compiled items are discarded during
4609 the pre-compile phase so that the work space is not exceeded. */
4610 
4611 length = 2 + 2*LINK_SIZE + skipbytes;
4612 
4613 /* WARNING: If the above line is changed for any reason, you must also change
4614 the code that abstracts option settings at the start of the pattern and makes
4615 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4616 pre-compile phase to find out whether anything has yet been compiled or not. */
4617 
4618 /* Offset is set zero to mark that this bracket is still open */
4619 
4620 PUT(code, 1, 0);
4621 code += 1 + LINK_SIZE + skipbytes;
4622 
4623 /* Loop for each alternative branch */
4624 
4625 for (;;)
4626  {
4627  /* Handle a change of ims options at the start of the branch */
4628 
4629  if ((options & PCRE_IMS) != oldims)
4630  {
4631  *code++ = OP_OPT;
4632  *code++ = options & PCRE_IMS;
4633  length += 2;
4634  }
4635 
4636  /* Set up dummy OP_REVERSE if lookbehind assertion */
4637 
4638  if (lookbehind)
4639  {
4640  *code++ = OP_REVERSE;
4641  reverse_count = code;
4642  PUTINC(code, 0, 0);
4643  length += 1 + LINK_SIZE;
4644  }
4645 
4646  /* Now compile the branch; in the pre-compile phase its length gets added
4647  into the length. */
4648 
4649  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4650  &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4651  {
4652  *ptrptr = ptr;
4653  return FALSE;
4654  }
4655 
4656  /* In the real compile phase, there is some post-processing to be done. */
4657 
4658  if (lengthptr == NULL)
4659  {
4660  /* If this is the first branch, the firstbyte and reqbyte values for the
4661  branch become the values for the regex. */
4662 
4663  if (*last_branch != OP_ALT)
4664  {
4665  firstbyte = branchfirstbyte;
4666  reqbyte = branchreqbyte;
4667  }
4668 
4669  /* If this is not the first branch, the first char and reqbyte have to
4670  match the values from all the previous branches, except that if the
4671  previous value for reqbyte didn't have REQ_VARY set, it can still match,
4672  and we set REQ_VARY for the regex. */
4673 
4674  else
4675  {
4676  /* If we previously had a firstbyte, but it doesn't match the new branch,
4677  we have to abandon the firstbyte for the regex, but if there was
4678  previously no reqbyte, it takes on the value of the old firstbyte. */
4679 
4680  if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4681  {
4682  if (reqbyte < 0) reqbyte = firstbyte;
4683  firstbyte = REQ_NONE;
4684  }
4685 
4686  /* If we (now or from before) have no firstbyte, a firstbyte from the
4687  branch becomes a reqbyte if there isn't a branch reqbyte. */
4688 
4689  if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4690  branchreqbyte = branchfirstbyte;
4691 
4692  /* Now ensure that the reqbytes match */
4693 
4694  if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4695  reqbyte = REQ_NONE;
4696  else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4697  }
4698 
4699  /* If lookbehind, check that this branch matches a fixed-length string, and
4700  put the length into the OP_REVERSE item. Temporarily mark the end of the
4701  branch with OP_END. */
4702 
4703  if (lookbehind)
4704  {
4705  int fixed_length;
4706  *code = OP_END;
4707  fixed_length = find_fixedlength(last_branch, options);
4708  DPRINTF(("fixed length = %d\n", fixed_length));
4709  if (fixed_length < 0)
4710  {
4711  *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4712  *ptrptr = ptr;
4713  return FALSE;
4714  }
4715  PUT(reverse_count, 0, fixed_length);
4716  }
4717  }
4718 
4719  /* Reached end of expression, either ')' or end of pattern. Go back through
4720  the alternative branches and reverse the chain of offsets, with the field in
4721  the BRA item now becoming an offset to the first alternative. If there are
4722  no alternatives, it points to the end of the group. The length in the
4723  terminating ket is always the length of the whole bracketed item. If any of
4724  the ims options were changed inside the group, compile a resetting op-code
4725  following, except at the very end of the pattern. Return leaving the pointer
4726  at the terminating char. */
4727 
4728  if (*ptr != '|')
4729  {
4730  int branch_length = code - last_branch;
4731  do
4732  {
4733  int prev_length = GET(last_branch, 1);
4734  PUT(last_branch, 1, branch_length);
4735  branch_length = prev_length;
4736  last_branch -= branch_length;
4737  }
4738  while (branch_length > 0);
4739 
4740  /* Fill in the ket */
4741 
4742  *code = OP_KET;
4743  PUT(code, 1, code - start_bracket);
4744  code += 1 + LINK_SIZE;
4745 
4746  /* Resetting option if needed */
4747 
4748  if ((options & PCRE_IMS) != oldims && *ptr == ')')
4749  {
4750  *code++ = OP_OPT;
4751  *code++ = oldims;
4752  length += 2;
4753  }
4754 
4755  /* Set values to pass back */
4756 
4757  *codeptr = code;
4758  *ptrptr = ptr;
4759  *firstbyteptr = firstbyte;
4760  *reqbyteptr = reqbyte;
4761  if (lengthptr != NULL) *lengthptr += length;
4762  return TRUE;
4763  }
4764 
4765  /* Another branch follows; insert an "or" node. Its length field points back
4766  to the previous branch while the bracket remains open. At the end the chain
4767  is reversed. It's done like this so that the start of the bracket has a
4768  zero offset until it is closed, making it possible to detect recursion. */
4769 
4770  *code = OP_ALT;
4771  PUT(code, 1, code - last_branch);
4772  bc.current = last_branch = code;
4773  code += 1 + LINK_SIZE;
4774  ptr++;
4775  length += 1 + LINK_SIZE;
4776  }
4777 /* Control never reaches here */
4778 }
4779 
4780 
4781 
4782 
4783 /*************************************************
4784 * Check for anchored expression *
4785 *************************************************/
4786 
4787 /* Try to find out if this is an anchored regular expression. Consider each
4788 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4789 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4790 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4791 counts, since OP_CIRC can match in the middle.
4792 
4793 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4794 This is the code for \G, which means "match at start of match position, taking
4795 into account the match offset".
4796 
4797 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4798 because that will try the rest of the pattern at all possible matching points,
4799 so there is no point trying again.... er ....
4800 
4801 .... except when the .* appears inside capturing parentheses, and there is a
4802 subsequent back reference to those parentheses. We haven't enough information
4803 to catch that case precisely.
4804 
4805 At first, the best we could do was to detect when .* was in capturing brackets
4806 and the highest back reference was greater than or equal to that level.
4807 However, by keeping a bitmap of the first 31 back references, we can catch some
4808 of the more common cases more precisely.
4809 
4810 Arguments:
4811  code points to start of expression (the bracket)
4812  options points to the options setting
4813  bracket_map a bitmap of which brackets we are inside while testing; this
4814  handles up to substring 31; after that we just have to take
4815  the less precise approach
4816  backref_map the back reference bitmap
4817 
4818 Returns: TRUE or FALSE
4819 */
4820 
4821 static BOOL
4822 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4823  unsigned int backref_map)
4824 {
4825 do {
4827  options, PCRE_MULTILINE, FALSE);
4828  register int op = *scode;
4829 
4830  /* Non-capturing brackets */
4831 
4832  if (op == OP_BRA)
4833  {
4834  if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4835  }
4836 
4837  /* Capturing brackets */
4838 
4839  else if (op == OP_CBRA)
4840  {
4841  int n = GET2(scode, 1+LINK_SIZE);
4842  int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4843  if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4844  }
4845 
4846  /* Other brackets */
4847 
4848  else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4849  {
4850  if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4851  }
4852 
4853  /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4854  are or may be referenced. */
4855 
4856  else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4857  op == OP_TYPEPOSSTAR) &&
4858  (*options & PCRE_DOTALL) != 0)
4859  {
4860  if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4861  }
4862 
4863  /* Check for explicit anchoring */
4864 
4865  else if (op != OP_SOD && op != OP_SOM &&
4866  ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4867  return FALSE;
4868  code += GET(code, 1);
4869  }
4870 while (*code == OP_ALT); /* Loop for each alternative */
4871 return TRUE;
4872 }
4873 
4874 
4875 
4876 /*************************************************
4877 * Check for starting with ^ or .* *
4878 *************************************************/
4879 
4880 /* This is called to find out if every branch starts with ^ or .* so that
4881 "first char" processing can be done to speed things up in multiline
4882 matching and for non-DOTALL patterns that start with .* (which must start at
4883 the beginning or after \n). As in the case of is_anchored() (see above), we
4884 have to take account of back references to capturing brackets that contain .*
4885 because in that case we can't make the assumption.
4886 
4887 Arguments:
4888  code points to start of expression (the bracket)
4889  bracket_map a bitmap of which brackets we are inside while testing; this
4890  handles up to substring 31; after that we just have to take
4891  the less precise approach
4892  backref_map the back reference bitmap
4893 
4894 Returns: TRUE or FALSE
4895 */
4896 
4897 static BOOL
4898 is_startline(const uschar *code, unsigned int bracket_map,
4899  unsigned int backref_map)
4900 {
4901 do {
4903  NULL, 0, FALSE);
4904  register int op = *scode;
4905 
4906  /* Non-capturing brackets */
4907 
4908  if (op == OP_BRA)
4909  {
4910  if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
4911  }
4912 
4913  /* Capturing brackets */
4914 
4915  else if (op == OP_CBRA)
4916  {
4917  int n = GET2(scode, 1+LINK_SIZE);
4918  int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4919  if (!is_startline(scode, new_map, backref_map)) return FALSE;
4920  }
4921 
4922  /* Other brackets */
4923 
4924  else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4925  { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4926 
4927  /* .* means "start at start or after \n" if it isn't in brackets that
4928  may be referenced. */
4929 
4930  else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
4931  {
4932  if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4933  }
4934 
4935  /* Check for explicit circumflex */
4936 
4937  else if (op != OP_CIRC) return FALSE;
4938 
4939  /* Move on to the next alternative */
4940 
4941  code += GET(code, 1);
4942  }
4943 while (*code == OP_ALT); /* Loop for each alternative */
4944 return TRUE;
4945 }
4946 
4947 
4948 
4949 /*************************************************
4950 * Check for asserted fixed first char *
4951 *************************************************/
4952 
4953 /* During compilation, the "first char" settings from forward assertions are
4954 discarded, because they can cause conflicts with actual literals that follow.
4955 However, if we end up without a first char setting for an unanchored pattern,
4956 it is worth scanning the regex to see if there is an initial asserted first
4957 char. If all branches start with the same asserted char, or with a bracket all
4958 of whose alternatives start with the same asserted char (recurse ad lib), then
4959 we return that char, otherwise -1.
4960 
4961 Arguments:
4962  code points to start of expression (the bracket)
4963  options pointer to the options (used to check casing changes)
4964  inassert TRUE if in an assertion
4965 
4966 Returns: -1 or the fixed first char
4967 */
4968 
4969 static int
4970 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4971 {
4972 register int c = -1;
4973 do {
4974  int d;
4975  const uschar *scode =
4976  first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4977  register int op = *scode;
4978 
4979  switch(op)
4980  {
4981  default:
4982  return -1;
4983 
4984  case OP_BRA:
4985  case OP_CBRA:
4986  case OP_ASSERT:
4987  case OP_ONCE:
4988  case OP_COND:
4989  if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4990  return -1;
4991  if (c < 0) c = d; else if (c != d) return -1;
4992  break;
4993 
4994  case OP_EXACT: /* Fall through */
4995  scode += 2;
4996 
4997  case OP_CHAR:
4998  case OP_CHARNC:
4999  case OP_PLUS:
5000  case OP_MINPLUS:
5001  case OP_POSPLUS:
5002  if (!inassert) return -1;
5003  if (c < 0)
5004  {
5005  c = scode[1];
5006  if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5007  }
5008  else if (c != scode[1]) return -1;
5009  break;
5010  }
5011 
5012  code += GET(code, 1);
5013  }
5014 while (*code == OP_ALT);
5015 return c;
5016 }
5017 
5018 
5019 
5020 /*************************************************
5021 * Compile a Regular Expression *
5022 *************************************************/
5023 
5024 /* This function takes a string and returns a pointer to a block of store
5025 holding a compiled version of the expression. The original API for this
5026 function had no error code return variable; it is retained for backwards
5027 compatibility. The new function is given a new name.
5028 
5029 Arguments:
5030  pattern the regular expression
5031  options various option bits
5032  errorcodeptr pointer to error code variable (pcre_compile2() only)
5033  can be NULL if you don't want a code value
5034  errorptr pointer to pointer to error text
5035  erroroffset ptr offset in pattern where error was detected
5036  tables pointer to character tables or NULL
5037 
5038 Returns: pointer to compiled data block, or NULL on error,
5039  with errorptr and erroroffset set
5040 */
5041 
5043 pcre_compile(const char *pattern, int options, const char **errorptr,
5044  int *erroroffset, const unsigned char *tables)
5045 {
5046 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5047 }
5048 
5049 
5051 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5052  const char **errorptr, int *erroroffset, const unsigned char *tables)
5053 {
5054 real_pcre *re;
5055 int length = 1; /* For final END opcode */
5056 int firstbyte, reqbyte, newline;
5057 int errorcode = 0;
5058 #ifdef SUPPORT_UTF8
5059 BOOL utf8;
5060 #endif
5061 size_t size;
5062 uschar *code;
5063 const uschar *codestart;
5064 const uschar *ptr;
5065 compile_data compile_block;
5066 compile_data *cd = &compile_block;
5067 
5068 /* This space is used for "compiling" into during the first phase, when we are
5069 computing the amount of memory that is needed. Compiled items are thrown away
5070 as soon as possible, so that a fairly large buffer should be sufficient for
5071 this purpose. The same space is used in the second phase for remembering where
5072 to fill in forward references to subpatterns. */
5073 
5074 uschar cworkspace[COMPILE_WORK_SIZE];
5075 
5076 
5077 /* Set this early so that early errors get offset 0. */
5078 
5079 ptr = (const uschar *)pattern;
5080 
5081 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5082 can do is just return NULL, but we can set a code value if there is a code
5083 pointer. */
5084 
5085 if (errorptr == NULL)
5086  {
5087  if (errorcodeptr != NULL) *errorcodeptr = 99;
5088  return NULL;
5089  }
5090 
5091 *errorptr = NULL;
5092 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5093 
5094 /* However, we can give a message for this error */
5095 
5096 if (erroroffset == NULL)
5097  {
5098  errorcode = ERR16;
5099  goto PCRE_EARLY_ERROR_RETURN;
5100  }
5101 
5102 *erroroffset = 0;
5103 
5104 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5105 
5106 #ifdef SUPPORT_UTF8
5107 utf8 = (options & PCRE_UTF8) != 0;
5108 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5109  (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5110  {
5111  errorcode = ERR44;
5112  goto PCRE_UTF8_ERROR_RETURN;
5113  }
5114 #else
5115 if ((options & PCRE_UTF8) != 0)
5116  {
5117  errorcode = ERR32;
5118  goto PCRE_EARLY_ERROR_RETURN;
5119  }
5120 #endif
5121 
5122 if ((options & ~PUBLIC_OPTIONS) != 0)
5123  {
5124  errorcode = ERR17;
5125  goto PCRE_EARLY_ERROR_RETURN;
5126  }
5127 
5128 /* Set up pointers to the individual character tables */
5129 
5130 if (tables == NULL) tables = _pcre_default_tables;
5131 cd->lcc = tables + lcc_offset;
5132 cd->fcc = tables + fcc_offset;
5133 cd->cbits = tables + cbits_offset;
5134 cd->ctypes = tables + ctypes_offset;
5135 
5136 /* Handle different types of newline. The three bits give seven cases. The
5137 current code allows for fixed one- or two-byte sequences, plus "any". */
5138 
5139 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5140  {
5141  case 0: newline = NEWLINE; break; /* Compile-time default */
5142  case PCRE_NEWLINE_CR: newline = '\r'; break;
5143  case PCRE_NEWLINE_LF: newline = '\n'; break;
5144  case PCRE_NEWLINE_CR+
5145  PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5146  case PCRE_NEWLINE_ANY: newline = -1; break;
5147  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5148  }
5149 
5150 if (newline < 0)
5151  {
5152  cd->nltype = NLTYPE_ANY;
5153  }
5154 else
5155  {
5156  cd->nltype = NLTYPE_FIXED;
5157  if (newline > 255)
5158  {
5159  cd->nllen = 2;
5160  cd->nl[0] = (newline >> 8) & 255;
5161  cd->nl[1] = newline & 255;
5162  }
5163  else
5164  {
5165  cd->nllen = 1;
5166  cd->nl[0] = newline;
5167  }
5168  }
5169 
5170 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5171 references to help in deciding whether (.*) can be treated as anchored or not.
5172 */
5173 
5174 cd->top_backref = 0;
5175 cd->backref_map = 0;
5176 
5177 /* Reflect pattern for debugging output */
5178 
5179 DPRINTF(("------------------------------------------------------------------\n"));
5180 DPRINTF(("%s\n", pattern));
5181 
5182 /* Pretend to compile the pattern while actually just accumulating the length
5183 of memory required. This behaviour is triggered by passing a non-NULL final
5184 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5185 to compile parts of the pattern into; the compiled code is discarded when it is
5186 no longer needed, so hopefully this workspace will never overflow, though there
5187 is a test for its doing so. */
5188 
5189 cd->bracount = 0;
5190 cd->names_found = 0;
5191 cd->name_entry_size = 0;
5192 cd->name_table = NULL;
5193 cd->start_workspace = cworkspace;
5194 cd->start_code = cworkspace;
5195 cd->hwm = cworkspace;
5196 cd->start_pattern = (const uschar *)pattern;
5197 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5198 cd->req_varyopt = 0;
5199 cd->nopartial = FALSE;
5200 cd->external_options = options;
5201 
5202 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5203 don't need to look at the result of the function here. The initial options have
5204 been put into the cd block so that they can be changed if an option setting is
5205 found within the regex right at the beginning. Bringing initial option settings
5206 outside can help speed up starting point checks. */
5207 
5208 code = cworkspace;
5209 *code = OP_BRA;
5211  &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
5212 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5213 
5214 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5215  cd->hwm - cworkspace));
5216 
5217 if (length > MAX_PATTERN_SIZE)
5218  {
5219  errorcode = ERR20;
5220  goto PCRE_EARLY_ERROR_RETURN;
5221  }
5222 
5223 /* Compute the size of data block needed and get it, either from malloc or
5224 externally provided function. Integer overflow should no longer be possible
5225 because nowadays we limit the maximum value of cd->names_found and
5226 cd->name_entry_size. */
5227 
5228 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5229 re = (real_pcre *)(pcre_malloc)(size);
5230 
5231 if (re == NULL)
5232  {
5233  errorcode = ERR21;
5234  goto PCRE_EARLY_ERROR_RETURN;
5235  }
5236 
5237 /* Put in the magic number, and save the sizes, initial options, and character
5238 table pointer. NULL is used for the default character tables. The nullpad field
5239 is at the end; it's there to help in the case when a regex compiled on a system
5240 with 4-byte pointers is run on another with 8-byte pointers. */
5241 
5243 re->size = size;
5244 re->options = cd->external_options;
5245 re->dummy1 = 0;
5246 re->first_byte = 0;
5247 re->req_byte = 0;
5248 re->name_table_offset = sizeof(real_pcre);
5250 re->name_count = cd->names_found;
5251 re->ref_count = 0;
5252 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5253 re->nullpad = NULL;
5254 
5255 /* The starting points of the name/number translation table and of the code are
5256 passed around in the compile data block. The start/end pattern and initial
5257 options are already set from the pre-compile phase, as is the name_entry_size
5258 field. Reset the bracket count and the names_found field. Also reset the hwm
5259 field; this time it's used for remembering forward references to subpatterns.
5260 */
5261 
5262 cd->bracount = 0;
5263 cd->names_found = 0;
5264 cd->name_table = (uschar *)re + re->name_table_offset;
5265 codestart = cd->name_table + re->name_entry_size * re->name_count;
5266 cd->start_code = codestart;
5267 cd->hwm = cworkspace;
5268 cd->req_varyopt = 0;
5269 cd->nopartial = FALSE;
5270 
5271 /* Set up a starting, non-extracting bracket, then compile the expression. On
5272 error, errorcode will be set non-zero, so we don't need to look at the result
5273 of the function here. */
5274 
5275 ptr = (const uschar *)pattern;
5276 code = (uschar *)codestart;
5277 *code = OP_BRA;
5278 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5279  &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5280 re->top_bracket = cd->bracount;
5281 re->top_backref = cd->top_backref;
5282 
5283 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5284 
5285 /* If not reached end of pattern on success, there's an excess bracket. */
5286 
5287 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5288 
5289 /* Fill in the terminating state and check for disastrous overflow, but
5290 if debugging, leave the test till after things are printed out. */
5291 
5292 *code++ = OP_END;
5293 
5294 #ifndef DEBUG
5295 if (code - codestart > length) errorcode = ERR23;
5296 #endif
5297 
5298 /* Fill in any forward references that are required. */
5299 
5300 while (errorcode == 0 && cd->hwm > cworkspace)
5301  {
5302  int offset, recno;
5303  const uschar *groupptr;
5304  cd->hwm -= LINK_SIZE;
5305  offset = GET(cd->hwm, 0);
5306  recno = GET(codestart, offset);
5307  groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5308  if (groupptr == NULL) errorcode = ERR53;
5309  else PUT(((uschar *)codestart), offset, groupptr - codestart);
5310  }
5311 
5312 /* Give an error if there's back reference to a non-existent capturing
5313 subpattern. */
5314 
5315 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5316 
5317 /* Failed to compile, or error while post-processing */
5318 
5319 if (errorcode != 0)
5320  {
5321  (pcre_free)(re);
5322  PCRE_EARLY_ERROR_RETURN:
5323  *erroroffset = ptr - (const uschar *)pattern;
5324 #ifdef SUPPORT_UTF8
5325  PCRE_UTF8_ERROR_RETURN:
5326 #endif
5327  *errorptr = error_texts[errorcode];
5328  if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5329  return NULL;
5330  }
5331 
5332 /* If the anchored option was not passed, set the flag if we can determine that
5333 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5334 as starting with .* when DOTALL is set).
5335 
5336 Otherwise, if we know what the first byte has to be, save it, because that
5337 speeds up unanchored matches no end. If not, see if we can set the
5338 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5339 start with ^. and also when all branches start with .* for non-DOTALL matches.
5340 */
5341 
5342 if ((re->options & PCRE_ANCHORED) == 0)
5343  {
5344  int temp_options = re->options; /* May get changed during these scans */
5345  if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5346  re->options |= PCRE_ANCHORED;
5347  else
5348  {
5349  if (firstbyte < 0)
5350  firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5351  if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5352  {
5353  int ch = firstbyte & 255;
5354  re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5355  cd->fcc[ch] == ch)? ch : firstbyte;
5356  re->options |= PCRE_FIRSTSET;
5357  }
5358  else if (is_startline(codestart, 0, cd->backref_map))
5359  re->options |= PCRE_STARTLINE;
5360  }
5361  }
5362 
5363 /* For an anchored pattern, we use the "required byte" only if it follows a
5364 variable length item in the regex. Remove the caseless flag for non-caseable
5365 bytes. */
5366 
5367 if (reqbyte >= 0 &&
5368  ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5369  {
5370  int ch = reqbyte & 255;
5371  re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5372  cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5373  re->options |= PCRE_REQCHSET;
5374  }
5375 
5376 /* Print out the compiled data if debugging is enabled. This is never the
5377 case when building a production library. */
5378 
5379 #ifdef DEBUG
5380 
5381 printf("Length = %d top_bracket = %d top_backref = %d\n",
5382  length, re->top_bracket, re->top_backref);
5383 
5384 if (re->options != 0)
5385  {
5386  printf("%s%s%s%s%s%s%s%s%s\n",
5387  ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5388  ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5389  ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5390  ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5391  ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5392  ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5393  ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5394  ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5395  ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5396  }
5397 
5398 if ((re->options & PCRE_FIRSTSET) != 0)
5399  {
5400  int ch = re->first_byte & 255;
5401  const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5402  "" : " (caseless)";
5403  if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5404  else printf("First char = \\x%02x%s\n", ch, caseless);
5405  }
5406 
5407 if ((re->options & PCRE_REQCHSET) != 0)
5408  {
5409  int ch = re->req_byte & 255;
5410  const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5411  "" : " (caseless)";
5412  if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5413  else printf("Req char = \\x%02x%s\n", ch, caseless);
5414  }
5415 
5416 pcre_printint(re, stdout);
5417 
5418 /* This check is done here in the debugging case so that the code that
5419 was compiled can be seen. */
5420 
5421 if (code - codestart > length)
5422  {
5423  (pcre_free)(re);
5424  *errorptr = error_texts[ERR23];
5425  *erroroffset = ptr - (uschar *)pattern;
5426  if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5427  return NULL;
5428  }
5429 #endif /* DEBUG */
5430 
5431 return (pcre *)re;
5432 }
5433 
5434 /* End of pcre_compile.c */
#define TRUE
Definition: bool.h:74
#define FALSE
Definition: bool.h:70
#define MAX_NAME_COUNT
Definition: config.h:9
#define MAX_DUPLENGTH
Definition: config.h:10
#define MAX_NAME_SIZE
Definition: config.h:8
#define PCRE_NO_UTF8_CHECK
Definition: pcre.h:111
#define PCRE_UTF8
Definition: pcre.h:109
#define PCRE_NEWLINE_ANY
Definition: pcre.h:121
#define PCRE_EXTENDED
Definition: pcre.h:101
void *(* pcre_malloc)(size_t)
Definition: pcre_globals.c:75
#define PCRE_CASELESS
Definition: pcre.h:98
#define PCRE_AUTO_CALLOUT
Definition: pcre.h:112
#define PCRE_DATA_SCOPE
Definition: pcre.h:81
#define PCRE_MULTILINE
Definition: pcre.h:99
void(* pcre_free)(void *)
Definition: pcre_globals.c:76
#define PCRE_NEWLINE_CRLF
Definition: pcre.h:120
#define PCRE_NO_AUTO_CAPTURE
Definition: pcre.h:110
#define PCRE_NEWLINE_LF
Definition: pcre.h:119
#define PCRE_DOTALL
Definition: pcre.h:100
#define PCRE_ANCHORED
Definition: pcre.h:102
#define PCRE_NEWLINE_CR
Definition: pcre.h:118
#define PCRE_DUPNAMES
Definition: pcre.h:117
#define PCRE_DOLLAR_ENDONLY
Definition: pcre.h:103
#define PCRE_EXTRA
Definition: pcre.h:104
#define PCRE_UNGREEDY
Definition: pcre.h:107
static const short int escapes[]
Definition: pcre_compile.c:86
static const uschar * read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
Definition: pcre_compile.c:792
static void adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, uschar *save_hwm)
static BOOL could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr, BOOL utf8)
static BOOL check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
static BOOL could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
pcre * pcre_compile2(const char *pattern, int options, int *errorcodeptr, const char **errorptr, int *erroroffset, const unsigned char *tables)
static const char * error_texts[]
Definition: pcre_compile.c:176
pcre * pcre_compile(const char *pattern, int options, const char **errorptr, int *erroroffset, const unsigned char *tables)
static BOOL is_anchored(register const uschar *code, int *options, unsigned int bracket_map, unsigned int backref_map)
static int check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, int options, BOOL isclass)
Definition: pcre_compile.c:406
static void complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
static int find_fixedlength(uschar *code, int options)
static const unsigned char digitab[]
Definition: pcre_compile.c:266
static const uschar posix_name_lengths[]
Definition: pcre_compile.c:137
static BOOL is_counted_repeat(const uschar *p)
Definition: pcre_compile.c:755
static int find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
static int check_posix_name(const uschar *ptr, int len)
static BOOL compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *, int *, branch_chain *, compile_data *, int *)
static BOOL is_startline(const uschar *code, unsigned int bracket_map, unsigned int backref_map)
static const int posix_class_maps[]
Definition: pcre_compile.c:150
static BOOL check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char, const uschar *ptr, int options, compile_data *cd)
static uschar * auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
#define XSTRING(s)
Definition: pcre_compile.c:169
static const uschar * find_bracket(const uschar *code, BOOL utf8, int number)
static const uschar * first_significant_code(const uschar *code, int *options, int optbit, BOOL skipassert)
Definition: pcre_compile.c:971
static const uschar * find_recurse(const uschar *code, BOOL utf8)
static const char *const posix_names[]
Definition: pcre_compile.c:132
static int find_parens(const uschar *ptr, int count, const uschar *name, int lorn, BOOL xmode)
Definition: pcre_compile.c:861
#define COMPILE_WORK_SIZE
Definition: pcre_compile.c:77
static BOOL compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
@ ESC_Z
@ ESC_k
@ ESC_b
@ ESC_A
@ ESC_D
@ ESC_Q
@ ESC_P
@ ESC_R
@ ESC_d
@ ESC_s
@ ESC_S
@ ESC_C
@ ESC_B
@ ESC_REF
@ ESC_W
@ ESC_G
@ ESC_X
@ ESC_w
@ ESC_E
@ ESC_p
@ ESC_z
#define XCL_END
int BOOL
#define ESC_e
#define REQ_NONE
#define REQ_VARY
#define cbits_offset
#define cbit_punct
#define ctype_letter
#define ESC_n
#define ctype_xdigit
#define cbit_space
#define ESC_tee
#define PUT2(a, n, d)
#define ctype_digit
#define PCRE_IMS
#define cbit_digit
#define ctype_word
#define XCL_NOTPROP
#define NLTYPE_FIXED
#define MAGIC_NUMBER
#define cbit_lower
#define PUBLIC_OPTIONS
#define cbit_cntrl
struct real_pcre real_pcre
#define ctype_space
#define XCL_NOT
int _pcre_ord2utf8(int, uschar *)
Definition: pcre_ord2utf8.c:63
#define NOTACHAR
#define PCRE_REQCHSET
#define ESC_f
unsigned int _pcre_ucp_othercase(const unsigned int)
#define PCRE_NOPARTIAL
#define RREF_ANY
#define XCL_RANGE
#define ESC_r
#define DPRINTF(p)
Definition: pcre_internal.h:66
#define REQ_UNSET
#define PUT2INC(a, n, d)
#define IS_NEWLINE(p)
#define PCRE_FIRSTSET
#define XCL_PROP
#define cbit_word
@ ERR23
@ ERR47
@ ERR17
@ ERR3
@ ERR9
@ ERR55
@ ERR4
@ ERR46
@ ERR34
@ ERR51
@ ERR14
@ ERR48
@ ERR49
@ ERR18
@ ERR50
@ ERR52
@ ERR41
@ ERR40
@ ERR24
@ ERR5
@ ERR15
@ ERR36
@ ERR27
@ ERR56
@ ERR1
@ ERR39
@ ERR12
@ ERR37
@ ERR54
@ ERR38
@ ERR6
@ ERR53
@ ERR28
@ ERR7
@ ERR13
@ ERR32
@ ERR43
@ ERR11
@ ERR16
@ ERR44
@ ERR31
@ ERR22
@ ERR2
@ ERR57
@ ERR42
@ ERR29
@ ERR45
@ ERR8
@ ERR20
@ ERR0
@ ERR35
@ ERR25
@ ERR30
@ ERR21
@ ERR26
#define cbit_xdigit
#define PUTINC(a, n, d)
@ OP_END
@ OP_CHAR
@ OP_CRMINQUERY
@ OP_SBRA
@ OP_ONCE
@ OP_NOTPROP
@ OP_NOTPLUS
@ OP_TYPEMINPLUS
@ OP_TYPEQUERY
@ OP_ASSERTBACK
@ OP_CLASS
@ OP_TYPEPLUS
@ OP_NOT_WORDCHAR
@ OP_CRMINPLUS
@ OP_CRRANGE
@ OP_DOLL
@ OP_ASSERT_NOT
@ OP_NOT
@ OP_ASSERT
@ OP_TYPEPOSSTAR
@ OP_TYPEPOSPLUS
@ OP_POSSTAR
@ OP_NOTUPTO
@ OP_TYPESTAR
@ OP_CRQUERY
@ OP_ASSERTBACK_NOT
@ OP_OPT
@ OP_RREF
@ OP_DIGIT
@ OP_EXACT
@ OP_TYPEEXACT
@ OP_PLUS
@ OP_WHITESPACE
@ OP_CRMINSTAR
@ OP_NOT_WORD_BOUNDARY
@ OP_KET
@ OP_NOT_DIGIT
@ OP_CALLOUT
@ OP_CRMINRANGE
@ OP_RECURSE
@ OP_BRA
@ OP_CHARNC
@ OP_CREF
@ OP_POSUPTO
@ OP_NOTPOSUPTO
@ OP_REVERSE
@ OP_NCLASS
@ OP_KETRMIN
@ OP_COND
@ OP_MINPLUS
@ OP_TYPEPOSUPTO
@ OP_WORDCHAR
@ OP_MINQUERY
@ OP_EODN
@ OP_ALT
@ OP_UPTO
@ OP_QUERY
@ OP_PROP
@ OP_NOTPOSSTAR
@ OP_KETRMAX
@ OP_NOTMINPLUS
@ OP_BRAZERO
@ OP_ANYBYTE
@ OP_NOT_WHITESPACE
@ OP_NOTSTAR
@ OP_MINUPTO
@ OP_CRSTAR
@ OP_POSQUERY
@ OP_MINSTAR
@ OP_STAR
@ OP_DEF
@ OP_TYPEMINSTAR
@ OP_CRPLUS
@ OP_TYPEPOSQUERY
@ OP_POSPLUS
@ OP_REF
@ OP_SOD
@ OP_NOTPOSQUERY
@ OP_TYPEUPTO
@ OP_SOM
@ OP_ANY
@ OP_XCLASS
@ OP_NOTQUERY
@ OP_CBRA
@ OP_EXTUNI
@ OP_EOD
@ OP_NOTEXACT
@ OP_WORD_BOUNDARY
@ OP_NOTPOSPLUS
@ OP_CIRC
#define cbit_upper
#define GETCHARINCTEST(c, eptr)
unsigned char uschar
#define fcc_offset
#define PCRE_JCHANGED
#define NLTYPE_ANY
#define memmove(a, b, c)
#define GETCHAR(c, eptr)
#define GETCHARLEN(c, eptr, len)
#define lcc_offset
#define GETCHARINC(c, eptr)
#define XCL_MAP
const uschar _pcre_default_tables[]
#define REQ_CASELESS
#define ctypes_offset
#define cbit_print
int _pcre_valid_utf8(const uschar *, int)
#define GET2(a, n)
#define PCRE_STARTLINE
#define ctype_meta
#define XCL_SINGLE
#define cbit_graph
static BOOL utf8
Definition: pcregrep.c:147
static BOOL number
Definition: pcregrep.c:143
static char * newline
Definition: pcregrep.c:112
#define _pcre_utt
Definition: pcretest.c:88
#define _pcre_utf8_table4
Definition: pcretest.c:87
#define _pcre_OP_lengths
Definition: pcretest.c:90
#define _pcre_utt_size
Definition: pcretest.c:89
static int offset
Definition: read.c:62
int code
Definition: signal.c:116
const char * name
Definition: signal.c:117
struct branch_chain * outer
uschar * current
const uschar * start_workspace
uschar * name_table
const uschar * cbits
const uschar * start_code
const uschar * fcc
const uschar * end_pattern
uschar nl[4]
uschar * hwm
const uschar * ctypes
unsigned int backref_map
const uschar * lcc
const uschar * start_pattern