"Fossies" - the Fresh Open Source Software Archive 
Member "tin-2.6.2/pcre/pcre_compile.c" (23 Aug 2021, 172488 Bytes) of package /linux/misc/tin-2.6.2.tar.xz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2006 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
/* NOTE(review): these three names are defined before pcre_internal.h is
included below, presumably so that macros in that header can refer to the
compile-time data block and to its pattern start/end fields - confirm against
the header. */
45 #define NLBLOCK cd /* Block containing newline information */
46 #define PSSTART start_pattern /* Field containing processed string start */
47 #define PSEND end_pattern /* Field containing processed string end */
48
49
50 #include "pcre_internal.h"
51
52
53 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54 used by pcretest. DEBUG is not defined when building a production library. */
55
56 #ifdef DEBUG
57 #include "pcre_printint.src"
58 #endif
59
60
61 /*************************************************
62 * Code parameters and static tables *
63 *************************************************/
64
65 /* This value specifies the size of stack workspace that is used during the
66 first pre-compile phase that determines how much memory is required. The regex
67 is partly compiled into this space, but the compiled parts are discarded as
68 soon as they can be, so that hopefully there will never be an overrun. The code
69 does, however, check for an overrun. The largest amount I've seen used is 218,
70 so this number is very generous.
71
72 The same workspace is used during the second, actual compile phase for
73 remembering forward references to groups so that they can be filled in at the
74 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
75 is 4 there is plenty of room. */
76
/* NOTE(review): the unit is not stated; the comment above speaks of
LINK_SIZE-byte entries, so this is presumably a byte count - confirm at the
point where the workspace is declared. */
77 #define COMPILE_WORK_SIZE (4096)
78
79
80 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
81 are simple data values; negative values are for special things like \d and so
82 on. Zero means further processing is needed (for things like \x), or the escape
83 is invalid. */
84
/* Indexing, as used by check_escape(): the ASCII table is subscripted with
(c - '0') and so has one entry for every code from '0' to 'z'; the EBCDIC table
is subscripted with (c - 0x48). The plain value 7 in the 'a' slot is what is
returned as the data character for \a. */
85 #if !EBCDIC /* This is the "normal" table for ASCII systems */
86 static const short int escapes[] = {
87 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
88 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
89 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
90 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
91 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
92 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
93 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
94 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
95 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
96 0, 0, -ESC_z /* x - z */
97 };
98
99 #else /* This is the "abnormal" table for EBCDIC systems */
100 static const short int escapes[] = {
101 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
102 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
103 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
104 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
105 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
106 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
107 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
108 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
109 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
110 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
111 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
112 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
113 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
114 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
115 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
116 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
117 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
118 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
119 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
120 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
121 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
122 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
124 };
125 #endif
126
127
128 /* Tables of names of POSIX character classes and their lengths. The list is
129 terminated by a zero length entry. The first three must be alpha, lower, upper,
130 as this is assumed for handling case independence. */
131
/* posix_names has 14 entries and no terminator of its own; posix_name_lengths
holds the 14 matching lengths followed by the terminating zero, so the two
arrays must be kept in step whenever a class is added. */
132 static const char *const posix_names[] = {
133 "alpha", "lower", "upper",
134 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
135 "print", "punct", "space", "word", "xdigit" };
136
137 static const uschar posix_name_lengths[] = {
138 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
139
140 /* Table of class bit maps for each POSIX class. Each class is formed from a
141 base map, with an optional addition or removal of another map. Then, for some
142 classes, there is some additional tweaking: for [:blank:] the vertical space
143 characters are removed, and for [:alpha:] and [:alnum:] the underscore
144 character is removed. The triples in the table consist of the base map offset,
145 second map offset or -1 if no second map, and a non-negative value for map
146 addition or a negative value for map subtraction (if there are two maps). The
147 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
148 remove vertical space characters, 2 => remove underscore. */
149
/* One row of three ints per class; the rows appear in the same order as the
names in posix_names (see the trailing comment on each row). */
150 static const int posix_class_maps[] = {
151 cbit_word, cbit_digit, -2, /* alpha */
152 cbit_lower, -1, 0, /* lower */
153 cbit_upper, -1, 0, /* upper */
154 cbit_word, -1, 2, /* alnum - word without underscore */
155 cbit_print, cbit_cntrl, 0, /* ascii */
156 cbit_space, -1, 1, /* blank - a GNU extension */
157 cbit_cntrl, -1, 0, /* cntrl */
158 cbit_digit, -1, 0, /* digit */
159 cbit_graph, -1, 0, /* graph */
160 cbit_print, -1, 0, /* print */
161 cbit_punct, -1, 0, /* punct */
162 cbit_space, -1, 0, /* space */
163 cbit_word, -1, 0, /* word - a Perl extension */
164 cbit_xdigit,-1, 0 /* xdigit */
165 };
166
167
/* Two-level stringification: XSTRING(X) expands the macro X first and then
turns the result into a string literal, so that numeric limits can be spliced
into the message texts below. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used.

The table is indexed by error number; the numbered marker comments give the
index of the entry that follows them. Both the strings and the array of
pointers are read-only, so the array itself is declared const as well. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by an (optionally braced) non-zero number"
};
247
248
249 /* Table to identify digits and hex digits. This is used when compiling
250 patterns. Note that the tables in chartables are dependent on the locale, and
251 may mark arbitrary characters as digits - but the PCRE compiling code expects
252 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
253 a private table here. It costs 256 bytes, but it is a lot faster than doing
254 character value tests (at least in some simple cases I timed), and in some
255 applications one wants PCRE to compile efficiently as well as match
256 efficiently.
257
258 For convenience, we use the same bit definitions as in chartables:
259
260 0x04 decimal digit
261 0x08 hexadecimal digit
262
263 Then we can use ctype_digit and ctype_xdigit in the code. */
264
/* Each table below has 256 entries, one per byte value. In digitab an entry of
0x0c (0x04 | 0x08) marks a decimal digit, which is also a hex digit, and 0x08
marks the hex letters a-f and A-F; every other entry is zero. ebcdic_chartab
exists only in the EBCDIC build; check_escape() tests its entries against 0x0E
to decide whether a character is alphameric. */
265 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
266 static const unsigned char digitab[] =
267 {
268 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
274 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
275 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
276 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
280 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
281 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
283 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
296 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
300
301 #else /* This is the "abnormal" case, for EBCDIC systems */
302 static const unsigned char digitab[] =
303 {
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
307 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
309 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
315 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */
316 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
317 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
319 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
320 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
321 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
328 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
334 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
335 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
336
337 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
338 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
339 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
340 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
342 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
346 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
347 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
349 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
351 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
354 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
355 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
356 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
357 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
358 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
359 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
360 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
361 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
362 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
363 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
364 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
365 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
366 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
367 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
368 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
369 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
370 #endif
371
372
373 /* Definition to allow mutual recursion */
374
/* Forward declaration only; the definition is not in this part of the file.
NOTE(review): the prototype carries no parameter names, so the meaning of each
argument has to be read from the definition. */
375 static BOOL
376 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
377 int *, branch_chain *, compile_data *, int *);
378
379
380
381 /*************************************************
382 * Handle escapes *
383 *************************************************/
384
385 /* This function is called when a \ has been encountered. It either returns a
386 positive value for a simple escape such as \n, or a negative value which
387 encodes one of the more complicated things such as \d. A backreference to group
388 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
389 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
390 ptr is pointing at the \. On exit, it is on the final character of the escape
391 sequence.
392
393 Arguments:
394 ptrptr points to the pattern position pointer
395 errorcodeptr points to the errorcode variable
396 bracount number of previous extracting brackets
397 options the options bits
398 isclass TRUE if inside a character class
399
400 Returns: zero or positive => a data character
401 negative => a special escape sequence
402 on error, *errorcodeptr is set
403 */
404
/* Decode the escape sequence that starts at the backslash **ptrptr points to.

The value returned is either a data character (zero or positive) or a negated
ESC_xxx code; back references come back as -(ESC_REF + n). On the normal exit
path *ptrptr is advanced to the last character of the escape. The three early
"return 0" error paths (ERR57, ERR15, ERR2) set *errorcodeptr but leave
*ptrptr untouched. All other errors only set *errorcodeptr and then leave by
the normal exit path.

The local i is assigned in exactly one place, the escapes[] table lookup. The
final "else" branch is reached only when that lookup stored zero, so i is
known to be 0 on entry to the switch. The "i++ < 2" loops in the '0' and 'x'
cases rely on this to read at most two further digits. */
405 static int
406 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
407 int options, BOOL isclass)
408 {
409 BOOL utf8 = (options & PCRE_UTF8) != 0;
410 const uschar *ptr = *ptrptr + 1;
411 int c, i;
412
413 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
414 ptr--; /* Set pointer back to the last byte */
415
416 /* If backslash is at the end of the pattern, it's an error. */
417
418 if (c == 0) *errorcodeptr = ERR1;
419
420 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
421 a table. A non-zero result is something that can be returned immediately.
422 Otherwise further processing may be required. */
423
424 #if !EBCDIC /* ASCII coding */
425 else if (c < '0' || c > 'z') {} /* Not alphameric */
426 else if ((i = escapes[c - '0']) != 0) c = i;
427
428 #else /* EBCDIC coding */
429 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
430 else if ((i = escapes[c - 0x48]) != 0) c = i;
431 #endif
432
433 /* Escapes that need further processing, or are illegal. */
434
/* i == 0 from here on: the table lookup above stored zero in it. */
435 else
436 {
437 const uschar *oldptr;
438 BOOL braced, negated;
439
440 switch (c)
441 {
442 /* A number of Perl escapes are not handled by PCRE. We give an explicit
443 error. */
444
445 case 'l':
446 case 'L':
447 case 'N':
448 case 'u':
449 case 'U':
450 *errorcodeptr = ERR37;
451 break;
452
453 /* \g must be followed by a number, either plain or braced. If positive, it
454 is an absolute backreference. If negative, it is a relative backreference.
455 This is a Perl 5.10 feature. */
456
457 case 'g':
458 if (ptr[1] == '{')
459 {
460 braced = TRUE;
461 ptr++;
462 }
463 else braced = FALSE;
464
465 if (ptr[1] == '-')
466 {
467 negated = TRUE;
468 ptr++;
469 }
470 else negated = FALSE;
471
/* NOTE(review): the digits are accumulated into an int with no upper bound,
so a very long digit string overflows c - consider bounding the value. */
472 c = 0;
473 while ((digitab[ptr[1]] & ctype_digit) != 0)
474 c = c * 10 + *(++ptr) - '0';
475
476 if (c == 0 || (braced && *(++ptr) != '}'))
477 {
478 *errorcodeptr = ERR57;
479 return 0;
480 }
481
/* A relative reference -n counts back from the most recent group: with
bracount groups opened so far, -1 maps to group bracount, -2 to
bracount - 1, and so on. */
482 if (negated)
483 {
484 if (c > bracount)
485 {
486 *errorcodeptr = ERR15;
487 return 0;
488 }
489 c = bracount - (c - 1);
490 }
491
492 c = -(ESC_REF + c);
493 break;
494
495 /* The handling of escape sequences consisting of a string of digits
496 starting with one that is not zero is not straightforward. By experiment,
497 the way Perl works seems to be as follows:
498
499 Outside a character class, the digits are read as a decimal number. If the
500 number is less than 10, or if there are that many previous extracting
501 left brackets, then it is a back reference. Otherwise, up to three octal
502 digits are read to form an escaped byte. Thus \123 is likely to be octal
503 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
504 value is greater than 377, the least significant 8 bits are taken. Inside a
505 character class, \ followed by a digit is always an octal number. */
506
507 case '1': case '2': case '3': case '4': case '5':
508 case '6': case '7': case '8': case '9':
509
510 if (!isclass)
511 {
/* NOTE(review): same unbounded decimal accumulation as in the \g case. */
512 oldptr = ptr;
513 c -= '0';
514 while ((digitab[ptr[1]] & ctype_digit) != 0)
515 c = c * 10 + *(++ptr) - '0';
516 if (c < 10 || c <= bracount)
517 {
518 c = -(ESC_REF + c);
519 break;
520 }
521 ptr = oldptr; /* Put the pointer back and fall through */
522 }
523
524 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
525 generates a binary zero byte and treats the digit as a following literal.
526 Thus we have to pull back the pointer by one. */
527
/* c is reloaded from *ptr here because the decimal scan above may have
changed it; ptr is back on the first digit at this point. */
528 if ((c = *ptr) >= '8')
529 {
530 ptr--;
531 c = 0;
532 break;
533 }
534
535 /* \0 always starts an octal number, but we may drop through to here with a
536 larger first octal digit. The original code used just to take the least
537 significant 8 bits of octal numbers (I think this is what early Perls used
538 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
539 than 3 octal digits. */
540
/* Deliberate fall-through from the '1'-'9' case above. i is still 0, so the
loop consumes at most two digits after the first. */
541 case '0':
542 c -= '0';
543 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
544 c = c * 8 + *(++ptr) - '0';
545 if (!utf8 && c > 255) *errorcodeptr = ERR51;
546 break;
547
548 /* \x is complicated. \x{ddd} is a character number which can be greater
549 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
550 treated as a data character. */
551
552 case 'x':
553 if (ptr[1] == '{')
554 {
/* pt scans ahead without moving ptr, so that the plain \x handling below
can still run if no closing brace is found. */
555 const uschar *pt = ptr + 2;
556 int count = 0;
557
558 c = 0;
559 while ((digitab[*pt] & ctype_xdigit) != 0)
560 {
561 register int cc = *pt++;
562 if (c == 0 && cc == '0') continue; /* Leading zeroes */
563 count++;
564
565 #if !EBCDIC /* ASCII coding */
566 if (cc >= 'a') cc -= 32; /* Convert to upper case */
567 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
568 #else /* EBCDIC coding */
569 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
570 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
571 #endif
572 }
573
/* NOTE(review): the "c < 0" test depends on the shifts above having wrapped
into the sign bit, and the digits are not bounded while they are being
accumulated - confirm this is acceptable on the target compilers. */
574 if (*pt == '}')
575 {
576 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
577 ptr = pt;
578 break;
579 }
580
581 /* If the sequence of hex digits does not end with '}', then we don't
582 recognize this construct; fall through to the normal \x handling. */
583 }
584
585 /* Read just a single-byte hex-defined char */
586
/* i is still 0 here, so at most two hex digits are consumed. */
587 c = 0;
588 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
589 {
590 int cc; /* Some compilers don't like ++ */
591 cc = *(++ptr); /* in initializers */
592 #if !EBCDIC /* ASCII coding */
593 if (cc >= 'a') cc -= 32; /* Convert to upper case */
594 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
595 #else /* EBCDIC coding */
596 if (cc <= 'z') cc += 64; /* Convert to upper case */
597 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
598 #endif
599 }
600 break;
601
602 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
603 This coding is ASCII-specific, but then the whole concept of \cx is
604 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
605
606 case 'c':
607 c = *(++ptr);
608 if (c == 0)
609 {
610 *errorcodeptr = ERR2;
611 return 0;
612 }
613
614 #if !EBCDIC /* ASCII coding */
615 if (c >= 'a' && c <= 'z') c -= 32;
616 c ^= 0x40;
617 #else /* EBCDIC coding */
618 if (c >= 'a' && c <= 'z') c += 64;
619 c ^= 0xC0;
620 #endif
621 break;
622
623 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
624 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
625 for Perl compatibility, it is a literal. This code looks a bit odd, but
626 there used to be some cases other than the default, and there may be again
627 in future, so I haven't "optimized" it. */
628
629 default:
630 if ((options & PCRE_EXTRA) != 0) switch(c)
631 {
632 default:
633 *errorcodeptr = ERR3;
634 break;
635 }
636 break;
637 }
638 }
639
/* Normal exit: publish the new pattern position and hand back the result. */
640 *ptrptr = ptr;
641 return c;
642 }
643
644
645
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the Unicode property name that follows \P or \p and look it up in the
_pcre_utt table. This function exists only when PCRE is built with
SUPPORT_UCP. The name is either a single character or a string in braces,
which may start with ^ to request negation. A braced name must fit in the
local 32-byte buffer including its terminating zero.

Arguments:
  ptrptr        points to the pattern pointer; on entry it addresses the P or
                  p, on exit the last character of the escape
  negptr        set TRUE when the braced form starts with ^, else FALSE
  dptr          receives the value field of the matching table entry
  errorcodeptr  receives ERR46 for a malformed sequence or ERR47 for a name
                  that is not in the table

Returns:        the type field of the matching table entry, or -1 on error
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *cursor = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++cursor);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The short form is a single character after the P or p. */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The long form is a name in braces with an optional leading ^. The name is
copied until the closing brace; running out of pattern or out of buffer space
before the brace is seen makes the sequence malformed. */

else
  {
  if (cursor[1] == '^')
    {
    *negptr = TRUE;
    cursor++;
    }

  len = 0;
  for (;;)
    {
    if (len >= (int)sizeof(name) - 1) goto ERROR_RETURN;
    ch = *(++cursor);
    if (ch == 0) goto ERROR_RETURN;
    if (ch == '}') break;
    name[len++] = ch;
    }
  name[len] = 0;
  }

*ptrptr = cursor;

/* Binary chop through the property table, which is compared with strcmp(). */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int order = strcmp(name, _pcre_utt[mid].name);

  if (order == 0)
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }

  if (order > 0) lo = mid + 1;
    else hi = mid;
  }

/* The syntax was fine but the name is unknown; *ptrptr has already been
advanced past the escape. */

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = cursor;
return -1;
}
#endif
735
736
737
738
739 /*************************************************
740 * Check for counted repeat *
741 *************************************************/
742
743 /* This function is called when a '{' is encountered in a place where it might
744 start a quantifier. It looks ahead to see if it really is a quantifier or not.
745 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
746 where the ddds are digits.
747
748 Arguments:
749 p pointer to the first char after '{'
750
751 Returns: TRUE or FALSE
752 */
753
754 static BOOL
755 is_counted_repeat(const uschar *p)
756 {
757 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
758 while ((digitab[*p] & ctype_digit) != 0) p++;
759 if (*p == '}') return TRUE;
760
761 if (*p++ != ',') return FALSE;
762 if (*p == '}') return TRUE;
763
764 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
765 while ((digitab[*p] & ctype_digit) != 0) p++;
766
767 return (*p == '}');
768 }
769
770
771
772 /*************************************************
773 * Read repeat counts *
774 *************************************************/
775
776 /* Read an item of the form {n,m} and return the values. This is called only
777 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
778 so the syntax is guaranteed to be correct, but we need to check the values.
779
780 Arguments:
781 p pointer to first char after '{'
782 minp pointer to int for min
783 maxp pointer to int for max
784 returned as -1 if no max
785 errorcodeptr points to error code variable
786
787 Returns: pointer to '}' on success;
788 current ptr on error, with errorcodeptr set non-zero
789 */
790
791 static const uschar *
792 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
793 {
794 int min = 0;
795 int max = -1;
796
797 /* Read the minimum value and do a paranoid check: a negative value indicates
798 an integer overflow. */
799
800 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
801 if (min < 0 || min > 65535)
802 {
803 *errorcodeptr = ERR5;
804 return p;
805 }
806
807 /* Read the maximum value if there is one, and again do a paranoid on its size.
808 Also, max must not be less than min. */
809
810 if (*p == '}') max = min; else
811 {
812 if (*(++p) != '}')
813 {
814 max = 0;
815 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
816 if (max < 0 || max > 65535)
817 {
818 *errorcodeptr = ERR5;
819 return p;
820 }
821 if (max < min)
822 {
823 *errorcodeptr = ERR4;
824 return p;
825 }
826 }
827 }
828
829 /* Fill in the required variables, and pass back the pointer to the terminating
830 '}'. */
831
832 *minp = min;
833 *maxp = max;
834 return p;
835 }
836
837
838
839 /*************************************************
840 * Find forward referenced subpattern *
841 *************************************************/
842
843 /* This function scans along a pattern's text looking for capturing
844 subpatterns, and counting them. If it finds a named pattern that matches the
845 name it is given, it returns its number. Alternatively, if the name is NULL, it
846 returns when it reaches a given numbered subpattern. This is used for forward
847 references to subpatterns. We know that if (?P< is encountered, the name will
848 be terminated by '>' because that is checked in the first pass.
849
850 Arguments:
851 ptr current position in the pattern
852 count current count of capturing parens so far encountered
853 name name to seek, or NULL if seeking a numbered subpattern
854 lorn name length, or subpattern number if name is NULL
855 xmode TRUE if we are in /x mode
856
857 Returns: the number of the named subpattern, or -1 if not found
858 */
859
860 static int
861 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
862 BOOL xmode)
863 {
864 const uschar *thisname;
865
866 for (; *ptr != 0; ptr++)
867 {
868 int term;
869
870 /* Skip over backslashed characters and also entire \Q...\E */
871
872 if (*ptr == '\\')
873 {
874 if (*(++ptr) == 0) return -1;
875 if (*ptr == 'Q') for (;;)
876 {
877 while (*(++ptr) != 0 && *ptr != '\\');
878 if (*ptr == 0) return -1;
879 if (*(++ptr) == 'E') break;
880 }
881 continue;
882 }
883
884 /* Skip over character classes */
885
886 if (*ptr == '[')
887 {
888 while (*(++ptr) != ']')
889 {
890 if (*ptr == '\\')
891 {
892 if (*(++ptr) == 0) return -1;
893 if (*ptr == 'Q') for (;;)
894 {
895 while (*(++ptr) != 0 && *ptr != '\\');
896 if (*ptr == 0) return -1;
897 if (*(++ptr) == 'E') break;
898 }
899 continue;
900 }
901 }
902 continue;
903 }
904
905 /* Skip comments in /x mode */
906
907 if (xmode && *ptr == '#')
908 {
909 while (*(++ptr) != 0 && *ptr != '\n');
910 if (*ptr == 0) return -1;
911 continue;
912 }
913
914 /* An opening parens must now be a real metacharacter */
915
916 if (*ptr != '(') continue;
917 if (ptr[1] != '?')
918 {
919 count++;
920 if (name == NULL && count == lorn) return count;
921 continue;
922 }
923
924 ptr += 2;
925 if (*ptr == 'P') ptr++; /* Allow optional P */
926
927 /* We have to disambiguate (?<! and (?<= from (?<name> */
928
929 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
930 *ptr != '\'')
931 continue;
932
933 count++;
934
935 if (name == NULL && count == lorn) return count;
936 term = *ptr++;
937 if (term == '<') term = '>';
938 thisname = ptr;
939 while (*ptr != term) ptr++;
940 if (name != NULL && lorn == ptr - thisname &&
941 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
942 return count;
943 }
944
945 return -1;
946 }
947
948
949
950 /*************************************************
951 * Find first significant op code *
952 *************************************************/
953
954 /* This is called by several functions that scan a compiled expression looking
955 for a fixed first character, or an anchoring op code etc. It skips over things
956 that do not influence this. For some calls, a change of option is important.
957 For some calls, it makes sense to skip negative forward and all backward
958 assertions, and also the \b assertion; for others it does not.
959
960 Arguments:
961 code pointer to the start of the group
962 options pointer to external options
963 optbit the option bit whose changing is significant, or
964 zero if none are
965 skipassert TRUE if certain assertions are to be skipped
966
967 Returns: pointer to the first significant opcode
968 */
969
970 static const uschar*
971 first_significant_code(const uschar *code, int *options, int optbit,
972 BOOL skipassert)
973 {
974 for (;;)
975 {
976 switch ((int)*code)
977 {
978 case OP_OPT:
979 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
980 *options = (int)code[1];
981 code += 2;
982 break;
983
984 case OP_ASSERT_NOT:
985 case OP_ASSERTBACK:
986 case OP_ASSERTBACK_NOT:
987 if (!skipassert) return code;
988 do code += GET(code, 1); while (*code == OP_ALT);
989 code += _pcre_OP_lengths[*code];
990 break;
991
992 case OP_WORD_BOUNDARY:
993 case OP_NOT_WORD_BOUNDARY:
994 if (!skipassert) return code;
995 /* Fall through */
996
997 case OP_CALLOUT:
998 case OP_CREF:
999 case OP_RREF:
1000 case OP_DEF:
1001 code += _pcre_OP_lengths[*code];
1002 break;
1003
1004 default:
1005 return code;
1006 }
1007 }
1008 /* Control never reaches here */
1009 }
1010
1011
1012
1013
1014 /*************************************************
1015 * Find the fixed length of a pattern *
1016 *************************************************/
1017
1018 /* Scan a pattern and compute the fixed length of subject that will match it,
1019 if the length is fixed. This is needed for dealing with backward assertions.
1020 In UTF8 mode, the result is in characters rather than bytes.
1021
1022 Arguments:
1023 code points to the start of the pattern (the bracket)
1024 options the compiling options
1025
1026 Returns: the fixed length, or -1 if there is no fixed length,
1027 or -2 if \C was encountered
1028 */
1029
1030 static int
1031 find_fixedlength(uschar *code, int options)
1032 {
1033 int length = -1;
1034
1035 register int branchlength = 0;
1036 register uschar *cc = code + 1 + LINK_SIZE;
1037
1038 /* Scan along the opcodes for this branch. If we get to the end of the
1039 branch, check the length against that of the other branches. */
1040
1041 for (;;)
1042 {
1043 int d;
1044 register int op = *cc;
1045
1046 switch (op)
1047 {
1048 case OP_CBRA:
1049 case OP_BRA:
1050 case OP_ONCE:
1051 case OP_COND:
1052 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1053 if (d < 0) return d;
1054 branchlength += d;
1055 do cc += GET(cc, 1); while (*cc == OP_ALT);
1056 cc += 1 + LINK_SIZE;
1057 break;
1058
1059 /* Reached end of a branch; if it's a ket it is the end of a nested
1060 call. If it's ALT it is an alternation in a nested call. If it is
1061 END it's the end of the outer call. All can be handled by the same code. */
1062
1063 case OP_ALT:
1064 case OP_KET:
1065 case OP_KETRMAX:
1066 case OP_KETRMIN:
1067 case OP_END:
1068 if (length < 0) length = branchlength;
1069 else if (length != branchlength) return -1;
1070 if (*cc != OP_ALT) return length;
1071 cc += 1 + LINK_SIZE;
1072 branchlength = 0;
1073 break;
1074
1075 /* Skip over assertive subpatterns */
1076
1077 case OP_ASSERT:
1078 case OP_ASSERT_NOT:
1079 case OP_ASSERTBACK:
1080 case OP_ASSERTBACK_NOT:
1081 do cc += GET(cc, 1); while (*cc == OP_ALT);
1082 /* Fall through */
1083
1084 /* Skip over things that don't match chars */
1085
1086 case OP_REVERSE:
1087 case OP_CREF:
1088 case OP_RREF:
1089 case OP_DEF:
1090 case OP_OPT:
1091 case OP_CALLOUT:
1092 case OP_SOD:
1093 case OP_SOM:
1094 case OP_EOD:
1095 case OP_EODN:
1096 case OP_CIRC:
1097 case OP_DOLL:
1098 case OP_NOT_WORD_BOUNDARY:
1099 case OP_WORD_BOUNDARY:
1100 cc += _pcre_OP_lengths[*cc];
1101 break;
1102
1103 /* Handle literal characters */
1104
1105 case OP_CHAR:
1106 case OP_CHARNC:
1107 case OP_NOT:
1108 branchlength++;
1109 cc += 2;
1110 #ifdef SUPPORT_UTF8
1111 if ((options & PCRE_UTF8) != 0)
1112 {
1113 while ((*cc & 0xc0) == 0x80) cc++;
1114 }
1115 #endif
1116 break;
1117
1118 /* Handle exact repetitions. The count is already in characters, but we
1119 need to skip over a multibyte character in UTF8 mode. */
1120
1121 case OP_EXACT:
1122 branchlength += GET2(cc,1);
1123 cc += 4;
1124 #ifdef SUPPORT_UTF8
1125 if ((options & PCRE_UTF8) != 0)
1126 {
1127 while((*cc & 0x80) == 0x80) cc++;
1128 }
1129 #endif
1130 break;
1131
1132 case OP_TYPEEXACT:
1133 branchlength += GET2(cc,1);
1134 cc += 4;
1135 break;
1136
1137 /* Handle single-char matchers */
1138
1139 case OP_PROP:
1140 case OP_NOTPROP:
1141 cc += 2;
1142 /* Fall through */
1143
1144 case OP_NOT_DIGIT:
1145 case OP_DIGIT:
1146 case OP_NOT_WHITESPACE:
1147 case OP_WHITESPACE:
1148 case OP_NOT_WORDCHAR:
1149 case OP_WORDCHAR:
1150 case OP_ANY:
1151 branchlength++;
1152 cc++;
1153 break;
1154
1155 /* The single-byte matcher isn't allowed */
1156
1157 case OP_ANYBYTE:
1158 return -2;
1159
1160 /* Check a class for variable quantification */
1161
1162 #ifdef SUPPORT_UTF8
1163 case OP_XCLASS:
1164 cc += GET(cc, 1) - 33;
1165 /* Fall through */
1166 #endif
1167
1168 case OP_CLASS:
1169 case OP_NCLASS:
1170 cc += 33;
1171
1172 switch (*cc)
1173 {
1174 case OP_CRSTAR:
1175 case OP_CRMINSTAR:
1176 case OP_CRQUERY:
1177 case OP_CRMINQUERY:
1178 return -1;
1179
1180 case OP_CRRANGE:
1181 case OP_CRMINRANGE:
1182 if (GET2(cc,1) != GET2(cc,3)) return -1;
1183 branchlength += GET2(cc,1);
1184 cc += 5;
1185 break;
1186
1187 default:
1188 branchlength++;
1189 }
1190 break;
1191
1192 /* Anything else is variable length */
1193
1194 default:
1195 return -1;
1196 }
1197 }
1198 /* Control never gets here */
1199 }
1200
1201
1202
1203
1204 /*************************************************
1205 * Scan compiled regex for numbered bracket *
1206 *************************************************/
1207
1208 /* This little function scans through a compiled pattern until it finds a
1209 capturing bracket with the given number.
1210
1211 Arguments:
1212 code points to start of expression
1213 utf8 TRUE in UTF-8 mode
1214 number the required bracket number
1215
1216 Returns: pointer to the opcode for the bracket, or NULL if not found
1217 */
1218
1219 static const uschar *
1220 find_bracket(const uschar *code, BOOL utf8, int number)
1221 {
1222 for (;;)
1223 {
1224 register int c = *code;
1225 if (c == OP_END) return NULL;
1226
1227 /* XCLASS is used for classes that cannot be represented just by a bit
1228 map. This includes negated single high-valued characters. The length in
1229 the table is zero; the actual length is stored in the compiled code. */
1230
1231 if (c == OP_XCLASS) code += GET(code, 1);
1232
1233 /* Handle capturing bracket */
1234
1235 else if (c == OP_CBRA)
1236 {
1237 int n = GET2(code, 1+LINK_SIZE);
1238 if (n == number) return (uschar *)code;
1239 code += _pcre_OP_lengths[c];
1240 }
1241
1242 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1243 a multi-byte character. The length in the table is a minimum, so we have to
1244 arrange to skip the extra bytes. */
1245
1246 else
1247 {
1248 code += _pcre_OP_lengths[c];
1249 if (utf8) switch(c)
1250 {
1251 case OP_CHAR:
1252 case OP_CHARNC:
1253 case OP_EXACT:
1254 case OP_UPTO:
1255 case OP_MINUPTO:
1256 case OP_POSUPTO:
1257 case OP_STAR:
1258 case OP_MINSTAR:
1259 case OP_POSSTAR:
1260 case OP_PLUS:
1261 case OP_MINPLUS:
1262 case OP_POSPLUS:
1263 case OP_QUERY:
1264 case OP_MINQUERY:
1265 case OP_POSQUERY:
1266 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1267 break;
1268 }
1269 }
1270 }
1271 }
1272
1273
1274
1275 /*************************************************
1276 * Scan compiled regex for recursion reference *
1277 *************************************************/
1278
1279 /* This little function scans through a compiled pattern until it finds an
1280 instance of OP_RECURSE.
1281
1282 Arguments:
1283 code points to start of expression
1284 utf8 TRUE in UTF-8 mode
1285
1286 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1287 */
1288
1289 static const uschar *
1290 find_recurse(const uschar *code, BOOL utf8)
1291 {
1292 for (;;)
1293 {
1294 register int c = *code;
1295 if (c == OP_END) return NULL;
1296 if (c == OP_RECURSE) return code;
1297
1298 /* XCLASS is used for classes that cannot be represented just by a bit
1299 map. This includes negated single high-valued characters. The length in
1300 the table is zero; the actual length is stored in the compiled code. */
1301
1302 if (c == OP_XCLASS) code += GET(code, 1);
1303
1304 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1305 that are followed by a character may be followed by a multi-byte character.
1306 The length in the table is a minimum, so we have to arrange to skip the extra
1307 bytes. */
1308
1309 else
1310 {
1311 code += _pcre_OP_lengths[c];
1312 if (utf8) switch(c)
1313 {
1314 case OP_CHAR:
1315 case OP_CHARNC:
1316 case OP_EXACT:
1317 case OP_UPTO:
1318 case OP_MINUPTO:
1319 case OP_POSUPTO:
1320 case OP_STAR:
1321 case OP_MINSTAR:
1322 case OP_POSSTAR:
1323 case OP_PLUS:
1324 case OP_MINPLUS:
1325 case OP_POSPLUS:
1326 case OP_QUERY:
1327 case OP_MINQUERY:
1328 case OP_POSQUERY:
1329 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1330 break;
1331 }
1332 }
1333 }
1334 }
1335
1336
1337
1338 /*************************************************
1339 * Scan compiled branch for non-emptiness *
1340 *************************************************/
1341
1342 /* This function scans through a branch of a compiled pattern to see whether it
1343 can match the empty string or not. It is called from could_be_empty()
1344 below and from compile_branch() when checking for an unlimited repeat of a
1345 group that can match nothing. Note that first_significant_code() skips over
1346 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1347 struck an inner bracket whose current branch will already have been scanned.
1348
1349 Arguments:
1350 code points to start of search
1351 endcode points to where to stop
1352 utf8 TRUE if in UTF8 mode
1353
1354 Returns: TRUE if what is matched could be empty
1355 */
1356
1357 static BOOL
1358 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1359 {
1360 register int c;
1361 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1362 code < endcode;
1363 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1364 {
1365 const uschar *ccode;
1366
1367 c = *code;
1368
1369 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1370 {
1371 BOOL empty_branch;
1372 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1373
1374 /* Scan a closed bracket */
1375
1376 empty_branch = FALSE;
1377 do
1378 {
1379 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1380 empty_branch = TRUE;
1381 code += GET(code, 1);
1382 }
1383 while (*code == OP_ALT);
1384 if (!empty_branch) return FALSE; /* All branches are non-empty */
1385
1386 /* Move past the KET and fudge things so that the increment in the "for"
1387 above has no effect. */
1388
1389 c = OP_END;
1390 code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
1391 continue;
1392 }
1393
1394 /* Handle the other opcodes */
1395
1396 switch (c)
1397 {
1398 /* Check for quantifiers after a class */
1399
1400 #ifdef SUPPORT_UTF8
1401 case OP_XCLASS:
1402 ccode = code + GET(code, 1);
1403 goto CHECK_CLASS_REPEAT;
1404 #endif
1405
1406 case OP_CLASS:
1407 case OP_NCLASS:
1408 ccode = code + 33;
1409
1410 #ifdef SUPPORT_UTF8
1411 CHECK_CLASS_REPEAT:
1412 #endif
1413
1414 switch (*ccode)
1415 {
1416 case OP_CRSTAR: /* These could be empty; continue */
1417 case OP_CRMINSTAR:
1418 case OP_CRQUERY:
1419 case OP_CRMINQUERY:
1420 break;
1421
1422 default: /* Non-repeat => class must match */
1423 case OP_CRPLUS: /* These repeats aren't empty */
1424 case OP_CRMINPLUS:
1425 return FALSE;
1426
1427 case OP_CRRANGE:
1428 case OP_CRMINRANGE:
1429 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1430 break;
1431 }
1432 break;
1433
1434 /* Opcodes that must match a character */
1435
1436 case OP_PROP:
1437 case OP_NOTPROP:
1438 case OP_EXTUNI:
1439 case OP_NOT_DIGIT:
1440 case OP_DIGIT:
1441 case OP_NOT_WHITESPACE:
1442 case OP_WHITESPACE:
1443 case OP_NOT_WORDCHAR:
1444 case OP_WORDCHAR:
1445 case OP_ANY:
1446 case OP_ANYBYTE:
1447 case OP_CHAR:
1448 case OP_CHARNC:
1449 case OP_NOT:
1450 case OP_PLUS:
1451 case OP_MINPLUS:
1452 case OP_POSPLUS:
1453 case OP_EXACT:
1454 case OP_NOTPLUS:
1455 case OP_NOTMINPLUS:
1456 case OP_NOTPOSPLUS:
1457 case OP_NOTEXACT:
1458 case OP_TYPEPLUS:
1459 case OP_TYPEMINPLUS:
1460 case OP_TYPEPOSPLUS:
1461 case OP_TYPEEXACT:
1462 return FALSE;
1463
1464 /* End of branch */
1465
1466 case OP_KET:
1467 case OP_KETRMAX:
1468 case OP_KETRMIN:
1469 case OP_ALT:
1470 return TRUE;
1471
1472 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1473 MINUPTO, and POSUPTO may be followed by a multibyte character */
1474
1475 #ifdef SUPPORT_UTF8
1476 case OP_STAR:
1477 case OP_MINSTAR:
1478 case OP_POSSTAR:
1479 case OP_QUERY:
1480 case OP_MINQUERY:
1481 case OP_POSQUERY:
1482 case OP_UPTO:
1483 case OP_MINUPTO:
1484 case OP_POSUPTO:
1485 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1486 break;
1487 #endif
1488 }
1489 }
1490
1491 return TRUE;
1492 }
1493
1494
1495
1496 /*************************************************
1497 * Scan compiled regex for non-emptiness *
1498 *************************************************/
1499
1500 /* This function is called to check for left recursive calls. We want to check
1501 the current branch of the current pattern to see if it could match the empty
1502 string. If it could, we must look outwards for branches at other levels,
1503 stopping when we pass beyond the bracket which is the subject of the recursion.
1504
1505 Arguments:
1506 code points to start of the recursion
1507 endcode points to where to stop (current RECURSE item)
1508 bcptr points to the chain of current (unclosed) branch starts
1509 utf8 TRUE if in UTF-8 mode
1510
1511 Returns: TRUE if what is matched could be empty
1512 */
1513
1514 static BOOL
1515 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1516 BOOL utf8)
1517 {
1518 while (bcptr != NULL && bcptr->current >= code)
1519 {
1520 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1521 bcptr = bcptr->outer;
1522 }
1523 return TRUE;
1524 }
1525
1526
1527
1528 /*************************************************
1529 * Check for POSIX class syntax *
1530 *************************************************/
1531
1532 /* This function is called when the sequence "[:" or "[." or "[=" is
1533 encountered in a character class. It checks whether this is followed by an
1534 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1535 ".]" or "=]".
1536
1537 Argument:
1538 ptr pointer to the initial [
1539 endptr where to return the end pointer
1540 cd pointer to compile data
1541
1542 Returns: TRUE or FALSE
1543 */
1544
1545 static BOOL
1546 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1547 {
1548 int terminator; /* Don't combine these lines; the Solaris cc */
1549 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1550 if (*(++ptr) == '^') ptr++;
1551 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1552 if (*ptr == terminator && ptr[1] == ']')
1553 {
1554 *endptr = ptr;
1555 return TRUE;
1556 }
1557 return FALSE;
1558 }
1559
1560
1561
1562
1563 /*************************************************
1564 * Check POSIX class name *
1565 *************************************************/
1566
1567 /* This function is called to check the name given in a POSIX-style class entry
1568 such as [:alnum:].
1569
1570 Arguments:
1571 ptr points to the first letter
1572 len the length of the name
1573
1574 Returns: a value representing the name, or -1 if unknown
1575 */
1576
1577 static int
1578 check_posix_name(const uschar *ptr, int len)
1579 {
1580 register int yield = 0;
1581 while (posix_name_lengths[yield] != 0)
1582 {
1583 if (len == posix_name_lengths[yield] &&
1584 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1585 yield++;
1586 }
1587 return -1;
1588 }
1589
1590
1591 /*************************************************
1592 * Adjust OP_RECURSE items in repeated group *
1593 *************************************************/
1594
1595 /* OP_RECURSE items contain an offset from the start of the regex to the group
1596 that is referenced. This means that groups can be replicated for fixed
1597 repetition simply by copying (because the recursion is allowed to refer to
1598 earlier groups that are outside the current group). However, when a group is
1599 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1600 it, after it has been compiled. This means that any OP_RECURSE items within it
1601 that refer to the group itself or any contained groups have to have their
1602 offsets adjusted. That one of the jobs of this function. Before it is called,
1603 the partially compiled regex must be temporarily terminated with OP_END.
1604
1605 This function has been extended with the possibility of forward references for
1606 recursions and subroutine calls. It must also check the list of such references
1607 for the group we are dealing with. If it finds that one of the recursions in
1608 the current group is on this list, it adjusts the offset in the list, not the
1609 value in the reference (which is a group number).
1610
1611 Arguments:
1612 group points to the start of the group
1613 adjust the amount by which the group is to be moved
1614 utf8 TRUE in UTF-8 mode
1615 cd contains pointers to tables etc.
1616 save_hwm the hwm forward reference pointer at the start of the group
1617
1618 Returns: nothing
1619 */
1620
1621 static void
1622 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1623 uschar *save_hwm)
1624 {
1625 uschar *ptr = group;
1626 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1627 {
1628 int offset;
1629 uschar *hc;
1630
1631 /* See if this recursion is on the forward reference list. If so, adjust the
1632 reference. */
1633
1634 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1635 {
1636 offset = GET(hc, 0);
1637 if (cd->start_code + offset == ptr + 1)
1638 {
1639 PUT(hc, 0, offset + adjust);
1640 break;
1641 }
1642 }
1643
1644 /* Otherwise, adjust the recursion offset if it's after the start of this
1645 group. */
1646
1647 if (hc >= cd->hwm)
1648 {
1649 offset = GET(ptr, 1);
1650 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1651 }
1652
1653 ptr += 1 + LINK_SIZE;
1654 }
1655 }
1656
1657
1658
1659 /*************************************************
1660 * Insert an automatic callout point *
1661 *************************************************/
1662
1663 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1664 callout points before each pattern item.
1665
1666 Arguments:
1667 code current code pointer
1668 ptr current pattern pointer
1669 cd pointers to tables etc
1670
1671 Returns: new code pointer
1672 */
1673
1674 static uschar *
1675 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1676 {
1677 *code++ = OP_CALLOUT;
1678 *code++ = 255;
1679 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1680 PUT(code, LINK_SIZE, 0); /* Default length */
1681 return code + 2*LINK_SIZE;
1682 }
1683
1684
1685
1686 /*************************************************
1687 * Complete a callout item *
1688 *************************************************/
1689
1690 /* A callout item contains the length of the next item in the pattern, which
1691 we can't fill in till after we have reached the relevant point. This is used
1692 for both automatic and manual callouts.
1693
1694 Arguments:
1695 previous_callout points to previous callout item
1696 ptr current pattern pointer
1697 cd pointers to tables etc
1698
1699 Returns: nothing
1700 */
1701
1702 static void
1703 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1704 {
1705 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1706 PUT(previous_callout, 2 + LINK_SIZE, length);
1707 }
1708
1709
1710
1711 #ifdef SUPPORT_UCP
1712 /*************************************************
1713 * Get othercase range *
1714 *************************************************/
1715
1716 /* This function is passed the start and end of a class range, in UTF-8 mode
1717 with UCP support. It searches up the characters, looking for internal ranges of
1718 characters in the "other" case. Each call returns the next one, updating the
1719 start address.
1720
1721 Arguments:
1722 cptr points to starting character value; updated
1723 d end value
1724 ocptr where to put start of othercase range
1725 odptr where to put end of othercase range
1726
1727 Yield: TRUE when range returned; FALSE when no more
1728 */
1729
1730 static BOOL
1731 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1732 unsigned int *odptr)
1733 {
1734 unsigned int c, othercase, next;
1735
1736 for (c = *cptr; c <= d; c++)
1737 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1738
1739 if (c > d) return FALSE;
1740
1741 *ocptr = othercase;
1742 next = othercase + 1;
1743
1744 for (++c; c <= d; c++)
1745 {
1746 if (_pcre_ucp_othercase(c) != next) break;
1747 next++;
1748 }
1749
1750 *odptr = next - 1;
1751 *cptr = c;
1752
1753 return TRUE;
1754 }
1755 #endif /* SUPPORT_UCP */
1756
1757
1758
1759 /*************************************************
1760 * Check if auto-possessifying is possible *
1761 *************************************************/
1762
1763 /* This function is called for unlimited repeats of certain items, to see
1764 whether the next thing could possibly match the repeated item. If not, it makes
1765 sense to automatically possessify the repeated item.
1766
1767 Arguments:
1768 op_code the repeated op code
1769 this data for this item, depends on the opcode
1770 utf8 TRUE in UTF-8 mode
1771 utf8_char used for utf8 character bytes, NULL if not relevant
1772 ptr next character in pattern
1773 options options bits
1774 cd contains pointers to tables etc.
1775
1776 Returns: TRUE if possessifying is wanted
1777 */
1778
1779 static BOOL
1780 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1781 const uschar *ptr, int options, compile_data *cd)
1782 {
1783 int next;
1784
1785 /* Skip whitespace and comments in extended mode */
1786
1787 if ((options & PCRE_EXTENDED) != 0)
1788 {
1789 for (;;)
1790 {
1791 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1792 if (*ptr == '#')
1793 {
1794 while (*(++ptr) != 0)
1795 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1796 }
1797 else break;
1798 }
1799 }
1800
1801 /* If the next item is one that we can handle, get its value. A non-negative
1802 value is a character, a negative value is an escape value. */
1803
1804 if (*ptr == '\\')
1805 {
1806 int temperrorcode = 0;
1807 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1808 if (temperrorcode != 0) return FALSE;
1809 ptr++; /* Point after the escape sequence */
1810 }
1811
1812 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1813 {
1814 #ifdef SUPPORT_UTF8
1815 if (utf8) { GETCHARINC(next, ptr); } else
1816 #endif
1817 next = *ptr++;
1818 }
1819
1820 else return FALSE;
1821
1822 /* Skip whitespace and comments in extended mode */
1823
1824 if ((options & PCRE_EXTENDED) != 0)
1825 {
1826 for (;;)
1827 {
1828 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1829 if (*ptr == '#')
1830 {
1831 while (*(++ptr) != 0)
1832 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1833 }
1834 else break;
1835 }
1836 }
1837
1838 /* If the next thing is itself optional, we have to give up. */
1839
1840 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1841 return FALSE;
1842
1843 /* Now compare the next item with the previous opcode. If the previous is a
1844 positive single character match, "item" either contains the character or, if
1845 "item" is greater than 127 in utf8 mode, the character's bytes are in
1846 utf8_char. */
1847
1848
1849 /* Handle cases when the next item is a character. */
1850
1851 if (next >= 0) switch(op_code)
1852 {
1853 case OP_CHAR:
1854 #ifdef SUPPORT_UTF8
1855 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1856 #endif
1857 return item != next;
1858
1859 /* For CHARNC (caseless character) we must check the other case. If we have
1860 Unicode property support, we can use it to test the other case of
1861 high-valued characters. */
1862
1863 case OP_CHARNC:
1864 #ifdef SUPPORT_UTF8
1865 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1866 #endif
1867 if (item == next) return FALSE;
1868 #ifdef SUPPORT_UTF8
1869 if (utf8)
1870 {
1871 unsigned int othercase;
1872 if (next < 128) othercase = cd->fcc[next]; else
1873 #ifdef SUPPORT_UCP
1874 othercase = _pcre_ucp_othercase((unsigned int)next);
1875 #else
1876 othercase = NOTACHAR;
1877 #endif
1878 return (unsigned int)item != othercase;
1879 }
1880 else
1881 #endif /* SUPPORT_UTF8 */
1882 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1883
1884 /* For OP_NOT, "item" must be a single-byte character. */
1885
1886 case OP_NOT:
1887 if (next < 0) return FALSE; /* Not a character */
1888 if (item == next) return TRUE;
1889 if ((options & PCRE_CASELESS) == 0) return FALSE;
1890 #ifdef SUPPORT_UTF8
1891 if (utf8)
1892 {
1893 unsigned int othercase;
1894 if (next < 128) othercase = cd->fcc[next]; else
1895 #ifdef SUPPORT_UCP
1896 othercase = _pcre_ucp_othercase(next);
1897 #else
1898 othercase = NOTACHAR;
1899 #endif
1900 return (unsigned int)item == othercase;
1901 }
1902 else
1903 #endif /* SUPPORT_UTF8 */
1904 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1905
1906 case OP_DIGIT:
1907 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1908
1909 case OP_NOT_DIGIT:
1910 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1911
1912 case OP_WHITESPACE:
1913 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1914
1915 case OP_NOT_WHITESPACE:
1916 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1917
1918 case OP_WORDCHAR:
1919 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1920
1921 case OP_NOT_WORDCHAR:
1922 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1923
1924 default:
1925 return FALSE;
1926 }
1927
1928
1929 /* Handle the case when the next item is \d, \s, etc. */
1930
1931 switch(op_code)
1932 {
1933 case OP_CHAR:
1934 case OP_CHARNC:
1935 #ifdef SUPPORT_UTF8
1936 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1937 #endif
1938 switch(-next)
1939 {
1940 case ESC_d:
1941 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1942
1943 case ESC_D:
1944 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1945
1946 case ESC_s:
1947 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1948
1949 case ESC_S:
1950 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1951
1952 case ESC_w:
1953 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1954
1955 case ESC_W:
1956 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1957
1958 default:
1959 return FALSE;
1960 }
1961
1962 case OP_DIGIT:
1963 return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1964
1965 case OP_NOT_DIGIT:
1966 return next == -ESC_d;
1967
1968 case OP_WHITESPACE:
1969 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1970
1971 case OP_NOT_WHITESPACE:
1972 return next == -ESC_s;
1973
1974 case OP_WORDCHAR:
1975 return next == -ESC_W || next == -ESC_s;
1976
1977 case OP_NOT_WORDCHAR:
1978 return next == -ESC_w || next == -ESC_d;
1979
1980 default:
1981 return FALSE;
1982 }
1983
1984 /* Control does not reach here */
1985 }
1986
1987
1988
1989 /*************************************************
1990 * Compile one branch *
1991 *************************************************/
1992
1993 /* Scan the pattern, compiling it into a vector. If the options are
1994 changed during the branch, the pointer is used to change the external options
1995 bits. This function is used during the pre-compile phase when we are trying
1996 to find out the amount of memory needed, as well as during the real compile
1997 phase. The value of lengthptr distinguishes the two phases.
1998
1999 Arguments:
2000 optionsptr pointer to the option bits
2001 codeptr points to the pointer to the current code point
2002 ptrptr points to the current pattern pointer
2003 errorcodeptr points to error code variable
2004 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2005 reqbyteptr set to the last literal character required, else < 0
2006 bcptr points to current branch chain
2007 cd contains pointers to tables etc.
2008 lengthptr NULL during the real compile phase
2009 points to length accumulator during pre-compile phase
2010
2011 Returns: TRUE on success
2012 FALSE, with *errorcodeptr set non-zero on error
2013 */
2014
2015 static BOOL
2016 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2017 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2018 compile_data *cd, int *lengthptr)
2019 {
2020 int repeat_type, op_type;
2021 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2022 int bravalue = 0;
2023 int greedy_default, greedy_non_default;
2024 int firstbyte, reqbyte;
2025 int zeroreqbyte, zerofirstbyte;
2026 int req_caseopt, reqvary, tempreqvary;
2027 int options = *optionsptr;
2028 int after_manual_callout = 0;
2029 int length_prevgroup = 0;
2030 register int c;
2031 register uschar *code = *codeptr;
2032 uschar *last_code = code;
2033 uschar *orig_code = code;
2034 uschar *tempcode;
2035 BOOL inescq = FALSE;
2036 BOOL groupsetfirstbyte = FALSE;
2037 const uschar *ptr = *ptrptr;
2038 const uschar *tempptr;
2039 uschar *previous = NULL;
2040 uschar *previous_callout = NULL;
2041 uschar *save_hwm = NULL;
2042 uschar classbits[32];
2043
2044 #ifdef SUPPORT_UTF8
2045 BOOL class_utf8;
2046 BOOL utf8 = (options & PCRE_UTF8) != 0;
2047 uschar *class_utf8data;
2048 uschar utf8_char[6];
2049 #else
2050 BOOL utf8 = FALSE;
2051 uschar *utf8_char = NULL;
2052 #endif
2053
2054 #ifdef DEBUG
2055 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2056 #endif
2057
2058 /* Set up the default and non-default settings for greediness */
2059
2060 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2061 greedy_non_default = greedy_default ^ 1;
2062
2063 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2064 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2065 matches a non-fixed first char; reqbyte just remains unset if we never
2066 find one.
2067
2068 When we hit a repeat whose minimum is zero, we may have to adjust these values
2069 to take the zero repeat into account. This is implemented by setting them to
2070 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2071 item types that can be repeated set these backoff variables appropriately. */
2072
2073 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2074
2075 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2076 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2077 value > 255. It is added into the firstbyte or reqbyte variables to record the
2078 case status of the value. This is used only for ASCII characters. */
2079
2080 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2081
2082 /* Switch on next character until the end of the branch */
2083
2084 for (;; ptr++)
2085 {
2086 BOOL negate_class;
2087 BOOL possessive_quantifier;
2088 BOOL is_quantifier;
2089 BOOL is_recurse;
2090 int class_charcount;
2091 int class_lastchar;
2092 int newoptions;
2093 int recno;
2094 int skipbytes;
2095 int subreqbyte;
2096 int subfirstbyte;
2097 int terminator;
2098 int mclength;
2099 uschar mcbuffer[8];
2100
2101 /* Get next byte in the pattern */
2102
2103 c = *ptr;
2104
2105 /* If we are in the pre-compile phase, accumulate the length used for the
2106 previous cycle of this loop. */
2107
2108 if (lengthptr != NULL)
2109 {
2110 #ifdef DEBUG
2111 if (code > cd->hwm) cd->hwm = code; /* High water info */
2112 #endif
2113 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2114 {
2115 *errorcodeptr = ERR52;
2116 goto FAILED;
2117 }
2118
2119 /* There is at least one situation where code goes backwards: this is the
2120 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2121 the class is simply eliminated. However, it is created first, so we have to
2122 allow memory for it. Therefore, don't ever reduce the length at this point.
2123 */
2124
2125 if (code < last_code) code = last_code;
2126 *lengthptr += code - last_code;
2127 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2128
2129 /* If "previous" is set and it is not at the start of the work space, move
2130 it back to there, in order to avoid filling up the work space. Otherwise,
2131 if "previous" is NULL, reset the current code pointer to the start. */
2132
2133 if (previous != NULL)
2134 {
2135 if (previous > orig_code)
2136 {
2137 memmove(orig_code, previous, code - previous);
2138 code -= previous - orig_code;
2139 previous = orig_code;
2140 }
2141 }
2142 else code = orig_code;
2143
2144 /* Remember where this code item starts so we can pick up the length
2145 next time round. */
2146
2147 last_code = code;
2148 }
2149
2150 /* In the real compile phase, just check the workspace used by the forward
2151 reference list. */
2152
2153 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2154 {
2155 *errorcodeptr = ERR52;
2156 goto FAILED;
2157 }
2158
2159 /* If in \Q...\E, check for the end; if not, we have a literal */
2160
2161 if (inescq && c != 0)
2162 {
2163 if (c == '\\' && ptr[1] == 'E')
2164 {
2165 inescq = FALSE;
2166 ptr++;
2167 continue;
2168 }
2169 else
2170 {
2171 if (previous_callout != NULL)
2172 {
2173 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2174 complete_callout(previous_callout, ptr, cd);
2175 previous_callout = NULL;
2176 }
2177 if ((options & PCRE_AUTO_CALLOUT) != 0)
2178 {
2179 previous_callout = code;
2180 code = auto_callout(code, ptr, cd);
2181 }
2182 goto NORMAL_CHAR;
2183 }
2184 }
2185
2186 /* Fill in length of a previous callout, except when the next thing is
2187 a quantifier. */
2188
2189 is_quantifier = c == '*' || c == '+' || c == '?' ||
2190 (c == '{' && is_counted_repeat(ptr+1));
2191
2192 if (!is_quantifier && previous_callout != NULL &&
2193 after_manual_callout-- <= 0)
2194 {
2195 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2196 complete_callout(previous_callout, ptr, cd);
2197 previous_callout = NULL;
2198 }
2199
2200 /* In extended mode, skip white space and comments */
2201
2202 if ((options & PCRE_EXTENDED) != 0)
2203 {
2204 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2205 if (c == '#')
2206 {
2207 while (*(++ptr) != 0)
2208 {
2209 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2210 }
2211 if (*ptr != 0) continue;
2212
2213 /* Else fall through to handle end of string */
2214 c = 0;
2215 }
2216 }
2217
2218 /* No auto callout for quantifiers. */
2219
2220 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2221 {
2222 previous_callout = code;
2223 code = auto_callout(code, ptr, cd);
2224 }
2225
2226 switch(c)
2227 {
2228 /* ===================================================================*/
2229 case 0: /* The branch terminates at string end */
2230 case '|': /* or | or ) */
2231 case ')':
2232 *firstbyteptr = firstbyte;
2233 *reqbyteptr = reqbyte;
2234 *codeptr = code;
2235 *ptrptr = ptr;
2236 if (lengthptr != NULL)
2237 {
2238 *lengthptr += code - last_code; /* To include callout length */
2239 DPRINTF((">> end branch\n"));
2240 }
2241 return TRUE;
2242
2243
2244 /* ===================================================================*/
2245 /* Handle single-character metacharacters. In multiline mode, ^ disables
2246 the setting of any following char as a first character. */
2247
2248 case '^':
2249 if ((options & PCRE_MULTILINE) != 0)
2250 {
2251 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2252 }
2253 previous = NULL;
2254 *code++ = OP_CIRC;
2255 break;
2256
2257 case '$':
2258 previous = NULL;
2259 *code++ = OP_DOLL;
2260 break;
2261
2262 /* There can never be a first char if '.' is first, whatever happens about
2263 repeats. The value of reqbyte doesn't change either. */
2264
2265 case '.':
2266 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2267 zerofirstbyte = firstbyte;
2268 zeroreqbyte = reqbyte;
2269 previous = code;
2270 *code++ = OP_ANY;
2271 break;
2272
2273
2274 /* ===================================================================*/
2275 /* Character classes. If the included characters are all < 256, we build a
2276 32-byte bitmap of the permitted characters, except in the special case
2277 where there is only one such character. For negated classes, we build the
2278 map as usual, then invert it at the end. However, we use a different opcode
2279 so that data characters > 255 can be handled correctly.
2280
2281 If the class contains characters outside the 0-255 range, a different
2282 opcode is compiled. It may optionally have a bit map for characters < 256,
2283 but those above are explicitly listed afterwards. A flag byte tells
2284 whether the bitmap is present, and whether this is a negated class or not.
2285 */
2286
2287 case '[':
2288 previous = code;
2289
2290 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2291 they are encountered at the top level, so we'll do that too. */
2292
2293 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2294 check_posix_syntax(ptr, &tempptr, cd))
2295 {
2296 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2297 goto FAILED;
2298 }
2299
2300 /* If the first character is '^', set the negation flag and skip it. */
2301
2302 if ((c = *(++ptr)) == '^')
2303 {
2304 negate_class = TRUE;
2305 c = *(++ptr);
2306 }
2307 else
2308 {
2309 negate_class = FALSE;
2310 }
2311
2312 /* Keep a count of chars with values < 256 so that we can optimize the case
2313 of just a single character (as long as it's < 256). However, for higher
2314 valued UTF-8 characters, we don't yet do any optimization. */
2315
2316 class_charcount = 0;
2317 class_lastchar = -1;
2318
2319 /* Initialize the 32-char bit map to all zeros. We build the map in a
2320 temporary bit of memory, in case the class contains only 1 character (less
2321 than 256), because in that case the compiled code doesn't use the bit map.
2322 */
2323
2324 memset(classbits, 0, 32 * sizeof(uschar));
2325
2326 #ifdef SUPPORT_UTF8
2327 class_utf8 = FALSE; /* No chars >= 256 */
2328 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2329 #endif
2330
2331 /* Process characters until ] is reached. By writing this as a "do" it
2332 means that an initial ] is taken as a data character. At the start of the
2333 loop, c contains the first byte of the character. */
2334
2335 if (c != 0) do
2336 {
2337 const uschar *oldptr;
2338
2339 #ifdef SUPPORT_UTF8
2340 if (utf8 && c > 127)
2341 { /* Braces are required because the */
2342 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2343 }
2344 #endif
2345
2346 /* Inside \Q...\E everything is literal except \E */
2347
2348 if (inescq)
2349 {
2350 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2351 {
2352 inescq = FALSE; /* Reset literal state */
2353 ptr++; /* Skip the 'E' */
2354 continue; /* Carry on with next */
2355 }
2356 goto CHECK_RANGE; /* Could be range if \E follows */
2357 }
2358
2359 /* Handle POSIX class names. Perl allows a negation extension of the
2360 form [:^name:]. A square bracket that doesn't match the syntax is
2361 treated as a literal. We also recognize the POSIX constructions
2362 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2363 5.6 and 5.8 do. */
2364
2365 if (c == '[' &&
2366 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2367 check_posix_syntax(ptr, &tempptr, cd))
2368 {
2369 BOOL local_negate = FALSE;
2370 int posix_class, taboffset, tabopt;
2371 register const uschar *cbits = cd->cbits;
2372 uschar pbits[32];
2373
2374 if (ptr[1] != ':')
2375 {
2376 *errorcodeptr = ERR31;
2377 goto FAILED;
2378 }
2379
2380 ptr += 2;
2381 if (*ptr == '^')
2382 {
2383 local_negate = TRUE;
2384 ptr++;
2385 }
2386
2387 posix_class = check_posix_name(ptr, tempptr - ptr);
2388 if (posix_class < 0)
2389 {
2390 *errorcodeptr = ERR30;
2391 goto FAILED;
2392 }
2393
2394 /* If matching is caseless, upper and lower are converted to
2395 alpha. This relies on the fact that the class table starts with
2396 alpha, lower, upper as the first 3 entries. */
2397
2398 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2399 posix_class = 0;
2400
2401 /* We build the bit map for the POSIX class in a chunk of local store
2402 because we may be adding and subtracting from it, and we don't want to
2403 subtract bits that may be in the main map already. At the end we or the
2404 result into the bit map that is being built. */
2405
2406 posix_class *= 3;
2407
2408 /* Copy in the first table (always present) */
2409
2410 memcpy(pbits, cbits + posix_class_maps[posix_class],
2411 32 * sizeof(uschar));
2412
2413 /* If there is a second table, add or remove it as required. */
2414
2415 taboffset = posix_class_maps[posix_class + 1];
2416 tabopt = posix_class_maps[posix_class + 2];
2417
2418 if (taboffset >= 0)
2419 {
2420 if (tabopt >= 0)
2421 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2422 else
2423 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2424 }
2425
2426 /* Now see if we need to remove any special characters. An option
2427 value of 1 removes vertical space and 2 removes underscore. */
2428
2429 if (tabopt < 0) tabopt = -tabopt;
2430 if (tabopt == 1) pbits[1] &= ~0x3c;
2431 else if (tabopt == 2) pbits[11] &= 0x7f;
2432
2433 /* Add the POSIX table or its complement into the main table that is
2434 being built and we are done. */
2435
2436 if (local_negate)
2437 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2438 else
2439 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2440
2441 ptr = tempptr + 1;
2442 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2443 continue; /* End of POSIX syntax handling */
2444 }
2445
2446 /* Backslash may introduce a single character, or it may introduce one
2447 of the specials, which just set a flag. The sequence \b is a special
2448 case. Inside a class (and only there) it is treated as backspace.
2449 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2450 to or into the one we are building. We assume they have more than one
2451 character in them, so set class_charcount bigger than one. */
2452
2453 if (c == '\\')
2454 {
2455 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2456 if (*errorcodeptr != 0) goto FAILED;
2457
2458 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2459 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2460 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2461 else if (-c == ESC_Q) /* Handle start of quoted string */
2462 {
2463 if (ptr[1] == '\\' && ptr[2] == 'E')
2464 {
2465 ptr += 2; /* avoid empty string */
2466 }
2467 else inescq = TRUE;
2468 continue;
2469 }
2470
2471 if (c < 0)
2472 {
2473 register const uschar *cbits = cd->cbits;
2474 class_charcount += 2; /* Greater than 1 is what matters */
2475
2476 /* Save time by not doing this in the pre-compile phase. */
2477
2478 if (lengthptr == NULL) switch (-c)
2479 {
2480 case ESC_d:
2481 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2482 continue;
2483
2484 case ESC_D:
2485 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2486 continue;
2487
2488 case ESC_w:
2489 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2490 continue;
2491
2492 case ESC_W:
2493 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2494 continue;
2495
2496 case ESC_s:
2497 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2498 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2499 continue;
2500
2501 case ESC_S:
2502 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2503 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2504 continue;
2505
2506 case ESC_E: /* Perl ignores an orphan \E */
2507 continue;
2508
2509 default: /* Not recognized; fall through */
2510 break; /* Need "default" setting to stop compiler warning. */
2511 }
2512
2513 /* In the pre-compile phase, just do the recognition. */
2514
2515 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2516 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2517
2518 /* We need to deal with \P and \p in both phases. */
2519
2520 #ifdef SUPPORT_UCP
2521 if (-c == ESC_p || -c == ESC_P)
2522 {
2523 BOOL negated;
2524 int pdata;
2525 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2526 if (ptype < 0) goto FAILED;
2527 class_utf8 = TRUE;
2528 *class_utf8data++ = ((-c == ESC_p) != negated)?
2529 XCL_PROP : XCL_NOTPROP;
2530 *class_utf8data++ = ptype;
2531 *class_utf8data++ = pdata;
2532 class_charcount -= 2; /* Not a < 256 character */
2533 continue;
2534 }
2535 #endif
2536 /* Unrecognized escapes are faulted if PCRE is running in its
2537 strict mode. By default, for compatibility with Perl, they are
2538 treated as literals. */
2539
2540 if ((options & PCRE_EXTRA) != 0)
2541 {
2542 *errorcodeptr = ERR7;
2543 goto FAILED;
2544 }
2545
2546 class_charcount -= 2; /* Undo the default count from above */
2547 c = *ptr; /* Get the final character and fall through */
2548 }
2549
2550 /* Fall through if we have a single character (c >= 0). This may be
2551 greater than 256 in UTF-8 mode. */
2552
2553 } /* End of backslash handling */
2554
2555 /* A single character may be followed by '-' to form a range. However,
2556 Perl does not permit ']' to be the end of the range. A '-' character
2557 at the end is treated as a literal. Perl ignores orphaned \E sequences
2558 entirely. The code for handling \Q and \E is messy. */
2559
2560 CHECK_RANGE:
2561 while (ptr[1] == '\\' && ptr[2] == 'E')
2562 {
2563 inescq = FALSE;
2564 ptr += 2;
2565 }
2566
2567 oldptr = ptr;
2568
2569 if (!inescq && ptr[1] == '-')
2570 {
2571 int d;
2572 ptr += 2;
2573 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2574
2575 /* If we hit \Q (not followed by \E) at this point, go into escaped
2576 mode. */
2577
2578 while (*ptr == '\\' && ptr[1] == 'Q')
2579 {
2580 ptr += 2;
2581 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2582 inescq = TRUE;
2583 break;
2584 }
2585
2586 if (*ptr == 0 || (!inescq && *ptr == ']'))
2587 {
2588 ptr = oldptr;
2589 goto LONE_SINGLE_CHARACTER;
2590 }
2591
2592 #ifdef SUPPORT_UTF8
2593 if (utf8)
2594 { /* Braces are required because the */
2595 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2596 }
2597 else
2598 #endif
2599 d = *ptr; /* Not UTF-8 mode */
2600
2601 /* The second part of a range can be a single-character escape, but
2602 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2603 in such circumstances. */
2604
2605 if (!inescq && d == '\\')
2606 {
2607 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2608 if (*errorcodeptr != 0) goto FAILED;
2609
2610 /* \b is backspace; \X is literal X; \R is literal R; any other
2611 special means the '-' was literal */
2612
2613 if (d < 0)
2614 {
2615 if (d == -ESC_b) d = '\b';
2616 else if (d == -ESC_X) d = 'X';
2617 else if (d == -ESC_R) d = 'R'; else
2618 {
2619 ptr = oldptr;
2620 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2621 }
2622 }
2623 }
2624
2625 /* Check that the two values are in the correct order. Optimize
2626 one-character ranges */
2627
2628 if (d < c)
2629 {
2630 *errorcodeptr = ERR8;
2631 goto FAILED;
2632 }
2633
2634 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2635
2636 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2637 matching, we have to use an XCLASS with extra data items. Caseless
2638 matching for characters > 127 is available only if UCP support is
2639 available. */
2640
2641 #ifdef SUPPORT_UTF8
2642 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2643 {
2644 class_utf8 = TRUE;
2645
2646 /* With UCP support, we can find the other case equivalents of
2647 the relevant characters. There may be several ranges. Optimize how
2648 they fit with the basic range. */
2649
2650 #ifdef SUPPORT_UCP
2651 if ((options & PCRE_CASELESS) != 0)
2652 {
2653 unsigned int occ, ocd;
2654 unsigned int cc = c;
2655 unsigned int origd = d;
2656 while (get_othercase_range(&cc, origd, &occ, &ocd))
2657 {
2658 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2659
2660 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2661 { /* if there is overlap, */
2662 c = occ; /* noting that if occ < c */
2663 continue; /* we can't have ocd > d */
2664 } /* because a subrange is */
2665 if (ocd > d && occ <= d + 1) /* always shorter than */
2666 { /* the basic range. */
2667 d = ocd;
2668 continue;
2669 }
2670
2671 if (occ == ocd)
2672 {
2673 *class_utf8data++ = XCL_SINGLE;
2674 }
2675 else
2676 {
2677 *class_utf8data++ = XCL_RANGE;
2678 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2679 }
2680 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2681 }
2682 }
2683 #endif /* SUPPORT_UCP */
2684
2685 /* Now record the original range, possibly modified for UCP caseless
2686 overlapping ranges. */
2687
2688 *class_utf8data++ = XCL_RANGE;
2689 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2690 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2691
2692 /* With UCP support, we are done. Without UCP support, there is no
2693 caseless matching for UTF-8 characters > 127; we can use the bit map
2694 for the smaller ones. */
2695
2696 #ifdef SUPPORT_UCP
2697 continue; /* With next character in the class */
2698 #else
2699 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2700
2701 /* Adjust upper limit and fall through to set up the map */
2702
2703 d = 127;
2704
2705 #endif /* SUPPORT_UCP */
2706 }
2707 #endif /* SUPPORT_UTF8 */
2708
2709 /* We use the bit map for all cases when not in UTF-8 mode; else
2710 ranges that lie entirely within 0-127 when there is UCP support; else
2711 for partial ranges without UCP support. */
2712
2713 class_charcount += d - c + 1;
2714 class_lastchar = d;
2715
2716 /* We can save a bit of time by skipping this in the pre-compile. */
2717
2718 if (lengthptr == NULL) for (; c <= d; c++)
2719 {
2720 classbits[c/8] |= (1 << (c&7));
2721 if ((options & PCRE_CASELESS) != 0)
2722 {
2723 int uc = cd->fcc[c]; /* flip case */
2724 classbits[uc/8] |= (1 << (uc&7));
2725 }
2726 }
2727
2728 continue; /* Go get the next char in the class */
2729 }
2730
2731 /* Handle a lone single character - we can get here for a normal
2732 non-escape char, or after \ that introduces a single character or for an
2733 apparent range that isn't. */
2734
2735 LONE_SINGLE_CHARACTER:
2736
2737 /* Handle a character that cannot go in the bit map */
2738
2739 #ifdef SUPPORT_UTF8
2740 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2741 {
2742 class_utf8 = TRUE;
2743 *class_utf8data++ = XCL_SINGLE;
2744 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2745
2746 #ifdef SUPPORT_UCP
2747 if ((options & PCRE_CASELESS) != 0)
2748 {
2749 unsigned int othercase;
2750 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2751 {
2752 *class_utf8data++ = XCL_SINGLE;
2753 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2754 }
2755 }
2756 #endif /* SUPPORT_UCP */
2757
2758 }
2759 else
2760 #endif /* SUPPORT_UTF8 */
2761
2762 /* Handle a single-byte character */
2763 {
2764 classbits[c/8] |= (1 << (c&7));
2765 if ((options & PCRE_CASELESS) != 0)
2766 {
2767 c = cd->fcc[c]; /* flip case */
2768 classbits[c/8] |= (1 << (c&7));
2769 }
2770 class_charcount++;
2771 class_lastchar = c;
2772 }
2773 }
2774
2775 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2776
2777 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2778
2779 if (c == 0) /* Missing terminating ']' */
2780 {
2781 *errorcodeptr = ERR6;
2782 goto FAILED;
2783 }
2784
2785 /* If class_charcount is 1, we saw precisely one character whose value is
2786 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2787 can optimize the negative case only if there were no characters >= 128
2788 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2789 single-bytes only. This is an historical hangover. Maybe one day we can
2790 tidy these opcodes to handle multi-byte characters.
2791
2792 The optimization throws away the bit map. We turn the item into a
2793 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2794 that OP_NOT does not support multibyte characters. In the positive case, it
2795 can cause firstbyte to be set. Otherwise, there can be no first char if
2796 this item is first, whatever repeat count may follow. In the case of
2797 reqbyte, save the previous value for reinstating. */
2798
2799 #ifdef SUPPORT_UTF8
2800 if (class_charcount == 1 &&
2801 (!utf8 ||
2802 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2803
2804 #else
2805 if (class_charcount == 1)
2806 #endif
2807 {
2808 zeroreqbyte = reqbyte;
2809
2810 /* The OP_NOT opcode works on one-byte characters only. */
2811
2812 if (negate_class)
2813 {
2814 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2815 zerofirstbyte = firstbyte;
2816 *code++ = OP_NOT;
2817 *code++ = class_lastchar;
2818 break;
2819 }
2820
2821 /* For a single, positive character, get the value into mcbuffer, and
2822 then we can handle this with the normal one-character code. */
2823
2824 #ifdef SUPPORT_UTF8
2825 if (utf8 && class_lastchar > 127)
2826 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2827 else
2828 #endif
2829 {
2830 mcbuffer[0] = class_lastchar;
2831 mclength = 1;
2832 }
2833 goto ONE_CHAR;
2834 } /* End of 1-char optimization */
2835
2836 /* The general case - not the one-char optimization. If this is the first
2837 thing in the branch, there can be no first char setting, whatever the
2838 repeat count. Any reqbyte setting must remain unchanged after any kind of
2839 repeat. */
2840
2841 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2842 zerofirstbyte = firstbyte;
2843 zeroreqbyte = reqbyte;
2844
2845 /* If there are characters with values > 255, we have to compile an
2846 extended class, with its own opcode. If there are no characters < 256,
2847 we can omit the bitmap in the actual compiled code. */
2848
2849 #ifdef SUPPORT_UTF8
2850 if (class_utf8)
2851 {
2852 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2853 *code++ = OP_XCLASS;
2854 code += LINK_SIZE;
2855 *code = negate_class? XCL_NOT : 0;
2856
2857 /* If the map is required, move up the extra data to make room for it;
2858 otherwise just move the code pointer to the end of the extra data. */
2859
2860 if (class_charcount > 0)
2861 {
2862 *code++ |= XCL_MAP;
2863 memmove(code + 32, code, class_utf8data - code);
2864 memcpy(code, classbits, 32);
2865 code = class_utf8data + 32;
2866 }
2867 else code = class_utf8data;
2868
2869 /* Now fill in the complete length of the item */
2870
2871 PUT(previous, 1, code - previous);
2872 break; /* End of class handling */
2873 }
2874 #endif
2875
2876 /* If there are no characters > 255, negate the 32-byte map if necessary,
2877 and copy it into the code vector. If this is the first thing in the branch,
2878 there can be no first char setting, whatever the repeat count. Any reqbyte
2879 setting must remain unchanged after any kind of repeat. */
2880
2881 if (negate_class)
2882 {
2883 *code++ = OP_NCLASS;
2884 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2885 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2886 }
2887 else
2888 {
2889 *code++ = OP_CLASS;
2890 memcpy(code, classbits, 32);
2891 }
2892 code += 32;
2893 break;
2894
2895
2896 /* ===================================================================*/
2897 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2898 has been tested above. */
2899
2900 case '{':
2901 if (!is_quantifier) goto NORMAL_CHAR;
2902 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2903 if (*errorcodeptr != 0) goto FAILED;
2904 goto REPEAT;
2905
2906 case '*':
2907 repeat_min = 0;
2908 repeat_max = -1;
2909 goto REPEAT;
2910
2911 case '+':
2912 repeat_min = 1;
2913 repeat_max = -1;
2914 goto REPEAT;
2915
2916 case '?':
2917 repeat_min = 0;
2918 repeat_max = 1;
2919
2920 REPEAT:
2921 if (previous == NULL)
2922 {
2923 *errorcodeptr = ERR9;
2924 goto FAILED;
2925 }
2926
2927 if (repeat_min == 0)
2928 {
2929 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2930 reqbyte = zeroreqbyte; /* Ditto */
2931 }
2932
2933 /* Remember whether this is a variable length repeat */
2934
2935 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2936
2937 op_type = 0; /* Default single-char op codes */
2938 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2939
2940 /* Save start of previous item, in case we have to move it up to make space
2941 for an inserted OP_ONCE for the additional '+' extension. */
2942
2943 tempcode = previous;
2944
2945 /* If the next character is '+', we have a possessive quantifier. This
2946 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2947 If the next character is '?' this is a minimizing repeat, by default,
2948 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2949 repeat type to the non-default. */
2950
2951 if (ptr[1] == '+')
2952 {
2953 repeat_type = 0; /* Force greedy */
2954 possessive_quantifier = TRUE;
2955 ptr++;
2956 }
2957 else if (ptr[1] == '?')
2958 {
2959 repeat_type = greedy_non_default;
2960 ptr++;
2961 }
2962 else repeat_type = greedy_default;
2963
2964 /* If previous was a character match, abolish the item and generate a
2965 repeat item instead. If a char item has a minimum of more than one, ensure
2966 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2967 the first thing in a branch because the x will have gone into firstbyte
2968 instead. */
2969
2970 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2971 {
2972 /* Deal with UTF-8 characters that take up more than one byte. It's
2973 easier to write this out separately than try to macrify it. Use c to
2974 hold the length of the character in bytes, plus 0x80 to flag that it's a
2975 length rather than a small character. */
2976
2977 #ifdef SUPPORT_UTF8
2978 if (utf8 && (code[-1] & 0x80) != 0)
2979 {
2980 uschar *lastchar = code - 1;
2981 while((*lastchar & 0xc0) == 0x80) lastchar--;
2982 c = code - lastchar; /* Length of UTF-8 character */
2983 memcpy(utf8_char, lastchar, c); /* Save the char */
2984 c |= 0x80; /* Flag c as a length */
2985 }
2986 else
2987 #endif
2988
2989 /* Handle the case of a single byte - either with no UTF8 support, or
2990 with UTF-8 disabled, or for a UTF-8 character < 128. */
2991
2992 {
2993 c = code[-1];
2994 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2995 }
2996
2997 /* If the repetition is unlimited, it pays to see if the next thing on
2998 the line is something that cannot possibly match this character. If so,
2999 automatically possessifying this item gains some performance in the case
3000 where the match fails. */
3001
3002 if (!possessive_quantifier &&
3003 repeat_max < 0 &&
3004 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3005 options, cd))
3006 {
3007 repeat_type = 0; /* Force greedy */
3008 possessive_quantifier = TRUE;
3009 }
3010
3011 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3012 }
3013
3014 /* If previous was a single negated character ([^a] or similar), we use
3015 one of the special opcodes, replacing it. The code is shared with single-
3016 character repeats by setting op_type to add a suitable offset into
3017 repeat_type. We can also test for auto-possessification. OP_NOT is
3018 currently used only for single-byte chars. */
3019
3020 else if (*previous == OP_NOT)
3021 {
3022 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3023 c = previous[1];
3024 if (!possessive_quantifier &&
3025 repeat_max < 0 &&
3026 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3027 {
3028 repeat_type = 0; /* Force greedy */
3029 possessive_quantifier = TRUE;
3030 }
3031 goto OUTPUT_SINGLE_REPEAT;
3032 }
3033
3034 /* If previous was a character type match (\d or similar), abolish it and
3035 create a suitable repeat item. The code is shared with single-character
3036 repeats by setting op_type to add a suitable offset into repeat_type. Note
3037 that the Unicode property types will be present only when SUPPORT_UCP is
3038 defined, but we don't wrap the little bits of code here because it just
3039 makes it horribly messy. */
3040
3041 else if (*previous < OP_EODN)
3042 {
3043 uschar *oldcode;
3044 int prop_type, prop_value;
3045 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3046 c = *previous;
3047
3048 if (!possessive_quantifier &&
3049 repeat_max < 0 &&
3050 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3051 {
3052 repeat_type = 0; /* Force greedy */
3053 possessive_quantifier = TRUE;
3054 }
3055
3056 OUTPUT_SINGLE_REPEAT:
3057 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3058 {
3059 prop_type = previous[1];
3060 prop_value = previous[2];
3061 }
3062 else prop_type = prop_value = -1;
3063
3064 oldcode = code;
3065 code = previous; /* Usually overwrite previous item */
3066
3067 /* If the maximum is zero then the minimum must also be zero; Perl allows
3068 this case, so we do too - by simply omitting the item altogether. */
3069
3070 if (repeat_max == 0) goto END_REPEAT;
3071
3072 /* All real repeats make it impossible to handle partial matching (maybe
3073 one day we will be able to remove this restriction). */
3074
3075 if (repeat_max != 1) cd->nopartial = TRUE;
3076
3077 /* Combine the op_type with the repeat_type */
3078
3079 repeat_type += op_type;
3080
3081 /* A minimum of zero is handled either as the special case * or ?, or as
3082 an UPTO, with the maximum given. */
3083
3084 if (repeat_min == 0)
3085 {
3086 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3087 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3088 else
3089 {
3090 *code++ = OP_UPTO + repeat_type;
3091 PUT2INC(code, 0, repeat_max);
3092 }
3093 }
3094
3095 /* A repeat minimum of 1 is optimized into some special cases. If the
3096 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3097 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3098 one less than the maximum. */
3099
3100 else if (repeat_min == 1)
3101 {
3102 if (repeat_max == -1)
3103 *code++ = OP_PLUS + repeat_type;
3104 else
3105 {
3106 code = oldcode; /* leave previous item in place */
3107 if (repeat_max == 1) goto END_REPEAT;
3108 *code++ = OP_UPTO + repeat_type;
3109 PUT2INC(code, 0, repeat_max - 1);
3110 }
3111 }
3112
3113 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3114 handled as an EXACT followed by an UPTO. */
3115
3116 else
3117 {
3118 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3119 PUT2INC(code, 0, repeat_min);
3120
3121 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3122 we have to insert the character for the previous code. For a repeated
3123 Unicode property match, there are two extra bytes that define the
3124 required property. In UTF-8 mode, long characters have their length in
3125 c, with the 0x80 bit as a flag. */
3126
3127 if (repeat_max < 0)
3128 {
3129 #ifdef SUPPORT_UTF8
3130 if (utf8 && c >= 128)
3131 {
3132 memcpy(code, utf8_char, c & 7);
3133 code += c & 7;
3134 }
3135 else
3136 #endif
3137 {
3138 *code++ = c;
3139 if (prop_type >= 0)
3140 {
3141 *code++ = prop_type;
3142 *code++ = prop_value;
3143 }
3144 }
3145 *code++ = OP_STAR + repeat_type;
3146 }
3147
3148 /* Else insert an UPTO if the max is greater than the min, again
3149 preceded by the character, for the previously inserted code. If the
3150 UPTO is just for 1 instance, we can use QUERY instead. */
3151
3152 else if (repeat_max != repeat_min)
3153 {
3154 #ifdef SUPPORT_UTF8
3155 if (utf8 && c >= 128)
3156 {
3157 memcpy(code, utf8_char, c & 7);
3158 code += c & 7;
3159 }
3160 else
3161 #endif
3162 *code++ = c;
3163 if (prop_type >= 0)
3164 {
3165 *code++ = prop_type;
3166 *code++ = prop_value;
3167 }
3168 repeat_max -= repeat_min;
3169
3170 if (repeat_max == 1)
3171 {
3172 *code++ = OP_QUERY + repeat_type;
3173 }
3174 else
3175 {
3176 *code++ = OP_UPTO + repeat_type;
3177 PUT2INC(code, 0, repeat_max);
3178 }
3179 }
3180 }
3181
3182 /* The character or character type itself comes last in all cases. */
3183
3184 #ifdef SUPPORT_UTF8
3185 if (utf8 && c >= 128)
3186 {
3187 memcpy(code, utf8_char, c & 7);
3188 code += c & 7;
3189 }
3190 else
3191 #endif
3192 *code++ = c;
3193
3194 /* For a repeated Unicode property match, there are two extra bytes that
3195 define the required property. */
3196
3197 #ifdef SUPPORT_UCP
3198 if (prop_type >= 0)
3199 {
3200 *code++ = prop_type;
3201 *code++ = prop_value;
3202 }
3203 #endif
3204 }
3205
3206 /* If previous was a character class or a back reference, we put the repeat
3207 stuff after it, but just skip the item if the repeat was {0,0}. */
3208
3209 else if (*previous == OP_CLASS ||
3210 *previous == OP_NCLASS ||
3211 #ifdef SUPPORT_UTF8
3212 *previous == OP_XCLASS ||
3213 #endif
3214 *previous == OP_REF)
3215 {
3216 if (repeat_max == 0)
3217 {
3218 code = previous;
3219 goto END_REPEAT;
3220 }
3221
3222 /* All real repeats make it impossible to handle partial matching (maybe
3223 one day we will be able to remove this restriction). */
3224
3225 if (repeat_max != 1) cd->nopartial = TRUE;
3226
3227 if (repeat_min == 0 && repeat_max == -1)
3228 *code++ = OP_CRSTAR + repeat_type;
3229 else if (repeat_min == 1 && repeat_max == -1)
3230 *code++ = OP_CRPLUS + repeat_type;
3231 else if (repeat_min == 0 && repeat_max == 1)
3232 *code++ = OP_CRQUERY + repeat_type;
3233 else
3234 {
3235 *code++ = OP_CRRANGE + repeat_type;
3236 PUT2INC(code, 0, repeat_min);
3237 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3238 PUT2INC(code, 0, repeat_max);
3239 }
3240 }
3241
3242 /* If previous was a bracket group, we may have to replicate it in certain
3243 cases. */
3244
3245 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3246 *previous == OP_ONCE || *previous == OP_COND)
3247 {
3248 register int i;
3249 int ketoffset = 0;
3250 int len = code - previous;
3251 uschar *bralink = NULL;
3252
3253 /* Repeating a DEFINE group is pointless */
3254
3255 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3256 {
3257 *errorcodeptr = ERR55;
3258 goto FAILED;
3259 }
3260
3261 /* This is a paranoid check to stop integer overflow later on */
3262
3263 if (len > MAX_DUPLENGTH)
3264 {
3265 *errorcodeptr = ERR50;
3266 goto FAILED;
3267 }
3268
3269 /* If the maximum repeat count is unlimited, find the end of the bracket
3270 by scanning through from the start, and compute the offset back to it
3271 from the current code pointer. There may be an OP_OPT setting following
3272 the final KET, so we can't find the end just by going back from the code
3273 pointer. */
3274
3275 if (repeat_max == -1)
3276 {
3277 register uschar *ket = previous;
3278 do ket += GET(ket, 1); while (*ket != OP_KET);
3279 ketoffset = code - ket;
3280 }
3281
3282 /* The case of a zero minimum is special because of the need to stick
3283 OP_BRAZERO in front of it, and because the group appears once in the
3284 data, whereas in other cases it appears the minimum number of times. For
3285 this reason, it is simplest to treat this case separately, as otherwise
3286 the code gets far too messy. There are several special subcases when the
3287 minimum is zero. */
3288
3289 if (repeat_min == 0)
3290 {
3291 /* If the maximum is also zero, we just omit the group from the output
3292 altogether. */
3293
3294 if (repeat_max == 0)
3295 {
3296 code = previous;
3297 goto END_REPEAT;
3298 }
3299
3300 /* If the maximum is 1 or unlimited, we just have to stick in the
3301 BRAZERO and do no more at this point. However, we do need to adjust
3302 any OP_RECURSE calls inside the group that refer to the group itself or
3303 any internal or forward referenced group, because the offset is from
3304 the start of the whole regex. Temporarily terminate the pattern while
3305 doing this. */
3306
3307 if (repeat_max <= 1)
3308 {
3309 *code = OP_END;
3310 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3311 memmove(previous+1, previous, len);
3312 code++;
3313 *previous++ = OP_BRAZERO + repeat_type;
3314 }
3315
3316 /* If the maximum is greater than 1 and limited, we have to replicate
3317 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3318 The first one has to be handled carefully because it's the original
3319 copy, which has to be moved up. The remainder can be handled by code
3320 that is common with the non-zero minimum case below. We have to
3321 adjust the value of repeat_max, since one less copy is required. Once
3322 again, we may have to adjust any OP_RECURSE calls inside the group. */
3323
3324 else
3325 {
3326 int offset;
3327 *code = OP_END;
3328 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3329 memmove(previous + 2 + LINK_SIZE, previous, len);
3330 code += 2 + LINK_SIZE;
3331 *previous++ = OP_BRAZERO + repeat_type;
3332 *previous++ = OP_BRA;
3333
3334 /* We chain together the bracket offset fields that have to be
3335 filled in later when the ends of the brackets are reached. */
3336
3337 offset = (bralink == NULL)? 0 : previous - bralink;
3338 bralink = previous;
3339 PUTINC(previous, 0, offset);
3340 }
3341
3342 repeat_max--;
3343 }
3344
3345 /* If the minimum is greater than zero, replicate the group as many
3346 times as necessary, and adjust the maximum to the number of subsequent
3347 copies that we need. If we set a first char from the group, and didn't
3348 set a required char, copy the latter from the former. If there are any
3349 forward reference subroutine calls in the group, there will be entries on
3350 the workspace list; replicate these with an appropriate increment. */
3351
3352 else
3353 {
3354 if (repeat_min > 1)
3355 {
3356 /* In the pre-compile phase, we don't actually do the replication. We
3357 just adjust the length as if we had. */
3358
3359 if (lengthptr != NULL)
3360 *lengthptr += (repeat_min - 1)*length_prevgroup;
3361
3362 /* This is compiling for real */
3363
3364 else
3365 {
3366 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3367 for (i = 1; i < repeat_min; i++)
3368 {
3369 uschar *hc;
3370 uschar *this_hwm = cd->hwm;
3371 memcpy(code, previous, len);
3372 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3373 {
3374 PUT(cd->hwm, 0, GET(hc, 0) + len);
3375 cd->hwm += LINK_SIZE;
3376 }
3377 save_hwm = this_hwm;
3378 code += len;
3379 }
3380 }
3381 }
3382
3383 if (repeat_max > 0) repeat_max -= repeat_min;
3384 }
3385
3386 /* This code is common to both the zero and non-zero minimum cases. If
3387 the maximum is limited, it replicates the group in a nested fashion,
3388 remembering the bracket starts on a stack. In the case of a zero minimum,
3389 the first one was set up above. In all cases the repeat_max now specifies
3390 the number of additional copies needed. Again, we must remember to
3391 replicate entries on the forward reference list. */
3392
3393 if (repeat_max >= 0)
3394 {
3395 /* In the pre-compile phase, we don't actually do the replication. We
3396 just adjust the length as if we had. For each repetition we must add 1
3397 to the length for BRAZERO and for all but the last repetition we must
3398 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3399
3400 if (lengthptr != NULL && repeat_max > 0)
3401 *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3402 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3403
3404 /* This is compiling for real */
3405
3406 else for (i = repeat_max - 1; i >= 0; i--)
3407 {
3408 uschar *hc;
3409 uschar *this_hwm = cd->hwm;
3410
3411 *code++ = OP_BRAZERO + repeat_type;
3412
3413 /* All but the final copy start a new nesting, maintaining the
3414 chain of brackets outstanding. */
3415
3416 if (i != 0)
3417 {
3418 int offset;
3419 *code++ = OP_BRA;
3420 offset = (bralink == NULL)? 0 : code - bralink;
3421 bralink = code;
3422 PUTINC(code, 0, offset);
3423 }
3424
3425 memcpy(code, previous, len);
3426 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3427 {
3428 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3429 cd->hwm += LINK_SIZE;
3430 }
3431 save_hwm = this_hwm;
3432 code += len;
3433 }
3434
3435 /* Now chain through the pending brackets, and fill in their length
3436 fields (which are holding the chain links pro tem). */
3437
3438 while (bralink != NULL)
3439 {
3440 int oldlinkoffset;
3441 int offset = code - bralink + 1;
3442 uschar *bra = code - offset;
3443 oldlinkoffset = GET(bra, 1);
3444 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3445 *code++ = OP_KET;
3446 PUTINC(code, 0, offset);
3447 PUT(bra, 1, offset);
3448 }
3449 }
3450
3451 /* If the maximum is unlimited, set a repeater in the final copy. We
3452 can't just offset backwards from the current code point, because we
3453 don't know if there's been an options resetting after the ket. The
3454 correct offset was computed above.
3455
3456 Then, when we are doing the actual compile phase, check to see whether
3457 this group is a non-atomic one that could match an empty string. If so,
3458 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3459 that runtime checking can be done. [This check is also applied to
3460 atomic groups at runtime, but in a different way.] */
3461
3462 else
3463 {
3464 uschar *ketcode = code - ketoffset;
3465 uschar *bracode = ketcode - GET(ketcode, 1);
3466 *ketcode = OP_KETRMAX + repeat_type;
3467 if (lengthptr == NULL && *bracode != OP_ONCE)
3468 {
3469 uschar *scode = bracode;
3470 do
3471 {
3472 if (could_be_empty_branch(scode, ketcode, utf8))
3473 {
3474 *bracode += OP_SBRA - OP_BRA;
3475 break;
3476 }
3477 scode += GET(scode, 1);
3478 }
3479 while (*scode == OP_ALT);
3480 }
3481 }
3482 }
3483
3484 /* Else there's some kind of shambles */
3485
3486 else
3487 {
3488 *errorcodeptr = ERR11;
3489 goto FAILED;
3490 }
3491
3492 /* If the character following a repeat is '+', or if certain optimization
3493 tests above succeeded, possessive_quantifier is TRUE. For some of the
3494 simpler opcodes, there is a special alternative opcode for this. For
3495 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3496 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3497 but the special opcodes can optimize it a bit. The repeated item starts at
3498 tempcode, not at previous, which might be the first part of a string whose
3499 (former) last char we repeated.
3500
3501 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3502 an 'upto' may follow. We skip over an 'exact' item, and then test the
3503 length of what remains before proceeding. */
3504
3505 if (possessive_quantifier)
3506 {
3507 int len;
3508 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3509 *tempcode == OP_NOTEXACT)
3510 tempcode += _pcre_OP_lengths[*tempcode];
3511 len = code - tempcode;
3512 if (len > 0) switch (*tempcode)
3513 {
3514 case OP_STAR: *tempcode = OP_POSSTAR; break;
3515 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3516 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3517 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3518
3519 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3520 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3521 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3522 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3523
3524 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3525 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3526 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3527 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3528
3529 default:
3530 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3531 code += 1 + LINK_SIZE;
3532 len += 1 + LINK_SIZE;
3533 tempcode[0] = OP_ONCE;
3534 *code++ = OP_KET;
3535 PUTINC(code, 0, len);
3536 PUT(tempcode, 1, len);
3537 break;
3538 }
3539 }
3540
3541 /* In all cases we no longer have a previous item. We also set the
3542 "follows varying string" flag for subsequently encountered reqbytes if
3543 it isn't already set and we have just passed a varying length item. */
3544
3545 END_REPEAT:
3546 previous = NULL;
3547 cd->req_varyopt |= reqvary;
3548 break;
3549
3550
3551 /* ===================================================================*/
3552 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3553 lookbehind or option setting or condition or all the other extended
3554 parenthesis forms. First deal with the specials; all are introduced by ?,
3555 and the appearance of any of them means that this is not a capturing
3556 group. */
3557
3558 case '(':
3559 newoptions = options;
3560 skipbytes = 0;
3561 bravalue = OP_CBRA;
3562 save_hwm = cd->hwm;
3563
3564 if (*(++ptr) == '?')
3565 {
3566 int i, set, unset, namelen;
3567 int *optset;
3568 const uschar *name;
3569 uschar *slot;
3570
3571 switch (*(++ptr))
3572 {
3573 case '#': /* Comment; skip to ket */
3574 ptr++;
3575 while (*ptr != 0 && *ptr != ')') ptr++;
3576 if (*ptr == 0)
3577 {
3578 *errorcodeptr = ERR18;
3579 goto FAILED;
3580 }
3581 continue;
3582
3583
3584 /* ------------------------------------------------------------ */
3585 case ':': /* Non-capturing bracket */
3586 bravalue = OP_BRA;
3587 ptr++;
3588 break;
3589
3590
3591 /* ------------------------------------------------------------ */
3592 case '(':
3593 bravalue = OP_COND; /* Conditional group */
3594
3595 /* A condition can be an assertion, a number (referring to a numbered
3596 group), a name (referring to a named group), or 'R', referring to
3597 recursion. R<digits> and R&name are also permitted for recursion tests.
3598
3599 There are several syntaxes for testing a named group: (?(name)) is used
3600 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3601
3602 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3603 be the recursive thing or the name 'R' (and similarly for 'R' followed
3604 by digits), and (b) a number could be a name that consists of digits.
3605 In both cases, we look for a name first; if not found, we try the other
3606 cases. */
3607
3608 /* For conditions that are assertions, check the syntax, and then exit
3609 the switch. This will take control down to where bracketed groups,
3610 including assertions, are processed. */
3611
3612 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3613 break;
3614
3615 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3616 below), and all need to skip 3 bytes at the start of the group. */
3617
3618 code[1+LINK_SIZE] = OP_CREF;
3619 skipbytes = 3;
3620
3621 /* Check for a test for recursion in a named group. */
3622
3623 if (ptr[1] == 'R' && ptr[2] == '&')
3624 {
3625 terminator = -1;
3626 ptr += 2;
3627 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3628 }
3629
3630 /* Check for a test for a named group's having been set, using the Perl
3631 syntax (?(<name>) or (?('name') */
3632
3633 else if (ptr[1] == '<')
3634 {
3635 terminator = '>';
3636 ptr++;
3637 }
3638 else if (ptr[1] == '\'')
3639 {
3640 terminator = '\'';
3641 ptr++;
3642 }
3643 else terminator = 0;
3644
3645 /* We now expect to read a name; anything else is an error */
3646
3647 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3648 {
3649 ptr += 1; /* To get the right offset */
3650 *errorcodeptr = ERR28;
3651 goto FAILED;
3652 }
3653
3654 /* Read the name, but also get it as a number if it's all digits */
3655
3656 recno = 0;
3657 name = ++ptr;
3658 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3659 {
3660 if (recno >= 0)
3661 recno = ((digitab[*ptr] & ctype_digit) != 0)?
3662 recno * 10 + *ptr - '0' : -1;
3663 ptr++;
3664 }
3665 namelen = ptr - name;
3666
3667 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3668 {
3669 ptr--; /* Error offset */
3670 *errorcodeptr = ERR26;
3671 goto FAILED;
3672 }
3673
3674 /* Do no further checking in the pre-compile phase. */
3675
3676 if (lengthptr != NULL) break;
3677
3678 /* In the real compile we do the work of looking for the actual
3679 reference. */
3680
3681 slot = cd->name_table;
3682 for (i = 0; i < cd->names_found; i++)
3683 {
3684 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3685 slot += cd->name_entry_size;
3686 }
3687
3688 /* Found a previous named subpattern */
3689
3690 if (i < cd->names_found)
3691 {
3692 recno = GET2(slot, 0);
3693 PUT2(code, 2+LINK_SIZE, recno);
3694 }
3695
3696 /* Search the pattern for a forward reference */
3697
3698 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3699 (options & PCRE_EXTENDED) != 0)) > 0)
3700 {
3701 PUT2(code, 2+LINK_SIZE, i);
3702 }
3703
3704 /* If terminator == 0 it means that the name followed directly after
3705 the opening parenthesis [e.g. (?(abc)...] and in this case there are
3706 some further alternatives to try. For the cases where terminator != 0
3707 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3708 now checked all the possibilities, so give an error. */
3709
3710 else if (terminator != 0)
3711 {
3712 *errorcodeptr = ERR15;
3713 goto FAILED;
3714 }
3715
3716 /* Check for (?(R) for recursion. Allow digits after R to specify a
3717 specific group number. */
3718
3719 else if (*name == 'R')
3720 {
3721 recno = 0;
3722 for (i = 1; i < namelen; i++)
3723 {
3724 if ((digitab[name[i]] & ctype_digit) == 0)
3725 {
3726 *errorcodeptr = ERR15;
3727 goto FAILED;
3728 }
3729 recno = recno * 10 + name[i] - '0';
3730 }
3731 if (recno == 0) recno = RREF_ANY;
3732 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3733 PUT2(code, 2+LINK_SIZE, recno);
3734 }
3735
3736 /* Similarly, check for the (?(DEFINE) "condition", which is always
3737 false. */
3738
3739 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3740 {
3741 code[1+LINK_SIZE] = OP_DEF;
3742 skipbytes = 1;
3743 }
3744
3745 /* Check for the "name" actually being a subpattern number. */
3746
3747 else if (recno > 0)
3748 {
3749 PUT2(code, 2+LINK_SIZE, recno);
3750 }
3751
3752 /* Either an unidentified subpattern, or a reference to (?(0) */
3753
3754 else
3755 {
3756 *errorcodeptr = (recno == 0)? ERR35: ERR15;
3757 goto FAILED;
3758 }
3759 break;
3760
3761
3762 /* ------------------------------------------------------------ */
3763 case '=': /* Positive lookahead */
3764 bravalue = OP_ASSERT;
3765 ptr++;
3766 break;
3767
3768
3769 /* ------------------------------------------------------------ */
3770 case '!': /* Negative lookahead */
3771 bravalue = OP_ASSERT_NOT;
3772 ptr++;
3773 break;
3774
3775
3776 /* ------------------------------------------------------------ */
3777 case '<': /* Lookbehind or named define */
3778 switch (ptr[1])
3779 {
3780 case '=': /* Positive lookbehind */
3781 bravalue = OP_ASSERTBACK;
3782 ptr += 2;
3783 break;
3784
3785 case '!': /* Negative lookbehind */
3786 bravalue = OP_ASSERTBACK_NOT;
3787 ptr += 2;
3788 break;
3789
3790 default: /* Could be name define, else bad */
3791 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3792 ptr++; /* Correct offset for error */
3793 *errorcodeptr = ERR24;
3794 goto FAILED;
3795 }
3796 break;
3797
3798
3799 /* ------------------------------------------------------------ */
3800 case '>': /* One-time brackets */
3801 bravalue = OP_ONCE;
3802 ptr++;
3803 break;
3804
3805
3806 /* ------------------------------------------------------------ */
3807 case 'C': /* Callout - may be followed by digits; */
3808 previous_callout = code; /* Save for later completion */
3809 after_manual_callout = 1; /* Skip one item before completing */
3810 *code++ = OP_CALLOUT;
3811 {
3812 int n = 0;
3813 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3814 n = n * 10 + *ptr - '0';
3815 if (*ptr != ')')
3816 {
3817 *errorcodeptr = ERR39;
3818 goto FAILED;
3819 }
3820 if (n > 255)
3821 {
3822 *errorcodeptr = ERR38;
3823 goto FAILED;
3824 }
3825 *code++ = n;
3826 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3827 PUT(code, LINK_SIZE, 0); /* Default length */
3828 code += 2 * LINK_SIZE;
3829 }
3830 previous = NULL;
3831 continue;
3832
3833
3834 /* ------------------------------------------------------------ */
3835 case 'P': /* Python-style named subpattern handling */
3836 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3837 {
3838 is_recurse = *ptr == '>';
3839 terminator = ')';
3840 goto NAMED_REF_OR_RECURSE;
3841 }
3842 else if (*ptr != '<') /* Test for Python-style definition */
3843 {
3844 *errorcodeptr = ERR41;
3845 goto FAILED;
3846 }
3847 /* Fall through to handle (?P< as (?< is handled */
3848
3849
3850 /* ------------------------------------------------------------ */
3851 DEFINE_NAME: /* Come here from (?< handling */
3852 case '\'':
3853 {
3854 terminator = (*ptr == '<')? '>' : '\'';
3855 name = ++ptr;
3856
3857 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3858 namelen = ptr - name;
3859
3860 /* In the pre-compile phase, just do a syntax check. */
3861
3862 if (lengthptr != NULL)
3863 {
3864 if (*ptr != terminator)
3865 {
3866 *errorcodeptr = ERR42;
3867 goto FAILED;
3868 }
3869 if (cd->names_found >= MAX_NAME_COUNT)
3870 {
3871 *errorcodeptr = ERR49;
3872 goto FAILED;
3873 }
3874 if (namelen + 3 > cd->name_entry_size)
3875 {
3876 cd->name_entry_size = namelen + 3;
3877 if (namelen > MAX_NAME_SIZE)
3878 {
3879 *errorcodeptr = ERR48;
3880 goto FAILED;
3881 }
3882 }
3883 }
3884
3885 /* In the real compile, create the entry in the table */
3886
3887 else
3888 {
3889 slot = cd->name_table;
3890 for (i = 0; i < cd->names_found; i++)
3891 {
3892 int crc = memcmp(name, slot+2, namelen);
3893 if (crc == 0)
3894 {
3895 if (slot[2+namelen] == 0)
3896 {
3897 if ((options & PCRE_DUPNAMES) == 0)
3898 {
3899 *errorcodeptr = ERR43;
3900 goto FAILED;
3901 }
3902 }
3903 else crc = -1; /* Current name is substring */
3904 }
3905 if (crc < 0)
3906 {
3907 memmove(slot + cd->name_entry_size, slot,
3908 (cd->names_found - i) * cd->name_entry_size);
3909 break;
3910 }
3911 slot += cd->name_entry_size;
3912 }
3913
3914 PUT2(slot, 0, cd->bracount + 1);
3915 memcpy(slot + 2, name, namelen);
3916 slot[2+namelen] = 0;
3917 }
3918 }
3919
3920 /* In both cases, count the number of names we've encountered. */
3921
3922 ptr++; /* Move past > or ' */
3923 cd->names_found++;
3924 goto NUMBERED_GROUP;
3925
3926
3927 /* ------------------------------------------------------------ */
3928 case '&': /* Perl recursion/subroutine syntax */
3929 terminator = ')';
3930 is_recurse = TRUE;
3931 /* Fall through */
3932
3933 /* We come here from the Python syntax above that handles both
3934 references (?P=name) and recursion (?P>name), as well as falling
3935 through from the Perl recursion syntax (?&name). */
3936
3937 NAMED_REF_OR_RECURSE:
3938 name = ++ptr;
3939 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3940 namelen = ptr - name;
3941
3942 /* In the pre-compile phase, do a syntax check and set a dummy
3943 reference number. */
3944
3945 if (lengthptr != NULL)
3946 {
3947 if (*ptr != terminator)
3948 {
3949 *errorcodeptr = ERR42;
3950 goto FAILED;
3951 }
3952 if (namelen > MAX_NAME_SIZE)
3953 {
3954 *errorcodeptr = ERR48;
3955 goto FAILED;
3956 }
3957 recno = 0;
3958 }
3959
3960 /* In the real compile, seek the name in the table */
3961
3962 else
3963 {
3964 slot = cd->name_table;
3965 for (i = 0; i < cd->names_found; i++)
3966 {
3967 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3968 slot += cd->name_entry_size;
3969 }
3970
3971 if (i < cd->names_found) /* Back reference */
3972 {
3973 recno = GET2(slot, 0);
3974 }
3975 else if ((recno = /* Forward back reference */
3976 find_parens(ptr, cd->bracount, name, namelen,
3977 (options & PCRE_EXTENDED) != 0)) <= 0)
3978 {
3979 *errorcodeptr = ERR15;
3980 goto FAILED;
3981 }
3982 }
3983
3984 /* In both phases, we can now go to the code that handles numerical
3985 recursion or backreferences. */
3986
3987 if (is_recurse) goto HANDLE_RECURSION;
3988 else goto HANDLE_REFERENCE;
3989
3990
3991 /* ------------------------------------------------------------ */
3992 case 'R': /* Recursion */
3993 ptr++; /* Same as (?0) */
3994 /* Fall through */
3995
3996
3997 /* ------------------------------------------------------------ */
3998 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
3999 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4000 {
4001 const uschar *called;
4002 recno = 0;
4003 while((digitab[*ptr] & ctype_digit) != 0)
4004 recno = recno * 10 + *ptr++ - '0';
4005 if (*ptr != ')')
4006 {
4007 *errorcodeptr = ERR29;
4008 goto FAILED;
4009 }
4010
4011 /* Come here from code above that handles a named recursion */
4012
4013 HANDLE_RECURSION:
4014
4015 previous = code;
4016 called = cd->start_code;
4017
4018 /* When we are actually compiling, find the bracket that is being
4019 referenced. Temporarily end the regex in case it doesn't exist before
4020 this point. If we end up with a forward reference, first check that
4021 the bracket does occur later so we can give the error (and position)
4022 now. Then remember this forward reference in the workspace so it can
4023 be filled in at the end. */
4024
4025 if (lengthptr == NULL)
4026 {
4027 *code = OP_END;
4028 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4029
4030 /* Forward reference */
4031
4032 if (called == NULL)
4033 {
4034 if (find_parens(ptr, cd->bracount, NULL, recno,
4035 (options & PCRE_EXTENDED) != 0) < 0)
4036 {
4037 *errorcodeptr = ERR15;
4038 goto FAILED;
4039 }
4040 called = cd->start_code + recno;
4041 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4042 }
4043
4044 /* If not a forward reference, and the subpattern is still open,
4045 this is a recursive call. We check to see if this is a left
4046 recursion that could loop for ever, and diagnose that case. */
4047
4048 else if (GET(called, 1) == 0 &&
4049 could_be_empty(called, code, bcptr, utf8))
4050 {
4051 *errorcodeptr = ERR40;
4052 goto FAILED;
4053 }
4054 }
4055
4056 /* Insert the recursion/subroutine item, automatically wrapped inside
4057 "once" brackets. Set up a "previous group" length so that a
4058 subsequent quantifier will work. */
4059
4060 *code = OP_ONCE;
4061 PUT(code, 1, 2 + 2*LINK_SIZE);
4062 code += 1 + LINK_SIZE;
4063
4064 *code = OP_RECURSE;
4065 PUT(code, 1, called - cd->start_code);
4066 code += 1 + LINK_SIZE;
4067
4068 *code = OP_KET;
4069 PUT(code, 1, 2 + 2*LINK_SIZE);
4070 code += 1 + LINK_SIZE;
4071
4072 length_prevgroup = 3 + 3*LINK_SIZE;
4073 }
4074
4075 /* Can't determine a first byte now */
4076
4077 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4078 continue;
4079
4080
4081 /* ------------------------------------------------------------ */
4082 default: /* Other characters: check option setting */
4083 set = unset = 0;
4084 optset = &set;
4085
4086 while (*ptr != ')' && *ptr != ':')
4087 {
4088 switch (*ptr++)
4089 {
4090 case '-': optset = &unset; break;
4091
4092 case 'J': /* Record that it changed in the external options */
4093 *optset |= PCRE_DUPNAMES;
4094 cd->external_options |= PCRE_JCHANGED;
4095 break;
4096
4097 case 'i': *optset |= PCRE_CASELESS; break;
4098 case 'm': *optset |= PCRE_MULTILINE; break;
4099 case 's': *optset |= PCRE_DOTALL; break;
4100 case 'x': *optset |= PCRE_EXTENDED; break;
4101 case 'U': *optset |= PCRE_UNGREEDY; break;
4102 case 'X': *optset |= PCRE_EXTRA; break;
4103
4104 default: *errorcodeptr = ERR12;
4105 ptr--; /* Correct the offset */
4106 goto FAILED;
4107 }
4108 }
4109
4110 /* Set up the changed option bits, but don't change anything yet. */
4111
4112 newoptions = (options | set) & (~unset);
4113
4114 /* If the options ended with ')' this is not the start of a nested
4115 group with option changes, so the options change at this level. If this
4116 item is right at the start of the pattern, the options can be
4117 abstracted and made external in the pre-compile phase, and ignored in
4118 the compile phase. This can be helpful when matching -- for instance in
4119 caseless checking of required bytes.
4120
4121 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4122 definitely *not* at the start of the pattern because something has been
4123 compiled. In the pre-compile phase, however, the code pointer can have
4124 that value after the start, because it gets reset as code is discarded
4125 during the pre-compile. However, this can happen only at top level - if
4126 we are within parentheses, the starting BRA will still be present. At
4127 any parenthesis level, the length value can be used to test if anything
4128 has been compiled at that level. Thus, a test for both these conditions
4129 is necessary to ensure we correctly detect the start of the pattern in
4130 both phases.
4131
4132 If we are not at the pattern start, compile code to change the ims
4133 options if this setting actually changes any of them. We also pass the
4134 new setting back so that it can be put at the start of any following
4135 branches, and when this group ends (if we are in a group), a resetting
4136 item can be compiled. */
4137
4138 if (*ptr == ')')
4139 {
4140 if (code == cd->start_code + 1 + LINK_SIZE &&
4141 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4142 {
4143 cd->external_options = newoptions;
4144 options = newoptions;
4145 }
4146 else
4147 {
4148 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4149 {
4150 *code++ = OP_OPT;
4151 *code++ = newoptions & PCRE_IMS;
4152 }
4153
4154 /* Change options at this level, and pass them back for use
4155 in subsequent branches. Reset the greedy defaults and the case
4156 value for firstbyte and reqbyte. */
4157
4158 *optionsptr = options = newoptions;
4159 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4160 greedy_non_default = greedy_default ^ 1;
4161 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4162 }
4163
4164 previous = NULL; /* This item can't be repeated */
4165 continue; /* It is complete */
4166 }
4167
4168 /* If the options ended with ':' we are heading into a nested group
4169 with possible change of options. Such groups are non-capturing and are
4170 not assertions of any kind. All we need to do is skip over the ':';
4171 the newoptions value is handled below. */
4172
4173 bravalue = OP_BRA;
4174 ptr++;
4175 } /* End of switch for character following (? */
4176 } /* End of (? handling */
4177
4178 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4179 all unadorned brackets become non-capturing and behave like (?:...)
4180 brackets. */
4181
4182 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4183 {
4184 bravalue = OP_BRA;
4185 }
4186
4187 /* Else we have a capturing group. */
4188
4189 else
4190 {
4191 NUMBERED_GROUP:
4192 cd->bracount += 1;
4193 PUT2(code, 1+LINK_SIZE, cd->bracount);
4194 skipbytes = 2;
4195 }
4196
4197 /* Process nested bracketed regex. Assertions may not be repeated, but
4198 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4199 non-register variable in order to be able to pass its address because some
4200 compilers complain otherwise. Pass in a new setting for the ims options if
4201 they have changed. */
4202
4203 previous = (bravalue >= OP_ONCE)? code : NULL;
4204 *code = bravalue;
4205 tempcode = code;
4206 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4207 length_prevgroup = 0; /* Initialize for pre-compile phase */
4208
4209 if (!compile_regex(
4210 newoptions, /* The complete new option state */
4211 options & PCRE_IMS, /* The previous ims option state */
4212 &tempcode, /* Where to put code (updated) */
4213 &ptr, /* Input pointer (updated) */
4214 errorcodeptr, /* Where to put an error message */
4215 (bravalue == OP_ASSERTBACK ||
4216 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4217 skipbytes, /* Skip over bracket number */
4218 &subfirstbyte, /* For possible first char */
4219 &subreqbyte, /* For possible last char */
4220 bcptr, /* Current branch chain */
4221 cd, /* Tables block */
4222 (lengthptr == NULL)? NULL : /* Actual compile phase */
4223 &length_prevgroup /* Pre-compile phase */
4224 ))
4225 goto FAILED;
4226
4227 /* At the end of compiling, code is still pointing to the start of the
4228 group, while tempcode has been updated to point past the end of the group
4229 and any option resetting that may follow it. The pattern pointer (ptr)
4230 is on the bracket. */
4231
4232 /* If this is a conditional bracket, check that there are no more than
4233 two branches in the group, or just one if it's a DEFINE group. */
4234
4235 if (bravalue == OP_COND)
4236 {
4237 uschar *tc = code;
4238 int condcount = 0;
4239
4240 do {
4241 condcount++;
4242 tc += GET(tc,1);
4243 }
4244 while (*tc != OP_KET);
4245
4246 /* A DEFINE group is never obeyed inline (the "condition" is always
4247 false). It must have only one branch. */
4248
4249 if (code[LINK_SIZE+1] == OP_DEF)
4250 {
4251 if (condcount > 1)
4252 {
4253 *errorcodeptr = ERR54;
4254 goto FAILED;
4255 }
4256 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4257 }
4258
4259 /* A "normal" conditional group. If there is just one branch, we must not
4260 make use of its firstbyte or reqbyte, because this is equivalent to an
4261 empty second branch. */
4262
4263 else
4264 {
4265 if (condcount > 2)
4266 {
4267 *errorcodeptr = ERR27;
4268 goto FAILED;
4269 }
4270 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4271 }
4272 }
4273
4274 /* Error if hit end of pattern */
4275
4276 if (*ptr != ')')
4277 {
4278 *errorcodeptr = ERR14;
4279 goto FAILED;
4280 }
4281
4282 /* In the pre-compile phase, update the length by the length of the nested
4283 group, less the brackets at either end. Then reduce the compiled code to
4284 just the brackets so that it doesn't use much memory if it is duplicated by
4285 a quantifier. */
4286
4287 if (lengthptr != NULL)
4288 {
4289 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4290 code++;
4291 PUTINC(code, 0, 1 + LINK_SIZE);
4292 *code++ = OP_KET;
4293 PUTINC(code, 0, 1 + LINK_SIZE);
4294 }
4295
4296 /* Otherwise update the main code pointer to the end of the group. */
4297
4298 else code = tempcode;
4299
4300 /* For a DEFINE group, required and first character settings are not
4301 relevant. */
4302
4303 if (bravalue == OP_DEF) break;
4304
4305 /* Handle updating of the required and first characters for other types of
4306 group. Update for normal brackets of all kinds, and conditions with two
4307 branches (see code above). If the bracket is followed by a quantifier with
4308 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4309 zerofirstbyte outside the main loop so that they can be accessed for the
4310 back off. */
4311
4312 zeroreqbyte = reqbyte;
4313 zerofirstbyte = firstbyte;
4314 groupsetfirstbyte = FALSE;
4315
4316 if (bravalue >= OP_ONCE)
4317 {
4318 /* If we have not yet set a firstbyte in this branch, take it from the
4319 subpattern, remembering that it was set here so that a repeat of more
4320 than one can replicate it as reqbyte if necessary. If the subpattern has
4321 no firstbyte, set "none" for the whole branch. In both cases, a zero
4322 repeat forces firstbyte to "none". */
4323
4324 if (firstbyte == REQ_UNSET)
4325 {
4326 if (subfirstbyte >= 0)
4327 {
4328 firstbyte = subfirstbyte;
4329 groupsetfirstbyte = TRUE;
4330 }
4331 else firstbyte = REQ_NONE;
4332 zerofirstbyte = REQ_NONE;
4333 }
4334
4335 /* If firstbyte was previously set, convert the subpattern's firstbyte
4336 into reqbyte if there wasn't one, using the vary flag that was in
4337 existence beforehand. */
4338
4339 else if (subfirstbyte >= 0 && subreqbyte < 0)
4340 subreqbyte = subfirstbyte | tempreqvary;
4341
4342 /* If the subpattern set a required byte (or set a first byte that isn't
4343 really the first byte - see above), set it. */
4344
4345 if (subreqbyte >= 0) reqbyte = subreqbyte;
4346 }
4347
4348 /* For a forward assertion, we take the reqbyte, if set. This can be
4349 helpful if the pattern that follows the assertion doesn't set a different
4350 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4351 for an assertion, however because it leads to incorrect effect for patterns
4352 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4353 of a firstbyte. This is overcome by a scan at the end if there's no
4354 firstbyte, looking for an asserted first char. */
4355
4356 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4357 break; /* End of processing '(' */
4358
4359
4360 /* ===================================================================*/
4361 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4362 are arranged to be the negation of the corresponding OP_values. For the
4363 back references, the values are ESC_REF plus the reference number. Only
4364 back references and those types that consume a character may be repeated.
4365 We can test for values between ESC_b and ESC_Z for the latter; this may
4366 have to change if any new ones are ever created. */
4367
4368 case '\\':
4369 tempptr = ptr;
4370 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4371 if (*errorcodeptr != 0) goto FAILED;
4372
4373 if (c < 0)
4374 {
4375 if (-c == ESC_Q) /* Handle start of quoted string */
4376 {
4377 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4378 else inescq = TRUE;
4379 continue;
4380 }
4381
4382 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4383
4384 /* For metasequences that actually match a character, we disable the
4385 setting of a first character if it hasn't already been set. */
4386
4387 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4388 firstbyte = REQ_NONE;
4389
4390 /* Set values to reset to if this is followed by a zero repeat. */
4391
4392 zerofirstbyte = firstbyte;
4393 zeroreqbyte = reqbyte;
4394
4395 /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
4396
4397 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
4398 {
4399 is_recurse = FALSE;
4400 terminator = (*(++ptr) == '<')? '>' : '\'';
4401 goto NAMED_REF_OR_RECURSE;
4402 }
4403
4404 /* Back references are handled specially; must disable firstbyte if
4405 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4406 ':' later. */
4407
4408 if (-c >= ESC_REF)
4409 {
4410 recno = -c - ESC_REF;
4411
4412 HANDLE_REFERENCE: /* Come here from named backref handling */
4413 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4414 previous = code;
4415 *code++ = OP_REF;
4416 PUT2INC(code, 0, recno);
4417 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4418 if (recno > cd->top_backref) cd->top_backref = recno;
4419 }
4420
4421 /* So are Unicode property matches, if supported. */
4422
4423 #ifdef SUPPORT_UCP
4424 else if (-c == ESC_P || -c == ESC_p)
4425 {
4426 BOOL negated;
4427 int pdata;
4428 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4429 if (ptype < 0) goto FAILED;
4430 previous = code;
4431 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4432 *code++ = ptype;
4433 *code++ = pdata;
4434 }
4435 #else
4436
4437 /* If Unicode properties are not supported, \X, \P, and \p are not
4438 allowed. */
4439
4440 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4441 {
4442 *errorcodeptr = ERR45;
4443 goto FAILED;
4444 }
4445 #endif
4446
4447 /* For the rest (including \X when Unicode properties are supported), we
4448 can obtain the OP value by negating the escape value. */
4449
4450 else
4451 {
4452 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4453 *code++ = -c;
4454 }
4455 continue;
4456 }
4457
4458 /* We have a data character whose value is in c. In UTF-8 mode it may have
4459 a value > 127. We set its representation in the length/buffer, and then
4460 handle it as a data character. */
4461
4462 #ifdef SUPPORT_UTF8
4463 if (utf8 && c > 127)
4464 mclength = _pcre_ord2utf8(c, mcbuffer);
4465 else
4466 #endif
4467
4468 {
4469 mcbuffer[0] = c;
4470 mclength = 1;
4471 }
4472 goto ONE_CHAR;
4473
4474
4475 /* ===================================================================*/
4476 /* Handle a literal character. It is guaranteed not to be whitespace or #
4477 when the extended flag is set. If we are in UTF-8 mode, it may be a
4478 multi-byte literal character. */
4479
4480 default:
4481 NORMAL_CHAR:
4482 mclength = 1;
4483 mcbuffer[0] = c;
4484
4485 #ifdef SUPPORT_UTF8
4486 if (utf8 && c >= 0xc0)
4487 {
4488 while ((ptr[1] & 0xc0) == 0x80)
4489 mcbuffer[mclength++] = *(++ptr);
4490 }
4491 #endif
4492
4493 /* At this point we have the character's bytes in mcbuffer, and the length
4494 in mclength. When not in UTF-8 mode, the length is always 1. */
4495
4496 ONE_CHAR:
4497 previous = code;
4498 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4499 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4500
4501 /* Set the first and required bytes appropriately. If no previous first
4502 byte, set it from this character, but revert to none on a zero repeat.
4503 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4504 repeat. */
4505
4506 if (firstbyte == REQ_UNSET)
4507 {
4508 zerofirstbyte = REQ_NONE;
4509 zeroreqbyte = reqbyte;
4510
4511 /* If the character is more than one byte long, we can set firstbyte
4512 only if it is not to be matched caselessly. */
4513
4514 if (mclength == 1 || req_caseopt == 0)
4515 {
4516 firstbyte = mcbuffer[0] | req_caseopt;
4517 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4518 }
4519 else firstbyte = reqbyte = REQ_NONE;
4520 }
4521
4522 /* firstbyte was previously set; we can set reqbyte only the length is
4523 1 or the matching is caseful. */
4524
4525 else
4526 {
4527 zerofirstbyte = firstbyte;
4528 zeroreqbyte = reqbyte;
4529 if (mclength == 1 || req_caseopt == 0)
4530 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4531 }
4532
4533 break; /* End of literal character handling */
4534 }
4535 } /* end of big loop */
4536
4537
4538 /* Control never reaches here by falling through, only by a goto for all the
4539 error states. Pass back the position in the pattern so that it can be displayed
4540 to the user for diagnosing the error. */
4541
4542 FAILED:
4543 *ptrptr = ptr;
4544 return FALSE;
4545 }
4546
4547
4548
4549
4550 /*************************************************
4551 * Compile sequence of alternatives *
4552 *************************************************/
4553
4554 /* On entry, ptr is pointing past the bracket character, but on return it
4555 points to the closing bracket, or vertical bar, or end of string. The code
4556 variable is pointing at the byte into which the BRA operator has been stored.
4557 If the ims options are changed at the start (for a (?ims: group) or during any
4558 branch, we need to insert an OP_OPT item at the start of every following branch
4559 to ensure they get set correctly at run time, and also pass the new options
4560 into every subsequent branch compile.
4561
4562 This function is used during the pre-compile phase when we are trying to find
4563 out the amount of memory needed, as well as during the real compile phase. The
4564 value of lengthptr distinguishes the two phases.
4565
4566 Argument:
4567 options option bits, including any changes for this subpattern
4568 oldims previous settings of ims option bits
4569 codeptr -> the address of the current code pointer
4570 ptrptr -> the address of the current pattern pointer
4571 errorcodeptr -> pointer to error code variable
4572 lookbehind TRUE if this is a lookbehind assertion
4573 skipbytes skip this many bytes at start (for brackets and OP_COND)
4574 firstbyteptr place to put the first required character, or a negative number
4575 reqbyteptr place to put the last required character, or a negative number
4576 bcptr pointer to the chain of currently open branches
4577 cd points to the data block with tables pointers etc.
4578 lengthptr NULL during the real compile phase
4579 points to length accumulator during pre-compile phase
4580
4581 Returns: TRUE on success
4582 */
4583
4584 static BOOL
4585 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4586 int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4587 int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4588 {
4589 const uschar *ptr = *ptrptr;
4590 uschar *code = *codeptr;
4591 uschar *last_branch = code;
4592 uschar *start_bracket = code;
4593 uschar *reverse_count = NULL;
4594 int firstbyte, reqbyte;
4595 int branchfirstbyte, branchreqbyte;
4596 int length;
4597 branch_chain bc;
4598
4599 bc.outer = bcptr;
4600 bc.current = code;
4601
4602 firstbyte = reqbyte = REQ_UNSET;
4603
4604 /* Accumulate the length for use in the pre-compile phase. Start with the
4605 length of the BRA and KET and any extra bytes that are required at the
4606 beginning. We accumulate in a local variable to save frequent testing of
4607 lenthptr for NULL. We cannot do this by looking at the value of code at the
4608 start and end of each alternative, because compiled items are discarded during
4609 the pre-compile phase so that the work space is not exceeded. */
4610
4611 length = 2 + 2*LINK_SIZE + skipbytes;
4612
4613 /* WARNING: If the above line is changed for any reason, you must also change
4614 the code that abstracts option settings at the start of the pattern and makes
4615 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4616 pre-compile phase to find out whether anything has yet been compiled or not. */
4617
4618 /* Offset is set zero to mark that this bracket is still open */
4619
4620 PUT(code, 1, 0);
4621 code += 1 + LINK_SIZE + skipbytes;
4622
4623 /* Loop for each alternative branch */
4624
4625 for (;;)
4626 {
4627 /* Handle a change of ims options at the start of the branch */
4628
4629 if ((options & PCRE_IMS) != oldims)
4630 {
4631 *code++ = OP_OPT;
4632 *code++ = options & PCRE_IMS;
4633 length += 2;
4634 }
4635
4636 /* Set up dummy OP_REVERSE if lookbehind assertion */
4637
4638 if (lookbehind)
4639 {
4640 *code++ = OP_REVERSE;
4641 reverse_count = code;
4642 PUTINC(code, 0, 0);
4643 length += 1 + LINK_SIZE;
4644 }
4645
4646 /* Now compile the branch; in the pre-compile phase its length gets added
4647 into the length. */
4648
4649 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4650 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4651 {
4652 *ptrptr = ptr;
4653 return FALSE;
4654 }
4655
4656 /* In the real compile phase, there is some post-processing to be done. */
4657
4658 if (lengthptr == NULL)
4659 {
4660 /* If this is the first branch, the firstbyte and reqbyte values for the
4661 branch become the values for the regex. */
4662
4663 if (*last_branch != OP_ALT)
4664 {
4665 firstbyte = branchfirstbyte;
4666 reqbyte = branchreqbyte;
4667 }
4668
4669 /* If this is not the first branch, the first char and reqbyte have to
4670 match the values from all the previous branches, except that if the
4671 previous value for reqbyte didn't have REQ_VARY set, it can still match,
4672 and we set REQ_VARY for the regex. */
4673
4674 else
4675 {
4676 /* If we previously had a firstbyte, but it doesn't match the new branch,
4677 we have to abandon the firstbyte for the regex, but if there was
4678 previously no reqbyte, it takes on the value of the old firstbyte. */
4679
4680 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4681 {
4682 if (reqbyte < 0) reqbyte = firstbyte;
4683 firstbyte = REQ_NONE;
4684 }
4685
4686 /* If we (now or from before) have no firstbyte, a firstbyte from the
4687 branch becomes a reqbyte if there isn't a branch reqbyte. */
4688
4689 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4690 branchreqbyte = branchfirstbyte;
4691
4692 /* Now ensure that the reqbytes match */
4693
4694 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4695 reqbyte = REQ_NONE;
4696 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4697 }
4698
4699 /* If lookbehind, check that this branch matches a fixed-length string, and
4700 put the length into the OP_REVERSE item. Temporarily mark the end of the
4701 branch with OP_END. */
4702
4703 if (lookbehind)
4704 {
4705 int fixed_length;
4706 *code = OP_END;
4707 fixed_length = find_fixedlength(last_branch, options);
4708 DPRINTF(("fixed length = %d\n", fixed_length));
4709 if (fixed_length < 0)
4710 {
4711 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4712 *ptrptr = ptr;
4713 return FALSE;
4714 }
4715 PUT(reverse_count, 0, fixed_length);
4716 }
4717 }
4718
4719 /* Reached end of expression, either ')' or end of pattern. Go back through
4720 the alternative branches and reverse the chain of offsets, with the field in
4721 the BRA item now becoming an offset to the first alternative. If there are
4722 no alternatives, it points to the end of the group. The length in the
4723 terminating ket is always the length of the whole bracketed item. If any of
4724 the ims options were changed inside the group, compile a resetting op-code
4725 following, except at the very end of the pattern. Return leaving the pointer
4726 at the terminating char. */
4727
4728 if (*ptr != '|')
4729 {
4730 int branch_length = code - last_branch;
4731 do
4732 {
4733 int prev_length = GET(last_branch, 1);
4734 PUT(last_branch, 1, branch_length);
4735 branch_length = prev_length;
4736 last_branch -= branch_length;
4737 }
4738 while (branch_length > 0);
4739
4740 /* Fill in the ket */
4741
4742 *code = OP_KET;
4743 PUT(code, 1, code - start_bracket);
4744 code += 1 + LINK_SIZE;
4745
4746 /* Resetting option if needed */
4747
4748 if ((options & PCRE_IMS) != oldims && *ptr == ')')
4749 {
4750 *code++ = OP_OPT;
4751 *code++ = oldims;
4752 length += 2;
4753 }
4754
4755 /* Set values to pass back */
4756
4757 *codeptr = code;
4758 *ptrptr = ptr;
4759 *firstbyteptr = firstbyte;
4760 *reqbyteptr = reqbyte;
4761 if (lengthptr != NULL) *lengthptr += length;
4762 return TRUE;
4763 }
4764
4765 /* Another branch follows; insert an "or" node. Its length field points back
4766 to the previous branch while the bracket remains open. At the end the chain
4767 is reversed. It's done like this so that the start of the bracket has a
4768 zero offset until it is closed, making it possible to detect recursion. */
4769
4770 *code = OP_ALT;
4771 PUT(code, 1, code - last_branch);
4772 bc.current = last_branch = code;
4773 code += 1 + LINK_SIZE;
4774 ptr++;
4775 length += 1 + LINK_SIZE;
4776 }
4777 /* Control never reaches here */
4778 }
4779
4780
4781
4782
4783 /*************************************************
4784 * Check for anchored expression *
4785 *************************************************/
4786
4787 /* Try to find out if this is an anchored regular expression. Consider each
4788 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4789 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4790 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4791 counts, since OP_CIRC can match in the middle.
4792
4793 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4794 This is the code for \G, which means "match at start of match position, taking
4795 into account the match offset".
4796
4797 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4798 because that will try the rest of the pattern at all possible matching points,
4799 so there is no point trying again.... er ....
4800
4801 .... except when the .* appears inside capturing parentheses, and there is a
4802 subsequent back reference to those parentheses. We haven't enough information
4803 to catch that case precisely.
4804
4805 At first, the best we could do was to detect when .* was in capturing brackets
4806 and the highest back reference was greater than or equal to that level.
4807 However, by keeping a bitmap of the first 31 back references, we can catch some
4808 of the more common cases more precisely.
4809
4810 Arguments:
4811 code points to start of expression (the bracket)
4812 options points to the options setting
4813 bracket_map a bitmap of which brackets we are inside while testing; this
4814 handles up to substring 31; after that we just have to take
4815 the less precise approach
4816 backref_map the back reference bitmap
4817
4818 Returns: TRUE or FALSE
4819 */
4820
4821 static BOOL
4822 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4823 unsigned int backref_map)
4824 {
4825 do {
4826 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4827 options, PCRE_MULTILINE, FALSE);
4828 register int op = *scode;
4829
4830 /* Non-capturing brackets */
4831
4832 if (op == OP_BRA)
4833 {
4834 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4835 }
4836
4837 /* Capturing brackets */
4838
4839 else if (op == OP_CBRA)
4840 {
4841 int n = GET2(scode, 1+LINK_SIZE);
4842 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4843 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4844 }
4845
4846 /* Other brackets */
4847
4848 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4849 {
4850 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4851 }
4852
4853 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4854 are or may be referenced. */
4855
4856 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4857 op == OP_TYPEPOSSTAR) &&
4858 (*options & PCRE_DOTALL) != 0)
4859 {
4860 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4861 }
4862
4863 /* Check for explicit anchoring */
4864
4865 else if (op != OP_SOD && op != OP_SOM &&
4866 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4867 return FALSE;
4868 code += GET(code, 1);
4869 }
4870 while (*code == OP_ALT); /* Loop for each alternative */
4871 return TRUE;
4872 }
4873
4874
4875
4876 /*************************************************
4877 * Check for starting with ^ or .* *
4878 *************************************************/
4879
4880 /* This is called to find out if every branch starts with ^ or .* so that
4881 "first char" processing can be done to speed things up in multiline
4882 matching and for non-DOTALL patterns that start with .* (which must start at
4883 the beginning or after \n). As in the case of is_anchored() (see above), we
4884 have to take account of back references to capturing brackets that contain .*
4885 because in that case we can't make the assumption.
4886
4887 Arguments:
4888 code points to start of expression (the bracket)
4889 bracket_map a bitmap of which brackets we are inside while testing; this
4890 handles up to substring 31; after that we just have to take
4891 the less precise approach
4892 backref_map the back reference bitmap
4893
4894 Returns: TRUE or FALSE
4895 */
4896
4897 static BOOL
4898 is_startline(const uschar *code, unsigned int bracket_map,
4899 unsigned int backref_map)
4900 {
4901 do {
4902 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4903 NULL, 0, FALSE);
4904 register int op = *scode;
4905
4906 /* Non-capturing brackets */
4907
4908 if (op == OP_BRA)
4909 {
4910 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
4911 }
4912
4913 /* Capturing brackets */
4914
4915 else if (op == OP_CBRA)
4916 {
4917 int n = GET2(scode, 1+LINK_SIZE);
4918 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4919 if (!is_startline(scode, new_map, backref_map)) return FALSE;
4920 }
4921
4922 /* Other brackets */
4923
4924 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4925 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4926
4927 /* .* means "start at start or after \n" if it isn't in brackets that
4928 may be referenced. */
4929
4930 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
4931 {
4932 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4933 }
4934
4935 /* Check for explicit circumflex */
4936
4937 else if (op != OP_CIRC) return FALSE;
4938
4939 /* Move on to the next alternative */
4940
4941 code += GET(code, 1);
4942 }
4943 while (*code == OP_ALT); /* Loop for each alternative */
4944 return TRUE;
4945 }
4946
4947
4948
4949 /*************************************************
4950 * Check for asserted fixed first char *
4951 *************************************************/
4952
4953 /* During compilation, the "first char" settings from forward assertions are
4954 discarded, because they can cause conflicts with actual literals that follow.
4955 However, if we end up without a first char setting for an unanchored pattern,
4956 it is worth scanning the regex to see if there is an initial asserted first
4957 char. If all branches start with the same asserted char, or with a bracket all
4958 of whose alternatives start with the same asserted char (recurse ad lib), then
4959 we return that char, otherwise -1.
4960
4961 Arguments:
4962 code points to start of expression (the bracket)
4963 options pointer to the options (used to check casing changes)
4964 inassert TRUE if in an assertion
4965
4966 Returns: -1 or the fixed first char
4967 */
4968
4969 static int
4970 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4971 {
4972 register int c = -1;
4973 do {
4974 int d;
4975 const uschar *scode =
4976 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4977 register int op = *scode;
4978
4979 switch(op)
4980 {
4981 default:
4982 return -1;
4983
4984 case OP_BRA:
4985 case OP_CBRA:
4986 case OP_ASSERT:
4987 case OP_ONCE:
4988 case OP_COND:
4989 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4990 return -1;
4991 if (c < 0) c = d; else if (c != d) return -1;
4992 break;
4993
4994 case OP_EXACT: /* Fall through */
4995 scode += 2;
4996
4997 case OP_CHAR:
4998 case OP_CHARNC:
4999 case OP_PLUS:
5000 case OP_MINPLUS:
5001 case OP_POSPLUS:
5002 if (!inassert) return -1;
5003 if (c < 0)
5004 {
5005 c = scode[1];
5006 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5007 }
5008 else if (c != scode[1]) return -1;
5009 break;
5010 }
5011
5012 code += GET(code, 1);
5013 }
5014 while (*code == OP_ALT);
5015 return c;
5016 }
5017
5018
5019
5020 /*************************************************
5021 * Compile a Regular Expression *
5022 *************************************************/
5023
5024 /* This function takes a string and returns a pointer to a block of store
5025 holding a compiled version of the expression. The original API for this
5026 function had no error code return variable; it is retained for backwards
5027 compatibility. The new function is given a new name.
5028
5029 Arguments:
5030 pattern the regular expression
5031 options various option bits
5032 errorcodeptr pointer to error code variable (pcre_compile2() only)
5033 can be NULL if you don't want a code value
5034 errorptr pointer to pointer to error text
5035 erroroffset ptr offset in pattern where error was detected
5036 tables pointer to character tables or NULL
5037
5038 Returns: pointer to compiled data block, or NULL on error,
5039 with errorptr and erroroffset set
5040 */
5041
5042 PCRE_DATA_SCOPE pcre *
5043 pcre_compile(const char *pattern, int options, const char **errorptr,
5044 int *erroroffset, const unsigned char *tables)
5045 {
5046 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5047 }
5048
5049
5050 PCRE_DATA_SCOPE pcre *
5051 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5052 const char **errorptr, int *erroroffset, const unsigned char *tables)
5053 {
5054 real_pcre *re;
5055 int length = 1; /* For final END opcode */
5056 int firstbyte, reqbyte, newline;
5057 int errorcode = 0;
5058 #ifdef SUPPORT_UTF8
5059 BOOL utf8;
5060 #endif
5061 size_t size;
5062 uschar *code;
5063 const uschar *codestart;
5064 const uschar *ptr;
5065 compile_data compile_block;
5066 compile_data *cd = &compile_block;
5067
5068 /* This space is used for "compiling" into during the first phase, when we are
5069 computing the amount of memory that is needed. Compiled items are thrown away
5070 as soon as possible, so that a fairly large buffer should be sufficient for
5071 this purpose. The same space is used in the second phase for remembering where
5072 to fill in forward references to subpatterns. */
5073
5074 uschar cworkspace[COMPILE_WORK_SIZE];
5075
5076
5077 /* Set this early so that early errors get offset 0. */
5078
5079 ptr = (const uschar *)pattern;
5080
5081 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5082 can do is just return NULL, but we can set a code value if there is a code
5083 pointer. */
5084
5085 if (errorptr == NULL)
5086 {
5087 if (errorcodeptr != NULL) *errorcodeptr = 99;
5088 return NULL;
5089 }
5090
5091 *errorptr = NULL;
5092 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5093
5094 /* However, we can give a message for this error */
5095
5096 if (erroroffset == NULL)
5097 {
5098 errorcode = ERR16;
5099 goto PCRE_EARLY_ERROR_RETURN;
5100 }
5101
5102 *erroroffset = 0;
5103
5104 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5105
5106 #ifdef SUPPORT_UTF8
5107 utf8 = (options & PCRE_UTF8) != 0;
5108 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5109 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5110 {
5111 errorcode = ERR44;
5112 goto PCRE_UTF8_ERROR_RETURN;
5113 }
5114 #else
5115 if ((options & PCRE_UTF8) != 0)
5116 {
5117 errorcode = ERR32;
5118 goto PCRE_EARLY_ERROR_RETURN;
5119 }
5120 #endif
5121
5122 if ((options & ~PUBLIC_OPTIONS) != 0)
5123 {
5124 errorcode = ERR17;
5125 goto PCRE_EARLY_ERROR_RETURN;
5126 }
5127
5128 /* Set up pointers to the individual character tables */
5129
5130 if (tables == NULL) tables = _pcre_default_tables;
5131 cd->lcc = tables + lcc_offset;
5132 cd->fcc = tables + fcc_offset;
5133 cd->cbits = tables + cbits_offset;
5134 cd->ctypes = tables + ctypes_offset;
5135
5136 /* Handle different types of newline. The three bits give seven cases. The
5137 current code allows for fixed one- or two-byte sequences, plus "any". */
5138
5139 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5140 {
5141 case 0: newline = NEWLINE; break; /* Compile-time default */
5142 case PCRE_NEWLINE_CR: newline = '\r'; break;
5143 case PCRE_NEWLINE_LF: newline = '\n'; break;
5144 case PCRE_NEWLINE_CR+
5145 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5146 case PCRE_NEWLINE_ANY: newline = -1; break;
5147 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5148 }
5149
5150 if (newline < 0)
5151 {
5152 cd->nltype = NLTYPE_ANY;
5153 }
5154 else
5155 {
5156 cd->nltype = NLTYPE_FIXED;
5157 if (newline > 255)
5158 {
5159 cd->nllen = 2;
5160 cd->nl[0] = (newline >> 8) & 255;
5161 cd->nl[1] = newline & 255;
5162 }
5163 else
5164 {
5165 cd->nllen = 1;
5166 cd->nl[0] = newline;
5167 }
5168 }
5169
5170 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5171 references to help in deciding whether (.*) can be treated as anchored or not.
5172 */
5173
5174 cd->top_backref = 0;
5175 cd->backref_map = 0;
5176
5177 /* Reflect pattern for debugging output */
5178
5179 DPRINTF(("------------------------------------------------------------------\n"));
5180 DPRINTF(("%s\n", pattern));
5181
5182 /* Pretend to compile the pattern while actually just accumulating the length
5183 of memory required. This behaviour is triggered by passing a non-NULL final
5184 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5185 to compile parts of the pattern into; the compiled code is discarded when it is
5186 no longer needed, so hopefully this workspace will never overflow, though there
5187 is a test for its doing so. */
5188
5189 cd->bracount = 0;
5190 cd->names_found = 0;
5191 cd->name_entry_size = 0;
5192 cd->name_table = NULL;
5193 cd->start_workspace = cworkspace;
5194 cd->start_code = cworkspace;
5195 cd->hwm = cworkspace;
5196 cd->start_pattern = (const uschar *)pattern;
5197 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5198 cd->req_varyopt = 0;
5199 cd->nopartial = FALSE;
5200 cd->external_options = options;
5201
5202 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5203 don't need to look at the result of the function here. The initial options have
5204 been put into the cd block so that they can be changed if an option setting is
5205 found within the regex right at the beginning. Bringing initial option settings
5206 outside can help speed up starting point checks. */
5207
5208 code = cworkspace;
5209 *code = OP_BRA;
5210 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5211 &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
5212 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5213
5214 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5215 cd->hwm - cworkspace));
5216
5217 if (length > MAX_PATTERN_SIZE)
5218 {
5219 errorcode = ERR20;
5220 goto PCRE_EARLY_ERROR_RETURN;
5221 }
5222
5223 /* Compute the size of data block needed and get it, either from malloc or
5224 externally provided function. Integer overflow should no longer be possible
5225 because nowadays we limit the maximum value of cd->names_found and
5226 cd->name_entry_size. */
5227
5228 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5229 re = (real_pcre *)(pcre_malloc)(size);
5230
5231 if (re == NULL)
5232 {
5233 errorcode = ERR21;
5234 goto PCRE_EARLY_ERROR_RETURN;
5235 }
5236
5237 /* Put in the magic number, and save the sizes, initial options, and character
5238 table pointer. NULL is used for the default character tables. The nullpad field
5239 is at the end; it's there to help in the case when a regex compiled on a system
5240 with 4-byte pointers is run on another with 8-byte pointers. */
5241
5242 re->magic_number = MAGIC_NUMBER;
5243 re->size = size;
5244 re->options = cd->external_options;
5245 re->dummy1 = 0;
5246 re->first_byte = 0;
5247 re->req_byte = 0;
5248 re->name_table_offset = sizeof(real_pcre);
5249 re->name_entry_size = cd->name_entry_size;
5250 re->name_count = cd->names_found;
5251 re->ref_count = 0;
5252 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5253 re->nullpad = NULL;
5254
5255 /* The starting points of the name/number translation table and of the code are
5256 passed around in the compile data block. The start/end pattern and initial
5257 options are already set from the pre-compile phase, as is the name_entry_size
5258 field. Reset the bracket count and the names_found field. Also reset the hwm
5259 field; this time it's used for remembering forward references to subpatterns.
5260 */
5261
5262 cd->bracount = 0;
5263 cd->names_found = 0;
5264 cd->name_table = (uschar *)re + re->name_table_offset;
5265 codestart = cd->name_table + re->name_entry_size * re->name_count;
5266 cd->start_code = codestart;
5267 cd->hwm = cworkspace;
5268 cd->req_varyopt = 0;
5269 cd->nopartial = FALSE;
5270
5271 /* Set up a starting, non-extracting bracket, then compile the expression. On
5272 error, errorcode will be set non-zero, so we don't need to look at the result
5273 of the function here. */
5274
5275 ptr = (const uschar *)pattern;
5276 code = (uschar *)codestart;
5277 *code = OP_BRA;
5278 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5279 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5280 re->top_bracket = cd->bracount;
5281 re->top_backref = cd->top_backref;
5282
5283 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5284
5285 /* If not reached end of pattern on success, there's an excess bracket. */
5286
5287 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5288
5289 /* Fill in the terminating state and check for disastrous overflow, but
5290 if debugging, leave the test till after things are printed out. */
5291
5292 *code++ = OP_END;
5293
5294 #ifndef DEBUG
5295 if (code - codestart > length) errorcode = ERR23;
5296 #endif
5297
5298 /* Fill in any forward references that are required. */
5299
5300 while (errorcode == 0 && cd->hwm > cworkspace)
5301 {
5302 int offset, recno;
5303 const uschar *groupptr;
5304 cd->hwm -= LINK_SIZE;
5305 offset = GET(cd->hwm, 0);
5306 recno = GET(codestart, offset);
5307 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5308 if (groupptr == NULL) errorcode = ERR53;
5309 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5310 }
5311
5312 /* Give an error if there's back reference to a non-existent capturing
5313 subpattern. */
5314
5315 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5316
5317 /* Failed to compile, or error while post-processing */
5318
5319 if (errorcode != 0)
5320 {
5321 (pcre_free)(re);
5322 PCRE_EARLY_ERROR_RETURN:
5323 *erroroffset = ptr - (const uschar *)pattern;
5324 #ifdef SUPPORT_UTF8
5325 PCRE_UTF8_ERROR_RETURN:
5326 #endif
5327 *errorptr = error_texts[errorcode];
5328 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5329 return NULL;
5330 }
5331
5332 /* If the anchored option was not passed, set the flag if we can determine that
5333 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5334 as starting with .* when DOTALL is set).
5335
5336 Otherwise, if we know what the first byte has to be, save it, because that
5337 speeds up unanchored matches no end. If not, see if we can set the
5338 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5339 start with ^. and also when all branches start with .* for non-DOTALL matches.
5340 */
5341
5342 if ((re->options & PCRE_ANCHORED) == 0)
5343 {
5344 int temp_options = re->options; /* May get changed during these scans */
5345 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5346 re->options |= PCRE_ANCHORED;
5347 else
5348 {
5349 if (firstbyte < 0)
5350 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5351 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5352 {
5353 int ch = firstbyte & 255;
5354 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5355 cd->fcc[ch] == ch)? ch : firstbyte;
5356 re->options |= PCRE_FIRSTSET;
5357 }
5358 else if (is_startline(codestart, 0, cd->backref_map))
5359 re->options |= PCRE_STARTLINE;
5360 }
5361 }
5362
5363 /* For an anchored pattern, we use the "required byte" only if it follows a
5364 variable length item in the regex. Remove the caseless flag for non-caseable
5365 bytes. */
5366
5367 if (reqbyte >= 0 &&
5368 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5369 {
5370 int ch = reqbyte & 255;
5371 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5372 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5373 re->options |= PCRE_REQCHSET;
5374 }
5375
5376 /* Print out the compiled data if debugging is enabled. This is never the
5377 case when building a production library. */
5378
5379 #ifdef DEBUG
5380
5381 printf("Length = %d top_bracket = %d top_backref = %d\n",
5382 length, re->top_bracket, re->top_backref);
5383
5384 if (re->options != 0)
5385 {
5386 printf("%s%s%s%s%s%s%s%s%s\n",
5387 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5388 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5389 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5390 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5391 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5392 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5393 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5394 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5395 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5396 }
5397
5398 if ((re->options & PCRE_FIRSTSET) != 0)
5399 {
5400 int ch = re->first_byte & 255;
5401 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5402 "" : " (caseless)";
5403 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5404 else printf("First char = \\x%02x%s\n", ch, caseless);
5405 }
5406
5407 if ((re->options & PCRE_REQCHSET) != 0)
5408 {
5409 int ch = re->req_byte & 255;
5410 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5411 "" : " (caseless)";
5412 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5413 else printf("Req char = \\x%02x%s\n", ch, caseless);
5414 }
5415
5416 pcre_printint(re, stdout);
5417
5418 /* This check is done here in the debugging case so that the code that
5419 was compiled can be seen. */
5420
5421 if (code - codestart > length)
5422 {
5423 (pcre_free)(re);
5424 *errorptr = error_texts[ERR23];
5425 *erroroffset = ptr - (uschar *)pattern;
5426 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5427 return NULL;
5428 }
5429 #endif /* DEBUG */
5430
5431 return (pcre *)re;
5432 }
5433
5434 /* End of pcre_compile.c */