tin  2.4.5
About: TIN is a threaded NNTP and spool based UseNet newsreader.
  Fossies Dox: tin-2.4.5.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

pcre_dfa_exec.c
Go to the documentation of this file.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8  Written by Philip Hazel
9  Copyright (c) 1997-2006 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15  * Redistributions of source code must retain the above copyright notice,
16  this list of conditions and the following disclaimer.
17 
18  * Redistributions in binary form must reproduce the above copyright
19  notice, this list of conditions and the following disclaimer in the
20  documentation and/or other materials provided with the distribution.
21 
22  * Neither the name of the University of Cambridge nor the names of its
23  contributors may be used to endorse or promote products derived from
24  this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
45 
46 
/* Names that this module maps onto its own variables. They are defined ahead
of the inclusion of pcre_internal.h, presumably so that shared macros in that
header (for example the newline tests used by the matcher) pick up this
module's match-data block and subject bounds - confirm against the header. */

#define NLBLOCK md             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */
50 
51 #include "pcre_internal.h"
52 
53 
54 /* For use to indent debugging output */
55 
/* Padding for debug output. The debugging format strings print a prefix of
this string with "%.*s" and a precision of rlevel*2-2, so it has to be a run of
spaces at least as long as the deepest indent wanted; with a one-character
string every indent would be capped at a single column. Twenty spaces cover
eleven nesting levels, beyond which the indent simply stops growing. */

#define SP "                    "
57 
58 
59 
60 /*************************************************
61 * Code parameters and static tables *
62 *************************************************/
63 
64 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
65 into others, under special conditions. A gap of 20 between the blocks should be
66 enough. */
67 
/* Offsets added to a Type-repeat opcode to form a distinct "virtual" opcode
when its argument is a property test (\p, \P), an extended Unicode sequence
(\X) or an any-newline test. The three blocks are spaced 20 apart so that the
shifted values cannot run into one another. */

#define OP_PROP_EXTRA    100
#define OP_EXTUNI_EXTRA  120
#define OP_ANYNL_EXTRA   140
71 
72 
73 /* This table identifies those opcodes that are followed immediately by a
74 character that is to be tested in some way. This makes it possible to
75 centralize the loading of these characters. In the case of Type * etc, the
76 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
77 small value. */
78 
79 static uschar coptable[] = {
80  0, /* End */
81  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
82  0, 0, /* Any, Anybyte */
83  0, 0, 0, 0, /* NOTPROP, PROP, EXTUNI, ANYNL */
84  0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
85  1, /* Char */
86  1, /* Charnc */
87  1, /* not */
88  /* Positive single-char repeats */
89  1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
90  3, 3, 3, /* upto, minupto, exact */
91  1, 1, 1, 3, /* *+, ++, ?+, upto+ */
92  /* Negative single-char repeats - only for chars < 256 */
93  1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
94  3, 3, 3, /* NOT upto, minupto, exact */
95  1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
96  /* Positive type repeats */
97  1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
98  3, 3, 3, /* Type upto, minupto, exact */
99  1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
100  /* Character class & ref repeats */
101  0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
102  0, 0, /* CRRANGE, CRMINRANGE */
103  0, /* CLASS */
104  0, /* NCLASS */
105  0, /* XCLASS - variable length */
106  0, /* REF */
107  0, /* RECURSE */
108  0, /* CALLOUT */
109  0, /* Alt */
110  0, /* Ket */
111  0, /* KetRmax */
112  0, /* KetRmin */
113  0, /* Assert */
114  0, /* Assert not */
115  0, /* Assert behind */
116  0, /* Assert behind not */
117  0, /* Reverse */
118  0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
119  0, 0, 0, /* SBRA, SCBRA, SCOND */
120  0, /* CREF */
121  0, /* RREF */
122  0, /* DEF */
123  0, 0 /* BRAZERO, BRAMINZERO */
124 };
125 
126 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
127 and \w */
128 
129 static uschar toptable1[] = {
130  0, 0, 0, 0, 0,
134  0 /* OP_ANY */
135 };
136 
137 static uschar toptable2[] = {
138  0, 0, 0, 0, 0,
139  ctype_digit, 0,
140  ctype_space, 0,
141  ctype_word, 0,
142  1 /* OP_ANY */
143 };
144 
145 
146 /* Structure for holding data about a particular state, which is in effect the
147 current data for an active path through the match tree. It must consist
148 entirely of ints because the working vector we are passed, and which we put
149 these structures in, is a vector of ints. */
150 
/* One active path through the match tree. Every member is an int so that an
array of these can be laid directly over the caller's int workspace vector. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode; a negative value marks
                                     a state that is held back while the
                                     characters counted in data are skipped */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of workspace ints taken up by one state; used to convert the size of
the workspace into a number of states. */

#define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
159 
160 
161 #ifdef DEBUG
162 /*************************************************
163 * Print character string *
164 *************************************************/
165 
166 /* Character string printing function for debugging.
167 
168 Arguments:
169  p points to string
170  length number of bytes
171  f where to print
172 
173 Returns: nothing
174 */
175 
/* Write length bytes starting at p to the stream f. A byte for which isprint()
is true is written as itself; any other byte is written as a backslash, 'x' and
two lower-case hex digits. Nothing is written when length is zero or
negative. */

static void
pchars(unsigned char *p, int length, FILE *f)
{
int c;
while (length-- > 0)
  {
  c = *p++;                      /* unsigned char, so always valid for isprint */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
188 #endif
189 
190 
191 
192 /*************************************************
193 * Execute a Regular Expression - DFA engine *
194 *************************************************/
195 
196 /* This internal function applies a compiled pattern to a subject string,
197 starting at a given point, using a DFA engine. This function is called from the
198 external one, possibly multiple times if the pattern is not anchored. The
199 function calls itself recursively for some kinds of subpattern.
200 
201 Arguments:
202  md the match_data block with fixed information
203  this_start_code the opening bracket of this subexpression's code
204  current_subject where we currently are in the subject string
205  start_offset start offset in the subject string
206  offsets vector to contain the matching string offsets
207  offsetcount size of same
208  workspace vector of workspace
209  wscount size of same
210  ims the current ims flags
211  rlevel function call recursion level
212  recursing regex recursive call level
213 
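214 Returns: > 0 => number of matches whose offset pairs were stored (longest first)
215  = 0 => matched, but offsets was too small to hold all the matches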
216  -1 => failed to match
217  < -1 => some kind of unexpected problem
218 
219 The following macros are used for adding states to the two state vectors (one
220 for the current character, one for the following character). */
221 
/* State-list helpers. Each macro expands to a complete if/else statement that
must be followed by a semicolon. They use these variables of the enclosing
function implicitly: the list counter (active_count or new_count), the list
write pointer (next_active_state or next_new_state), the list capacity wscount,
the current ims flags, and rlevel for the debug indent. When the list is
already full the enclosing function returns PCRE_ERROR_DFA_WSSIZE. The counter
is incremented as part of the capacity test, whether or not it succeeds.

ADD_ACTIVE appends a state with opcode offset x and repeat count y to the list
for the current character. The data member of the new entry is left as it
was. */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* As ADD_ACTIVE, but also stores z in the data member. */

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* Appends a state with opcode offset x and repeat count y to the list for the
following character. The data member of the new entry is left as it was. */

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* As ADD_NEW, but also stores z in the data member. */

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
267 
268 /* And now, here is the code */
269 
270 static int
272  dfa_match_data *md,
273  const uschar *this_start_code,
274  const uschar *current_subject,
275  int start_offset,
276  int *offsets,
277  int offsetcount,
278  int *workspace,
279  int wscount,
280  int ims,
281  int rlevel,
282  int recursing)
283 {
284 stateblock *active_states, *new_states, *temp_states;
285 stateblock *next_active_state, *next_new_state;
286 
287 const uschar *ctypes, *lcc, *fcc;
288 const uschar *ptr;
289 const uschar *end_code, *first_op;
290 
291 int active_count, new_count, match_count;
292 
293 /* Some fields in the md block are frequently referenced, so we load them into
294 independent variables in the hope that this will perform better. */
295 
296 const uschar *start_subject = md->start_subject;
297 const uschar *end_subject = md->end_subject;
298 const uschar *start_code = md->start_code;
299 
300 #ifdef SUPPORT_UTF8
301 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
302 #else
303 BOOL utf8 = FALSE;
304 #endif
305 
306 rlevel++;
307 offsetcount &= (-2);
308 
309 wscount -= 2;
310 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
311  (2 * INTS_PER_STATEBLOCK);
312 
313 DPRINTF(("\n%.*s---------------------\n"
314  "%.*sCall to internal_dfa_exec f=%d r=%d\n",
315  rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
316 
317 ctypes = md->tables + ctypes_offset;
318 lcc = md->tables + lcc_offset;
319 fcc = md->tables + fcc_offset;
320 
321 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
322 
323 active_states = (stateblock *)(workspace + 2);
324 next_new_state = new_states = active_states + wscount;
325 new_count = 0;
326 
327 first_op = this_start_code + 1 + LINK_SIZE +
328  ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
329 
330 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
331 the alternative states onto the list, and find out where the end is. This
332 makes it possible to use this function recursively, when we want to stop at a
333 matching internal ket rather than at the end.
334 
335 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
336 a backward assertion. In that case, we have to find out the maximum amount to
337 move back, and set up each alternative appropriately. */
338 
339 if (*first_op == OP_REVERSE)
340  {
341  int max_back = 0;
342  int gone_back;
343 
344  end_code = this_start_code;
345  do
346  {
347  int back = GET(end_code, 2+LINK_SIZE);
348  if (back > max_back) max_back = back;
349  end_code += GET(end_code, 1);
350  }
351  while (*end_code == OP_ALT);
352 
353  /* If we can't go back the amount required for the longest lookbehind
354  pattern, go back as far as we can; some alternatives may still be viable. */
355 
356 #ifdef SUPPORT_UTF8
357  /* In character mode we have to step back character by character */
358 
359  if (utf8)
360  {
361  for (gone_back = 0; gone_back < max_back; gone_back++)
362  {
363  if (current_subject <= start_subject) break;
364  current_subject--;
365  while (current_subject > start_subject &&
366  (*current_subject & 0xc0) == 0x80)
367  current_subject--;
368  }
369  }
370  else
371 #endif
372 
373  /* In byte-mode we can do this quickly. */
374 
375  {
376  gone_back = (current_subject - max_back < start_subject)?
377  current_subject - start_subject : max_back;
378  current_subject -= gone_back;
379  }
380 
381  /* Now we can process the individual branches. */
382 
383  end_code = this_start_code;
384  do
385  {
386  int back = GET(end_code, 2+LINK_SIZE);
387  if (back <= gone_back)
388  {
389  int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
390  ADD_NEW_DATA(-bstate, 0, gone_back - back);
391  }
392  end_code += GET(end_code, 1);
393  }
394  while (*end_code == OP_ALT);
395  }
396 
397 /* This is the code for a "normal" subpattern (not a backward assertion). The
398 start of a whole pattern is always one of these. If we are at the top level,
399 we may be asked to restart matching from the same point that we reached for a
400 previous partial match. We still have to scan through the top-level branches to
401 find the end state. */
402 
403 else
404  {
405  end_code = this_start_code;
406 
407  /* Restarting */
408 
409  if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
410  {
411  do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
412  new_count = workspace[1];
413  if (!workspace[0])
414  memcpy(new_states, active_states, new_count * sizeof(stateblock));
415  }
416 
417  /* Not restarting */
418 
419  else
420  {
421  int length = 1 + LINK_SIZE +
422  ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
423  do
424  {
425  ADD_NEW(end_code - start_code + length, 0);
426  end_code += GET(end_code, 1);
427  length = 1 + LINK_SIZE;
428  }
429  while (*end_code == OP_ALT);
430  }
431  }
432 
433 workspace[0] = 0; /* Bit indicating which vector is current */
434 
435 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
436 
437 /* Loop for scanning the subject */
438 
439 ptr = current_subject;
440 for (;;)
441  {
442  int i, j;
443  int clen, dlen;
444  unsigned int c, d;
445 
446  /* Make the new state list into the active state list and empty the
447  new state list. */
448 
449  temp_states = active_states;
450  active_states = new_states;
451  new_states = temp_states;
452  active_count = new_count;
453  new_count = 0;
454 
455  workspace[0] ^= 1; /* Remember for the restarting feature */
456  workspace[1] = active_count;
457 
458 #ifdef DEBUG
459  printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
460  pchars((uschar *)ptr, strlen((char *)ptr), stdout);
461  printf("\"\n");
462 
463  printf("%.*sActive states: ", rlevel*2-2, SP);
464  for (i = 0; i < active_count; i++)
465  printf("%d/%d ", active_states[i].offset, active_states[i].count);
466  printf("\n");
467 #endif
468 
469  /* Set the pointers for adding new states */
470 
471  next_active_state = active_states + active_count;
472  next_new_state = new_states;
473 
474  /* Load the current character from the subject outside the loop, as many
475  different states may want to look at it, and we assume that at least one
476  will. */
477 
478  if (ptr < end_subject)
479  {
480  clen = 1; /* Number of bytes in the character */
481 #ifdef SUPPORT_UTF8
482  if (utf8) { GETCHARLEN(c, ptr, clen); } else
483 #endif /* SUPPORT_UTF8 */
484  c = *ptr;
485  }
486  else
487  {
488  clen = 0; /* This indicates the end of the subject */
489  c = NOTACHAR; /* This value should never actually be used */
490  }
491 
492  /* Scan up the active states and act on each one. The result of an action
493  may be to add more states to the currently active list (e.g. on hitting a
494  parenthesis) or it may be to put states on the new list, for considering
495  when we move the character pointer on. */
496 
497  for (i = 0; i < active_count; i++)
498  {
499  stateblock *current_state = active_states + i;
500  const uschar *code;
501  int state_offset = current_state->offset;
502  int count, codevalue;
503  int chartype, script;
504 
505 #ifdef DEBUG
506  printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
507  if (clen == 0) printf("EOL\n");
508  else if (c > 32 && c < 127) printf("'%c'\n", c);
509  else printf("0x%02x\n", c);
510 #endif
511 
512  /* This variable is referred to implicitly in the ADD_xxx macros. */
513 
514  ims = current_state->ims;
515 
516  /* A negative offset is a special case meaning "hold off going to this
517  (negated) state until the number of characters in the data field have
518  been skipped". */
519 
520  if (state_offset < 0)
521  {
522  if (current_state->data > 0)
523  {
524  DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
525  ADD_NEW_DATA(state_offset, current_state->count,
526  current_state->data - 1);
527  continue;
528  }
529  else
530  {
531  current_state->offset = state_offset = -state_offset;
532  }
533  }
534 
535  /* Check for a duplicate state with the same count, and skip if found. */
536 
537  for (j = 0; j < i; j++)
538  {
539  if (active_states[j].offset == state_offset &&
540  active_states[j].count == current_state->count)
541  {
542  DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
543  goto NEXT_ACTIVE_STATE;
544  }
545  }
546 
547  /* The state offset is the offset to the opcode */
548 
549  code = start_code + state_offset;
550  codevalue = *code;
551 
552  /* If this opcode is followed by an inline character, load it. It is
553  tempting to test for the presence of a subject character here, but that
554  is wrong, because sometimes zero repetitions of the subject are
555  permitted.
556 
557  We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
558  argument that is not a data character - but is always one byte long.
559  Unfortunately, we have to take special action to deal with \P, \p, and
560  \X in this case. To keep the other cases fast, convert these ones to new
561  opcodes. */
562 
563  if (coptable[codevalue] > 0)
564  {
565  dlen = 1;
566 #ifdef SUPPORT_UTF8
567  if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
568 #endif /* SUPPORT_UTF8 */
569  d = code[coptable[codevalue]];
570  if (codevalue >= OP_TYPESTAR)
571  {
572  switch(d)
573  {
574  case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
575  case OP_NOTPROP:
576  case OP_PROP: codevalue += OP_PROP_EXTRA; break;
577  case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
578  case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
579  default: break;
580  }
581  }
582  }
583  else
584  {
585  dlen = 0; /* Not strictly necessary, but compilers moan */
586  d = NOTACHAR; /* if these variables are not set. */
587  }
588 
589 
590  /* Now process the individual opcodes */
591 
592  switch (codevalue)
593  {
594 
595 /* ========================================================================== */
596  /* Reached a closing bracket. If not at the end of the pattern, carry
597  on with the next opcode. Otherwise, unless we have an empty string and
598  PCRE_NOTEMPTY is set, save the match data, shifting up all previous
599  matches so we always have the longest first. */
600 
601  case OP_KET:
602  case OP_KETRMIN:
603  case OP_KETRMAX:
604  if (code != end_code)
605  {
606  ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
607  if (codevalue != OP_KET)
608  {
609  ADD_ACTIVE(state_offset - GET(code, 1), 0);
610  }
611  }
612  else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
613  {
614  if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
615  else if (match_count > 0 && ++match_count * 2 >= offsetcount)
616  match_count = 0;
617  count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
618  if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
619  if (offsetcount >= 2)
620  {
621  offsets[0] = current_subject - start_subject;
622  offsets[1] = ptr - start_subject;
623  DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
624  offsets[1] - offsets[0], current_subject));
625  }
626  if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
627  {
628  DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
629  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
630  match_count, rlevel*2-2, SP));
631  return match_count;
632  }
633  }
634  break;
635 
636 /* ========================================================================== */
637  /* These opcodes add to the current list of states without looking
638  at the current character. */
639 
640  /*-----------------------------------------------------------------*/
641  case OP_ALT:
642  do { code += GET(code, 1); } while (*code == OP_ALT);
643  ADD_ACTIVE(code - start_code, 0);
644  break;
645 
646  /*-----------------------------------------------------------------*/
647  case OP_BRA:
648  case OP_SBRA:
649  do
650  {
651  ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
652  code += GET(code, 1);
653  }
654  while (*code == OP_ALT);
655  break;
656 
657  /*-----------------------------------------------------------------*/
658  case OP_CBRA:
659  case OP_SCBRA:
660  ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
661  code += GET(code, 1);
662  while (*code == OP_ALT)
663  {
664  ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
665  code += GET(code, 1);
666  }
667  break;
668 
669  /*-----------------------------------------------------------------*/
670  case OP_BRAZERO:
671  case OP_BRAMINZERO:
672  ADD_ACTIVE(state_offset + 1, 0);
673  code += 1 + GET(code, 2);
674  while (*code == OP_ALT) code += GET(code, 1);
675  ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
676  break;
677 
678  /*-----------------------------------------------------------------*/
679  case OP_CIRC:
680  if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
681  ((ims & PCRE_MULTILINE) != 0 &&
682  ptr != end_subject &&
683  WAS_NEWLINE(ptr)))
684  { ADD_ACTIVE(state_offset + 1, 0); }
685  break;
686 
687  /*-----------------------------------------------------------------*/
688  case OP_EOD:
689  if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
690  break;
691 
692  /*-----------------------------------------------------------------*/
693  case OP_OPT:
694  ims = code[1];
695  ADD_ACTIVE(state_offset + 2, 0);
696  break;
697 
698  /*-----------------------------------------------------------------*/
699  case OP_SOD:
700  if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
701  break;
702 
703  /*-----------------------------------------------------------------*/
704  case OP_SOM:
705  if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
706  break;
707 
708 
709 /* ========================================================================== */
710  /* These opcodes inspect the next subject character, and sometimes
711  the previous one as well, but do not have an argument. The variable
712  clen contains the length of the current character and is zero if we are
713  at the end of the subject. */
714 
715  /*-----------------------------------------------------------------*/
716  case OP_ANY:
717  if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
718  { ADD_NEW(state_offset + 1, 0); }
719  break;
720 
721  /*-----------------------------------------------------------------*/
722  case OP_EODN:
723  if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
724  { ADD_ACTIVE(state_offset + 1, 0); }
725  break;
726 
727  /*-----------------------------------------------------------------*/
728  case OP_DOLL:
729  if ((md->moptions & PCRE_NOTEOL) == 0)
730  {
731  if (clen == 0 ||
732  (IS_NEWLINE(ptr) &&
733  ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
734  ))
735  { ADD_ACTIVE(state_offset + 1, 0); }
736  }
737  else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
738  { ADD_ACTIVE(state_offset + 1, 0); }
739  break;
740 
741  /*-----------------------------------------------------------------*/
742 
743  case OP_DIGIT:
744  case OP_WHITESPACE:
745  case OP_WORDCHAR:
746  if (clen > 0 && c < 256 &&
747  ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
748  { ADD_NEW(state_offset + 1, 0); }
749  break;
750 
751  /*-----------------------------------------------------------------*/
752  case OP_NOT_DIGIT:
753  case OP_NOT_WHITESPACE:
754  case OP_NOT_WORDCHAR:
755  if (clen > 0 && (c >= 256 ||
756  ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
757  { ADD_NEW(state_offset + 1, 0); }
758  break;
759 
760  /*-----------------------------------------------------------------*/
761  case OP_WORD_BOUNDARY:
763  {
764  int left_word, right_word;
765 
766  if (ptr > start_subject)
767  {
768  const uschar *temp = ptr - 1;
769 #ifdef SUPPORT_UTF8
770  if (utf8) BACKCHAR(temp);
771 #endif
772  GETCHARTEST(d, temp);
773  left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
774  }
775  else left_word = 0;
776 
777  if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
778  else right_word = 0;
779 
780  if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
781  { ADD_ACTIVE(state_offset + 1, 0); }
782  }
783  break;
784 
785 
786 #ifdef SUPPORT_UCP
787 
788  /*-----------------------------------------------------------------*/
789  /* Check the next character by Unicode property. We will get here only
790  if the support is in the binary; otherwise a compile-time error occurs.
791  */
792 
793  case OP_PROP:
794  case OP_NOTPROP:
795  if (clen > 0)
796  {
797  BOOL OK;
798  int category = _pcre_ucp_findprop(c, &chartype, &script);
799  switch(code[1])
800  {
801  case PT_ANY:
802  OK = TRUE;
803  break;
804 
805  case PT_LAMP:
806  OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
807  break;
808 
809  case PT_GC:
810  OK = category == code[2];
811  break;
812 
813  case PT_PC:
814  OK = chartype == code[2];
815  break;
816 
817  case PT_SC:
818  OK = script == code[2];
819  break;
820 
821  /* Should never occur, but keep compilers from grumbling. */
822 
823  default:
824  OK = codevalue != OP_PROP;
825  break;
826  }
827 
828  if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
829  }
830  break;
831 #endif
832 
833 
834 
835 /* ========================================================================== */
836  /* These opcodes likewise inspect the subject character, but have an
837  argument that is not a data character. It is one of these opcodes:
838  OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
839  OP_NOT_WORDCHAR. The value is loaded into d. */
840 
841  case OP_TYPEPLUS:
842  case OP_TYPEMINPLUS:
843  case OP_TYPEPOSPLUS:
844  count = current_state->count; /* Already matched */
845  if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
846  if (clen > 0)
847  {
848  if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
849  (c < 256 &&
850  (d != OP_ANY ||
851  (ims & PCRE_DOTALL) != 0 ||
852  !IS_NEWLINE(ptr)
853  ) &&
854  ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
855  {
856  if (count > 0 && codevalue == OP_TYPEPOSPLUS)
857  {
858  active_count--; /* Remove non-match possibility */
859  next_active_state--;
860  }
861  count++;
862  ADD_NEW(state_offset, count);
863  }
864  }
865  break;
866 
867  /*-----------------------------------------------------------------*/
868  case OP_TYPEQUERY:
869  case OP_TYPEMINQUERY:
870  case OP_TYPEPOSQUERY:
871  ADD_ACTIVE(state_offset + 2, 0);
872  if (clen > 0)
873  {
874  if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
875  (c < 256 &&
876  (d != OP_ANY ||
877  (ims & PCRE_DOTALL) != 0 ||
878  !IS_NEWLINE(ptr)
879  ) &&
880  ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
881  {
882  if (codevalue == OP_TYPEPOSQUERY)
883  {
884  active_count--; /* Remove non-match possibility */
885  next_active_state--;
886  }
887  ADD_NEW(state_offset + 2, 0);
888  }
889  }
890  break;
891 
892  /*-----------------------------------------------------------------*/
893  case OP_TYPESTAR:
894  case OP_TYPEMINSTAR:
895  case OP_TYPEPOSSTAR:
896  ADD_ACTIVE(state_offset + 2, 0);
897  if (clen > 0)
898  {
899  if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
900  (c < 256 &&
901  (d != OP_ANY ||
902  (ims & PCRE_DOTALL) != 0 ||
903  !IS_NEWLINE(ptr)
904  ) &&
905  ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
906  {
907  if (codevalue == OP_TYPEPOSSTAR)
908  {
909  active_count--; /* Remove non-match possibility */
910  next_active_state--;
911  }
912  ADD_NEW(state_offset, 0);
913  }
914  }
915  break;
916 
917  /*-----------------------------------------------------------------*/
918  case OP_TYPEEXACT:
919  count = current_state->count; /* Number already matched */
920  if (clen > 0)
921  {
922  if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
923  (c < 256 &&
924  (d != OP_ANY ||
925  (ims & PCRE_DOTALL) != 0 ||
926  !IS_NEWLINE(ptr)
927  ) &&
928  ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
929  {
930  if (++count >= GET2(code, 1))
931  { ADD_NEW(state_offset + 4, 0); }
932  else
933  { ADD_NEW(state_offset, count); }
934  }
935  }
936  break;
937 
938  /*-----------------------------------------------------------------*/
939  case OP_TYPEUPTO:
940  case OP_TYPEMINUPTO:
941  case OP_TYPEPOSUPTO:
942  ADD_ACTIVE(state_offset + 4, 0);
943  count = current_state->count; /* Number already matched */
944  if (clen > 0)
945  {
946  if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
947  (c < 256 &&
948  (d != OP_ANY ||
949  (ims & PCRE_DOTALL) != 0 ||
950  !IS_NEWLINE(ptr)
951  ) &&
952  ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
953  {
954  if (codevalue == OP_TYPEPOSUPTO)
955  {
956  active_count--; /* Remove non-match possibility */
957  next_active_state--;
958  }
959  if (++count >= GET2(code, 1))
960  { ADD_NEW(state_offset + 4, 0); }
961  else
962  { ADD_NEW(state_offset, count); }
963  }
964  }
965  break;
966 
967 /* ========================================================================== */
968  /* These are virtual opcodes that are used when something like
969  OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
970  argument. It keeps the code above fast for the other cases. The argument
971  is in the d variable. */
972 
973  case OP_PROP_EXTRA + OP_TYPEPLUS:
976  count = current_state->count; /* Already matched */
977  if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
978  if (clen > 0)
979  {
980  BOOL OK;
981  int category = _pcre_ucp_findprop(c, &chartype, &script);
982  switch(code[2])
983  {
984  case PT_ANY:
985  OK = TRUE;
986  break;
987 
988  case PT_LAMP:
989  OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
990  break;
991 
992  case PT_GC:
993  OK = category == code[3];
994  break;
995 
996  case PT_PC:
997  OK = chartype == code[3];
998  break;
999 
1000  case PT_SC:
1001  OK = script == code[3];
1002  break;
1003 
1004  /* Should never occur, but keep compilers from grumbling. */
1005 
1006  default:
1007  OK = codevalue != OP_PROP;
1008  break;
1009  }
1010 
1011  if (OK == (d == OP_PROP))
1012  {
1013  if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1014  {
1015  active_count--; /* Remove non-match possibility */
1016  next_active_state--;
1017  }
1018  count++;
1019  ADD_NEW(state_offset, count);
1020  }
1021  }
1022  break;
1023 
1024  /*-----------------------------------------------------------------*/
1028  count = current_state->count; /* Already matched */
1029  if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1030  if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1031  {
1032  const uschar *nptr = ptr + clen;
1033  int ncount = 0;
1034  if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1035  {
1036  active_count--; /* Remove non-match possibility */
1037  next_active_state--;
1038  }
1039  while (nptr < end_subject)
1040  {
1041  int nd;
1042  int ndlen = 1;
1043  GETCHARLEN(nd, nptr, ndlen);
1044  if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1045  ncount++;
1046  nptr += ndlen;
1047  }
1048  count++;
1049  ADD_NEW_DATA(-state_offset, count, ncount);
1050  }
1051  break;
1052 
1053  /*-----------------------------------------------------------------*/
1054  case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1057  count = current_state->count; /* Already matched */
1058  if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1059  if (clen > 0)
1060  {
1061  int ncount = 0;
1062  switch (c)
1063  {
1064  case 0x000d:
1065  if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1066  /* Fall through */
1067  case 0x000a:
1068  case 0x000b:
1069  case 0x000c:
1070  case 0x0085:
1071  case 0x2028:
1072  case 0x2029:
1073  if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1074  {
1075  active_count--; /* Remove non-match possibility */
1076  next_active_state--;
1077  }
1078  count++;
1079  ADD_NEW_DATA(-state_offset, count, ncount);
1080  break;
1081  default:
1082  break;
1083  }
1084  }
1085  break;
1086 
1087  /*-----------------------------------------------------------------*/
1088  case OP_PROP_EXTRA + OP_TYPEQUERY:
1091  count = 4;
1092  goto QS1;
1093 
1094  case OP_PROP_EXTRA + OP_TYPESTAR:
1097  count = 0;
1098 
1099  QS1:
1100 
1101  ADD_ACTIVE(state_offset + 4, 0);
1102  if (clen > 0)
1103  {
1104  BOOL OK;
1105  int category = _pcre_ucp_findprop(c, &chartype, &script);
1106  switch(code[2])
1107  {
1108  case PT_ANY:
1109  OK = TRUE;
1110  break;
1111 
1112  case PT_LAMP:
1113  OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1114  break;
1115 
1116  case PT_GC:
1117  OK = category == code[3];
1118  break;
1119 
1120  case PT_PC:
1121  OK = chartype == code[3];
1122  break;
1123 
1124  case PT_SC:
1125  OK = script == code[3];
1126  break;
1127 
1128  /* Should never occur, but keep compilers from grumbling. */
1129 
1130  default:
1131  OK = codevalue != OP_PROP;
1132  break;
1133  }
1134 
1135  if (OK == (d == OP_PROP))
1136  {
1137  if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1138  codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1139  {
1140  active_count--; /* Remove non-match possibility */
1141  next_active_state--;
1142  }
1143  ADD_NEW(state_offset + count, 0);
1144  }
1145  }
1146  break;
1147 
1148  /*-----------------------------------------------------------------*/
1152  count = 2;
1153  goto QS2;
1154 
1158  count = 0;
1159 
1160  QS2:
1161 
1162  ADD_ACTIVE(state_offset + 2, 0);
1163  if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1164  {
1165  const uschar *nptr = ptr + clen;
1166  int ncount = 0;
1167  if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1168  codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1169  {
1170  active_count--; /* Remove non-match possibility */
1171  next_active_state--;
1172  }
1173  while (nptr < end_subject)
1174  {
1175  int nd;
1176  int ndlen = 1;
1177  GETCHARLEN(nd, nptr, ndlen);
1178  if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1179  ncount++;
1180  nptr += ndlen;
1181  }
1182  ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1183  }
1184  break;
1185 
1186  /*-----------------------------------------------------------------*/
1190  count = 2;
1191  goto QS3;
1192 
1193  case OP_ANYNL_EXTRA + OP_TYPESTAR:
1196  count = 0;
1197 
1198  QS3:
1199  ADD_ACTIVE(state_offset + 2, 0);
1200  if (clen > 0)
1201  {
1202  int ncount = 0;
1203  switch (c)
1204  {
1205  case 0x000d:
1206  if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1207  /* Fall through */
1208  case 0x000a:
1209  case 0x000b:
1210  case 0x000c:
1211  case 0x0085:
1212  case 0x2028:
1213  case 0x2029:
1214  if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1215  codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1216  {
1217  active_count--; /* Remove non-match possibility */
1218  next_active_state--;
1219  }
1220  ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1221  break;
1222  default:
1223  break;
1224  }
1225  }
1226  break;
1227 
1228  /*-----------------------------------------------------------------*/
1229  case OP_PROP_EXTRA + OP_TYPEEXACT:
1230  case OP_PROP_EXTRA + OP_TYPEUPTO:
1233  if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1234  { ADD_ACTIVE(state_offset + 6, 0); }
1235  count = current_state->count; /* Number already matched */
1236  if (clen > 0)
1237  {
1238  BOOL OK;
1239  int category = _pcre_ucp_findprop(c, &chartype, &script);
1240  switch(code[4])
1241  {
1242  case PT_ANY:
1243  OK = TRUE;
1244  break;
1245 
1246  case PT_LAMP:
1247  OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1248  break;
1249 
1250  case PT_GC:
1251  OK = category == code[5];
1252  break;
1253 
1254  case PT_PC:
1255  OK = chartype == code[5];
1256  break;
1257 
1258  case PT_SC:
1259  OK = script == code[5];
1260  break;
1261 
1262  /* Should never occur, but keep compilers from grumbling. */
1263 
1264  default:
1265  OK = codevalue != OP_PROP;
1266  break;
1267  }
1268 
1269  if (OK == (d == OP_PROP))
1270  {
1271  if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1272  {
1273  active_count--; /* Remove non-match possibility */
1274  next_active_state--;
1275  }
1276  if (++count >= GET2(code, 1))
1277  { ADD_NEW(state_offset + 6, 0); }
1278  else
1279  { ADD_NEW(state_offset, count); }
1280  }
1281  }
1282  break;
1283 
1284  /*-----------------------------------------------------------------*/
1289  if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1290  { ADD_ACTIVE(state_offset + 4, 0); }
1291  count = current_state->count; /* Number already matched */
1292  if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1293  {
1294  const uschar *nptr = ptr + clen;
1295  int ncount = 0;
1296  if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1297  {
1298  active_count--; /* Remove non-match possibility */
1299  next_active_state--;
1300  }
1301  while (nptr < end_subject)
1302  {
1303  int nd;
1304  int ndlen = 1;
1305  GETCHARLEN(nd, nptr, ndlen);
1306  if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1307  ncount++;
1308  nptr += ndlen;
1309  }
1310  if (++count >= GET2(code, 1))
1311  { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1312  else
1313  { ADD_NEW_DATA(-state_offset, count, ncount); }
1314  }
1315  break;
1316 
1317  /*-----------------------------------------------------------------*/
1319  case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1322  if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1323  { ADD_ACTIVE(state_offset + 4, 0); }
1324  count = current_state->count; /* Number already matched */
1325  if (clen > 0)
1326  {
1327  int ncount = 0;
1328  switch (c)
1329  {
1330  case 0x000d:
1331  if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1332  /* Fall through */
1333  case 0x000a:
1334  case 0x000b:
1335  case 0x000c:
1336  case 0x0085:
1337  case 0x2028:
1338  case 0x2029:
1339  if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1340  {
1341  active_count--; /* Remove non-match possibility */
1342  next_active_state--;
1343  }
1344  if (++count >= GET2(code, 1))
1345  { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1346  else
1347  { ADD_NEW_DATA(-state_offset, count, ncount); }
1348  break;
1349  default:
1350  break;
1351  }
1352  }
1353  break;
1354 
1355 /* ========================================================================== */
1356  /* These opcodes are followed by a character that is usually compared
1357  to the current subject character; it is loaded into d. We still get
1358  here even if there is no subject character, because in some cases zero
1359  repetitions are permitted. */
1360 
1361  /*-----------------------------------------------------------------*/
1362  case OP_CHAR:
1363  if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1364  break;
1365 
1366  /*-----------------------------------------------------------------*/
1367  case OP_CHARNC:
1368  if (clen == 0) break;
1369 
1370 #ifdef SUPPORT_UTF8
1371  if (utf8)
1372  {
1373  if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1374  {
1375  unsigned int othercase;
1376  if (c < 128) othercase = fcc[c]; else
1377 
1378  /* If we have Unicode property support, we can use it to test the
1379  other case of the character. */
1380 
1381 #ifdef SUPPORT_UCP
1382  othercase = _pcre_ucp_othercase(c);
1383 #else
1384  othercase = NOTACHAR;
1385 #endif
1386 
1387  if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1388  }
1389  }
1390  else
1391 #endif /* SUPPORT_UTF8 */
1392 
1393  /* Non-UTF-8 mode */
1394  {
1395  if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1396  }
1397  break;
1398 
1399 
1400 #ifdef SUPPORT_UCP
1401  /*-----------------------------------------------------------------*/
1402  /* This is a tricky one because it can match more than one character.
1403  Find out how many characters to skip, and then set up a negative state
1404  to wait for them to pass before continuing. */
1405 
1406  case OP_EXTUNI:
1407  if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1408  {
1409  const uschar *nptr = ptr + clen;
1410  int ncount = 0;
1411  while (nptr < end_subject)
1412  {
1413  int nclen = 1;
1414  GETCHARLEN(c, nptr, nclen);
1415  if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1416  ncount++;
1417  nptr += nclen;
1418  }
1419  ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1420  }
1421  break;
1422 #endif
1423 
1424  /*-----------------------------------------------------------------*/
1425  /* This is a tricky one like EXTUNI because it too can match more than one
1426  character (when CR is followed by LF). In this case, set up a negative
1427  state to wait for one character to pass before continuing. */
1428 
1429  case OP_ANYNL:
1430  if (clen > 0) switch(c)
1431  {
1432  case 0x000a:
1433  case 0x000b:
1434  case 0x000c:
1435  case 0x0085:
1436  case 0x2028:
1437  case 0x2029:
1438  ADD_NEW(state_offset + 1, 0);
1439  break;
1440  case 0x000d:
1441  if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1442  {
1443  ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1444  }
1445  else
1446  {
1447  ADD_NEW(state_offset + 1, 0);
1448  }
1449  break;
1450  }
1451  break;
1452 
1453  /*-----------------------------------------------------------------*/
1454  /* Match a negated single character. This is only used for one-byte
1455  characters, that is, we know that d < 256. The character we are
1456  checking (c) can be multibyte. */
1457 
1458  case OP_NOT:
1459  if (clen > 0)
1460  {
1461  unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1462  if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1463  }
1464  break;
1465 
1466  /*-----------------------------------------------------------------*/
1467  case OP_PLUS:
1468  case OP_MINPLUS:
1469  case OP_POSPLUS:
1470  case OP_NOTPLUS:
1471  case OP_NOTMINPLUS:
1472  case OP_NOTPOSPLUS:
1473  count = current_state->count; /* Already matched */
1474  if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1475  if (clen > 0)
1476  {
1477  unsigned int otherd = NOTACHAR;
1478  if ((ims & PCRE_CASELESS) != 0)
1479  {
1480 #ifdef SUPPORT_UTF8
1481  if (utf8 && d >= 128)
1482  {
1483 #ifdef SUPPORT_UCP
1484  otherd = _pcre_ucp_othercase(d);
1485 #endif /* SUPPORT_UCP */
1486  }
1487  else
1488 #endif /* SUPPORT_UTF8 */
1489  otherd = fcc[d];
1490  }
1491  if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1492  {
1493  if (count > 0 &&
1494  (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1495  {
1496  active_count--; /* Remove non-match possibility */
1497  next_active_state--;
1498  }
1499  count++;
1500  ADD_NEW(state_offset, count);
1501  }
1502  }
1503  break;
1504 
1505  /*-----------------------------------------------------------------*/
1506  case OP_QUERY:
1507  case OP_MINQUERY:
1508  case OP_POSQUERY:
1509  case OP_NOTQUERY:
1510  case OP_NOTMINQUERY:
1511  case OP_NOTPOSQUERY:
1512  ADD_ACTIVE(state_offset + dlen + 1, 0);
1513  if (clen > 0)
1514  {
1515  unsigned int otherd = NOTACHAR;
1516  if ((ims & PCRE_CASELESS) != 0)
1517  {
1518 #ifdef SUPPORT_UTF8
1519  if (utf8 && d >= 128)
1520  {
1521 #ifdef SUPPORT_UCP
1522  otherd = _pcre_ucp_othercase(d);
1523 #endif /* SUPPORT_UCP */
1524  }
1525  else
1526 #endif /* SUPPORT_UTF8 */
1527  otherd = fcc[d];
1528  }
1529  if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1530  {
1531  if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1532  {
1533  active_count--; /* Remove non-match possibility */
1534  next_active_state--;
1535  }
1536  ADD_NEW(state_offset + dlen + 1, 0);
1537  }
1538  }
1539  break;
1540 
1541  /*-----------------------------------------------------------------*/
1542  case OP_STAR:
1543  case OP_MINSTAR:
1544  case OP_POSSTAR:
1545  case OP_NOTSTAR:
1546  case OP_NOTMINSTAR:
1547  case OP_NOTPOSSTAR:
1548  ADD_ACTIVE(state_offset + dlen + 1, 0);
1549  if (clen > 0)
1550  {
1551  unsigned int otherd = NOTACHAR;
1552  if ((ims & PCRE_CASELESS) != 0)
1553  {
1554 #ifdef SUPPORT_UTF8
1555  if (utf8 && d >= 128)
1556  {
1557 #ifdef SUPPORT_UCP
1558  otherd = _pcre_ucp_othercase(d);
1559 #endif /* SUPPORT_UCP */
1560  }
1561  else
1562 #endif /* SUPPORT_UTF8 */
1563  otherd = fcc[d];
1564  }
1565  if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1566  {
1567  if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1568  {
1569  active_count--; /* Remove non-match possibility */
1570  next_active_state--;
1571  }
1572  ADD_NEW(state_offset, 0);
1573  }
1574  }
1575  break;
1576 
1577  /*-----------------------------------------------------------------*/
1578  case OP_EXACT:
1579  case OP_NOTEXACT:
1580  count = current_state->count; /* Number already matched */
1581  if (clen > 0)
1582  {
1583  unsigned int otherd = NOTACHAR;
1584  if ((ims & PCRE_CASELESS) != 0)
1585  {
1586 #ifdef SUPPORT_UTF8
1587  if (utf8 && d >= 128)
1588  {
1589 #ifdef SUPPORT_UCP
1590  otherd = _pcre_ucp_othercase(d);
1591 #endif /* SUPPORT_UCP */
1592  }
1593  else
1594 #endif /* SUPPORT_UTF8 */
1595  otherd = fcc[d];
1596  }
1597  if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1598  {
1599  if (++count >= GET2(code, 1))
1600  { ADD_NEW(state_offset + dlen + 3, 0); }
1601  else
1602  { ADD_NEW(state_offset, count); }
1603  }
1604  }
1605  break;
1606 
1607  /*-----------------------------------------------------------------*/
1608  case OP_UPTO:
1609  case OP_MINUPTO:
1610  case OP_POSUPTO:
1611  case OP_NOTUPTO:
1612  case OP_NOTMINUPTO:
1613  case OP_NOTPOSUPTO:
1614  ADD_ACTIVE(state_offset + dlen + 3, 0);
1615  count = current_state->count; /* Number already matched */
1616  if (clen > 0)
1617  {
1618  unsigned int otherd = NOTACHAR;
1619  if ((ims & PCRE_CASELESS) != 0)
1620  {
1621 #ifdef SUPPORT_UTF8
1622  if (utf8 && d >= 128)
1623  {
1624 #ifdef SUPPORT_UCP
1625  otherd = _pcre_ucp_othercase(d);
1626 #endif /* SUPPORT_UCP */
1627  }
1628  else
1629 #endif /* SUPPORT_UTF8 */
1630  otherd = fcc[d];
1631  }
1632  if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1633  {
1634  if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
1635  {
1636  active_count--; /* Remove non-match possibility */
1637  next_active_state--;
1638  }
1639  if (++count >= GET2(code, 1))
1640  { ADD_NEW(state_offset + dlen + 3, 0); }
1641  else
1642  { ADD_NEW(state_offset, count); }
1643  }
1644  }
1645  break;
1646 
1647 
1648 /* ========================================================================== */
1649  /* These are the class-handling opcodes */
1650 
1651  case OP_CLASS:
1652  case OP_NCLASS:
1653  case OP_XCLASS:
1654  {
1655  BOOL isinclass = FALSE;
1656  int next_state_offset;
1657  const uschar *ecode;
1658 
1659  /* For a simple class, there is always just a 32-byte table, and we
1660  can set isinclass from it. */
1661 
1662  if (codevalue != OP_XCLASS)
1663  {
1664  ecode = code + 33;
1665  if (clen > 0)
1666  {
1667  isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1668  ((code[1 + c/8] & (1 << (c&7))) != 0);
1669  }
1670  }
1671 
1672  /* An extended class may have a table or a list of single characters,
1673  ranges, or both, and it may be positive or negative. There's a
1674  function that sorts all this out. */
1675 
1676  else
1677  {
1678  ecode = code + GET(code, 1);
1679  if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
1680  }
1681 
1682  /* At this point, isinclass is set for all kinds of class, and ecode
1683  points to the byte after the end of the class. If there is a
1684  quantifier, this is where it will be. */
1685 
1686  next_state_offset = ecode - start_code;
1687 
1688  switch (*ecode)
1689  {
1690  case OP_CRSTAR:
1691  case OP_CRMINSTAR:
1692  ADD_ACTIVE(next_state_offset + 1, 0);
1693  if (isinclass) { ADD_NEW(state_offset, 0); }
1694  break;
1695 
1696  case OP_CRPLUS:
1697  case OP_CRMINPLUS:
1698  count = current_state->count; /* Already matched */
1699  if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1700  if (isinclass) { count++; ADD_NEW(state_offset, count); }
1701  break;
1702 
1703  case OP_CRQUERY:
1704  case OP_CRMINQUERY:
1705  ADD_ACTIVE(next_state_offset + 1, 0);
1706  if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
1707  break;
1708 
1709  case OP_CRRANGE:
1710  case OP_CRMINRANGE:
1711  count = current_state->count; /* Already matched */
1712  if (count >= GET2(ecode, 1))
1713  { ADD_ACTIVE(next_state_offset + 5, 0); }
1714  if (isinclass)
1715  {
1716  int max = GET2(ecode, 3);
1717  if (++count >= max && max != 0) /* Max 0 => no limit */
1718  { ADD_NEW(next_state_offset + 5, 0); }
1719  else
1720  { ADD_NEW(state_offset, count); }
1721  }
1722  break;
1723 
1724  default:
1725  if (isinclass) { ADD_NEW(next_state_offset, 0); }
1726  break;
1727  }
1728  }
1729  break;
1730 
1731 /* ========================================================================== */
1732  /* These are the opcodes for fancy brackets of various kinds. We have
1733  to use recursion in order to handle them. */
1734 
1735  case OP_ASSERT:
1736  case OP_ASSERT_NOT:
1737  case OP_ASSERTBACK:
1738  case OP_ASSERTBACK_NOT:
1739  {
1740  int rc;
1741  int local_offsets[2];
1742  int local_workspace[1000];
1743  const uschar *endasscode = code + GET(code, 1);
1744 
1745  while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1746 
1747  rc = internal_dfa_exec(
1748  md, /* static match data */
1749  code, /* this subexpression's code */
1750  ptr, /* where we currently are */
1751  ptr - start_subject, /* start offset */
1752  local_offsets, /* offset vector */
1753  sizeof(local_offsets)/sizeof(int), /* size of same */
1754  local_workspace, /* workspace vector */
1755  sizeof(local_workspace)/sizeof(int), /* size of same */
1756  ims, /* the current ims flags */
1757  rlevel, /* function recursion level */
1758  recursing); /* pass on regex recursion */
1759 
1760  if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
1761  { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1762  }
1763  break;
1764 
1765  /*-----------------------------------------------------------------*/
1766  case OP_COND:
1767  case OP_SCOND:
1768  {
1769  int local_offsets[1000];
1770  int local_workspace[1000];
1771  int condcode = code[LINK_SIZE+1];
1772 
1773  /* Back reference conditions are not supported */
1774 
1775  if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
1776 
1777  /* The DEFINE condition is always false */
1778 
1779  if (condcode == OP_DEF)
1780  {
1781  ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
1782  }
1783 
1784  /* The only supported version of OP_RREF is for the value RREF_ANY,
1785  which means "test if in any recursion". We can't test for specifically
1786  recursed groups. */
1787 
1788  else if (condcode == OP_RREF)
1789  {
1790  int value = GET2(code, LINK_SIZE+2);
1791  if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
1792  if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
1793  else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1794  }
1795 
1796  /* Otherwise, the condition is an assertion */
1797 
1798  else
1799  {
1800  int rc;
1801  const uschar *asscode = code + LINK_SIZE + 1;
1802  const uschar *endasscode = asscode + GET(asscode, 1);
1803 
1804  while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1805 
1806  rc = internal_dfa_exec(
1807  md, /* fixed match data */
1808  asscode, /* this subexpression's code */
1809  ptr, /* where we currently are */
1810  ptr - start_subject, /* start offset */
1811  local_offsets, /* offset vector */
1812  sizeof(local_offsets)/sizeof(int), /* size of same */
1813  local_workspace, /* workspace vector */
1814  sizeof(local_workspace)/sizeof(int), /* size of same */
1815  ims, /* the current ims flags */
1816  rlevel, /* function recursion level */
1817  recursing); /* pass on regex recursion */
1818 
1819  if ((rc >= 0) ==
1820  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
1821  { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1822  else
1823  { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1824  }
1825  }
1826  break;
1827 
1828  /*-----------------------------------------------------------------*/
1829  case OP_RECURSE:
1830  {
1831  int local_offsets[1000];
1832  int local_workspace[1000];
1833  int rc;
1834 
1835  DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
1836  recursing + 1));
1837 
1838  rc = internal_dfa_exec(
1839  md, /* fixed match data */
1840  start_code + GET(code, 1), /* this subexpression's code */
1841  ptr, /* where we currently are */
1842  ptr - start_subject, /* start offset */
1843  local_offsets, /* offset vector */
1844  sizeof(local_offsets)/sizeof(int), /* size of same */
1845  local_workspace, /* workspace vector */
1846  sizeof(local_workspace)/sizeof(int), /* size of same */
1847  ims, /* the current ims flags */
1848  rlevel, /* function recursion level */
1849  recursing + 1); /* regex recurse level */
1850 
1851  DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
1852  recursing + 1, rc));
1853 
1854  /* Ran out of internal offsets */
1855 
1856  if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
1857 
1858  /* For each successful matched substring, set up the next state with a
1859  count of characters to skip before trying it. Note that the count is in
1860  characters, not bytes. */
1861 
1862  if (rc > 0)
1863  {
1864  for (rc = rc*2 - 2; rc >= 0; rc -= 2)
1865  {
1866  const uschar *p = start_subject + local_offsets[rc];
1867  const uschar *pp = start_subject + local_offsets[rc+1];
1868  int charcount = local_offsets[rc+1] - local_offsets[rc];
1869  while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1870  if (charcount > 0)
1871  {
1872  ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
1873  }
1874  else
1875  {
1876  ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
1877  }
1878  }
1879  }
1880  else if (rc != PCRE_ERROR_NOMATCH) return rc;
1881  }
1882  break;
1883 
1884  /*-----------------------------------------------------------------*/
1885  case OP_ONCE:
1886  {
1887  int local_offsets[2];
1888  int local_workspace[1000];
1889 
1890  int rc = internal_dfa_exec(
1891  md, /* fixed match data */
1892  code, /* this subexpression's code */
1893  ptr, /* where we currently are */
1894  ptr - start_subject, /* start offset */
1895  local_offsets, /* offset vector */
1896  sizeof(local_offsets)/sizeof(int), /* size of same */
1897  local_workspace, /* workspace vector */
1898  sizeof(local_workspace)/sizeof(int), /* size of same */
1899  ims, /* the current ims flags */
1900  rlevel, /* function recursion level */
1901  recursing); /* pass on regex recursion */
1902 
1903  if (rc >= 0)
1904  {
1905  const uschar *end_subpattern = code;
1906  int charcount = local_offsets[1] - local_offsets[0];
1907  int next_state_offset, repeat_state_offset;
1908 
1909  do { end_subpattern += GET(end_subpattern, 1); }
1910  while (*end_subpattern == OP_ALT);
1911  next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
1912 
1913  /* If the end of this subpattern is KETRMAX or KETRMIN, we must
1914  arrange for the repeat state also to be added to the relevant list.
1915  Calculate the offset, or set -1 for no repeat. */
1916 
1917  repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
1918  *end_subpattern == OP_KETRMIN)?
1919  end_subpattern - start_code - GET(end_subpattern, 1) : -1;
1920 
1921  /* If we have matched an empty string, add the next state at the
1922  current character pointer. This is important so that the duplicate
1923  checking kicks in, which is what breaks infinite loops that match an
1924  empty string. */
1925 
1926  if (charcount == 0)
1927  {
1928  ADD_ACTIVE(next_state_offset, 0);
1929  }
1930 
1931  /* Optimization: if there are no more active states, and there
1932  are no new states yet set up, then skip over the subject string
1933  right here, to save looping. Otherwise, set up the new state to swing
1934  into action when the end of the substring is reached. */
1935 
1936  else if (i + 1 >= active_count && new_count == 0)
1937  {
1938  ptr += charcount;
1939  clen = 0;
1940  ADD_NEW(next_state_offset, 0);
1941 
1942  /* If we are adding a repeat state at the new character position,
1943  we must fudge things so that it is the only current state.
1944  Otherwise, it might be a duplicate of one we processed before, and
1945  that would cause it to be skipped. */
1946 
1947  if (repeat_state_offset >= 0)
1948  {
1949  next_active_state = active_states;
1950  active_count = 0;
1951  i = -1;
1952  ADD_ACTIVE(repeat_state_offset, 0);
1953  }
1954  }
1955  else
1956  {
1957  const uschar *p = start_subject + local_offsets[0];
1958  const uschar *pp = start_subject + local_offsets[1];
1959  while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1960  ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
1961  if (repeat_state_offset >= 0)
1962  { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
1963  }
1964 
1965  }
1966  else if (rc != PCRE_ERROR_NOMATCH) return rc;
1967  }
1968  break;
1969 
1970 
1971 /* ========================================================================== */
1972  /* Handle callouts */
1973 
1974  case OP_CALLOUT:
1975  if (pcre_callout != NULL)
1976  {
1977  int rrc;
1978  pcre_callout_block cb;
1979  cb.version = 1; /* Version 1 of the callout block */
1980  cb.callout_number = code[1];
1981  cb.offset_vector = offsets;
1982  cb.subject = (PCRE_SPTR)start_subject;
1983  cb.subject_length = end_subject - start_subject;
1984  cb.start_match = current_subject - start_subject;
1985  cb.current_position = ptr - start_subject;
1986  cb.pattern_position = GET(code, 2);
1987  cb.next_item_length = GET(code, 2 + LINK_SIZE);
1988  cb.capture_top = 1;
1989  cb.capture_last = -1;
1990  cb.callout_data = md->callout_data;
1991  if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
1992  if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
1993  }
1994  break;
1995 
1996 
1997 /* ========================================================================== */
1998  default: /* Unsupported opcode */
1999  return PCRE_ERROR_DFA_UITEM;
2000  }
2001 
2002  NEXT_ACTIVE_STATE: continue;
2003 
2004  } /* End of loop scanning active states */
2005 
2006  /* We have finished the processing at the current subject character. If no
2007  new states have been set for the next character, we have found all the
2008  matches that we are going to find. If we are at the top level and partial
2009  matching has been requested, check for appropriate conditions. */
2010 
2011  if (new_count <= 0)
2012  {
2013  if (match_count < 0 && /* No matches found */
2014  rlevel == 1 && /* Top level match function */
2015  (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2016  ptr >= end_subject && /* Reached end of subject */
2017  ptr > current_subject) /* Matched non-empty string */
2018  {
2019  if (offsetcount >= 2)
2020  {
2021  offsets[0] = current_subject - start_subject;
2022  offsets[1] = end_subject - start_subject;
2023  }
2024  match_count = PCRE_ERROR_PARTIAL;
2025  }
2026 
2027  DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2028  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2029  rlevel*2-2, SP));
2030  break; /* In effect, "return", but see the comment below */
2031  }
2032 
2033  /* One or more states are active for the next character. */
2034 
2035  ptr += clen; /* Advance to next subject character */
2036  } /* Loop to move along the subject string */
2037 
2038 /* Control gets here from "break" a few lines above. We do it this way because
2039 if we use "return" above, we have compiler trouble. Some compilers warn if
2040 there's nothing here because they think the function doesn't return a value. On
2041 the other hand, if we put a dummy statement here, some more clever compilers
2042 complain that it can't be reached. Sigh. */
2043 
2044 return match_count;
2045 }
2046 
2047 
2048 
2049 
2050 /*************************************************
2051 * Execute a Regular Expression - DFA engine *
2052 *************************************************/
2053 
2054 /* This external function applies a compiled re to a subject string using a DFA
2055 engine. This function calls the internal function multiple times if the pattern
2056 is not anchored.
2057 
2058 Arguments:
2059  argument_re points to the compiled expression
2060  extra_data points to extra data (study data, callout data, tables) or is NULL
2061  subject points to the subject string
2062  length length of subject string (may contain binary zeros)
2063  start_offset where to start in the subject string
2064  options option bits
2065  offsets vector of match offsets
2066  offsetcount size of same
2067  workspace workspace vector
2068  wscount size of same
2069 
2070 Returns: > 0 => number of match offset pairs placed in offsets
2071  = 0 => offsets overflowed; longest matches are present
2072  -1 => failed to match
2073  < -1 => some kind of unexpected problem
2074 */
2075 
2076 PCRE_DATA_SCOPE int
2077 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2078  const char *subject, int length, int start_offset, int options, int *offsets,
2079  int offsetcount, int *workspace, int wscount)
2080 {
2081 real_pcre *re = (real_pcre *)argument_re;
2082 dfa_match_data match_block;
2083 dfa_match_data *md = &match_block;
2084 BOOL utf8, anchored, startline, firstline;
2085 const uschar *current_subject, *end_subject, *lcc;
2086 
2087 pcre_study_data internal_study;
2088 const pcre_study_data *study = NULL;
2089 real_pcre internal_re;
2090 
2091 const uschar *req_byte_ptr;
2092 const uschar *start_bits = NULL;
2093 BOOL first_byte_caseless = FALSE;
2094 BOOL req_byte_caseless = FALSE;
2095 int first_byte = -1;
2096 int req_byte = -1;
2097 int req_byte2 = -1;
2098 int newline;
2099 
2100 /* Plausibility checks */
2101 
2102 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2103 if (re == NULL || subject == NULL || workspace == NULL ||
2104  (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2105 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2106 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2107 
2108 /* We need to find the pointer to any study data before we test for byte
2109 flipping, so we scan the extra_data block first. This may set two fields in the
2110 match block, so we must initialize them beforehand. However, the other fields
2111 in the match block must not be set until after the byte flipping. */
2112 
2113 md->tables = re->tables;
2114 md->callout_data = NULL;
2115 
2116 if (extra_data != NULL)
2117  {
2118  unsigned int flags = extra_data->flags;
2119  if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2120  study = (const pcre_study_data *)extra_data->study_data;
2121  if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2122  if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2123  return PCRE_ERROR_DFA_UMLIMIT;
2124  if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2125  md->callout_data = extra_data->callout_data;
2126  if ((flags & PCRE_EXTRA_TABLES) != 0)
2127  md->tables = extra_data->tables;
2128  }
2129 
2130 /* Check that the first field in the block is the magic number. If it is not,
2131 test for a regex that was compiled on a host of opposite endianness. If this is
2132 the case, flipped values are put in internal_re and internal_study if there was
2133 study data too. */
2134 
2135 if (re->magic_number != MAGIC_NUMBER)
2136  {
2137  re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2138  if (re == NULL) return PCRE_ERROR_BADMAGIC;
2139  if (study != NULL) study = &internal_study;
2140  }
2141 
2142 /* Set some local values */
2143 
2144 current_subject = (const unsigned char *)subject + start_offset;
2145 end_subject = (const unsigned char *)subject + length;
2146 req_byte_ptr = current_subject - 1;
2147 
2148 #ifdef SUPPORT_UTF8
2149 utf8 = (re->options & PCRE_UTF8) != 0;
2150 #else
2151 utf8 = FALSE;
2152 #endif
2153 
2154 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2155  (re->options & PCRE_ANCHORED) != 0;
2156 
2157 /* The remaining fixed data for passing around. */
2158 
2159 md->start_code = (const uschar *)argument_re +
2161 md->start_subject = (const unsigned char *)subject;
2162 md->end_subject = end_subject;
2163 md->moptions = options;
2164 md->poptions = re->options;
2165 
2166 /* Handle different types of newline. The two bits give four cases. If nothing
2167 is set at run time, whatever was used at compile time applies. */
2168 
2169 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
2171  {
2172  case 0: newline = NEWLINE; break; /* Compile-time default */
2173  case PCRE_NEWLINE_CR: newline = '\r'; break;
2174  case PCRE_NEWLINE_LF: newline = '\n'; break;
2175  case PCRE_NEWLINE_CR+
2176  PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2177  case PCRE_NEWLINE_ANY: newline = -1; break;
2178  default: return PCRE_ERROR_BADNEWLINE;
2179  }
2180 
2181 if (newline < 0)
2182  {
2183  md->nltype = NLTYPE_ANY;
2184  }
2185 else
2186  {
2187  md->nltype = NLTYPE_FIXED;
2188  if (newline > 255)
2189  {
2190  md->nllen = 2;
2191  md->nl[0] = (newline >> 8) & 255;
2192  md->nl[1] = newline & 255;
2193  }
2194  else
2195  {
2196  md->nllen = 1;
2197  md->nl[0] = newline;
2198  }
2199  }
2200 
2201 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2202 back the character offset. */
2203 
2204 #ifdef SUPPORT_UTF8
2205 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2206  {
2207  if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2208  return PCRE_ERROR_BADUTF8;
2209  if (start_offset > 0 && start_offset < length)
2210  {
2211  int tb = ((uschar *)subject)[start_offset];
2212  if (tb > 127)
2213  {
2214  tb &= 0xc0;
2215  if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2216  }
2217  }
2218  }
2219 #endif
2220 
2221 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2222 is a feature that makes it possible to save compiled regex and re-use them
2223 in other programs later. */
2224 
2225 if (md->tables == NULL) md->tables = _pcre_default_tables;
2226 
2227 /* The lower casing table and the "must be at the start of a line" flag are
2228 used in a loop when finding where to start. */
2229 
2230 lcc = md->tables + lcc_offset;
2231 startline = (re->options & PCRE_STARTLINE) != 0;
2232 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2233 
2234 /* Set up the first character to match, if available. The first_byte value is
2235 never set for an anchored regular expression, but the anchoring may be forced
2236 at run time, so we have to test for anchoring. The first char may be unset for
2237 an unanchored pattern, of course. If there's no first char and the pattern was
2238 studied, there may be a bitmap of possible first characters. */
2239 
2240 if (!anchored)
2241  {
2242  if ((re->options & PCRE_FIRSTSET) != 0)
2243  {
2244  first_byte = re->first_byte & 255;
2245  if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2246  first_byte = lcc[first_byte];
2247  }
2248  else
2249  {
2250  if (startline && study != NULL &&
2251  (study->options & PCRE_STUDY_MAPPED) != 0)
2252  start_bits = study->start_bits;
2253  }
2254  }
2255 
2256 /* For anchored or unanchored matches, there may be a "last known required
2257 character" set. */
2258 
2259 if ((re->options & PCRE_REQCHSET) != 0)
2260  {
2261  req_byte = re->req_byte & 255;
2262  req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2263  req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2264  }
2265 
2266 /* Call the main matching function, looping for a non-anchored regex after a
2267 failed match. Unless restarting, optimize by moving to the first match
2268 character if possible, when not anchored. Then unless wanting a partial match,
2269 check for a required later character. */
2270 
2271 for (;;)
2272  {
2273  int rc;
2274 
2275  if ((options & PCRE_DFA_RESTART) == 0)
2276  {
2277  const uschar *save_end_subject = end_subject;
2278 
2279  /* Advance to a unique first char if possible. If firstline is TRUE, the
2280  start of the match is constrained to the first line of a multiline string.
2281  Implement this by temporarily adjusting end_subject so that we stop
2282  scanning at a newline. If the match fails at the newline, later code breaks
2283  this loop. */
2284 
2285  if (firstline)
2286  {
2287  const uschar *t = current_subject;
2288  while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2289  end_subject = t;
2290  }
2291 
2292  if (first_byte >= 0)
2293  {
2294  if (first_byte_caseless)
2295  while (current_subject < end_subject &&
2296  lcc[*current_subject] != first_byte)
2297  current_subject++;
2298  else
2299  while (current_subject < end_subject && *current_subject != first_byte)
2300  current_subject++;
2301  }
2302 
2303  /* Or to just after a linebreak for a multiline match if possible */
2304 
2305  else if (startline)
2306  {
2307  if (current_subject > md->start_subject + start_offset)
2308  {
2309  while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2310  current_subject++;
2311  }
2312  }
2313 
2314  /* Or to a non-unique first char after study */
2315 
2316  else if (start_bits != NULL)
2317  {
2318  while (current_subject < end_subject)
2319  {
2320  register unsigned int c = *current_subject;
2321  if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2322  else break;
2323  }
2324  }
2325 
2326  /* Restore fudged end_subject */
2327 
2328  end_subject = save_end_subject;
2329  }
2330 
2331  /* If req_byte is set, we know that that character must appear in the subject
2332  for the match to succeed. If the first character is set, req_byte must be
2333  later in the subject; otherwise the test starts at the match point. This
2334  optimization can save a huge amount of work in patterns with nested unlimited
2335  repeats that aren't going to match. Writing separate code for cased/caseless
2336  versions makes it go faster, as does using an autoincrement and backing off
2337  on a match.
2338 
2339  HOWEVER: when the subject string is very, very long, searching to its end can
2340  take a long time, and give bad performance on quite ordinary patterns. This
2341  showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2342  don't do this when the string is sufficiently long.
2343 
2344  ALSO: this processing is disabled when partial matching is requested.
2345  */
2346 
2347  if (req_byte >= 0 &&
2348  end_subject - current_subject < REQ_BYTE_MAX &&
2349  (options & PCRE_PARTIAL) == 0)
2350  {
2351  register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2352 
2353  /* We don't need to repeat the search if we haven't yet reached the
2354  place we found it at last time. */
2355 
2356  if (p > req_byte_ptr)
2357  {
2358  if (req_byte_caseless)
2359  {
2360  while (p < end_subject)
2361  {
2362  register int pp = *p++;
2363  if (pp == req_byte || pp == req_byte2) { p--; break; }
2364  }
2365  }
2366  else
2367  {
2368  while (p < end_subject)
2369  {
2370  if (*p++ == req_byte) { p--; break; }
2371  }
2372  }
2373 
2374  /* If we can't find the required character, break the matching loop,
2375  which will cause a return or PCRE_ERROR_NOMATCH. */
2376 
2377  if (p >= end_subject) break;
2378 
2379  /* If we have found the required character, save the point where we
2380  found it, so that we don't search again next time round the loop if
2381  the start hasn't passed this character yet. */
2382 
2383  req_byte_ptr = p;
2384  }
2385  }
2386 
2387  /* OK, now we can do the business */
2388 
2389  rc = internal_dfa_exec(
2390  md, /* fixed match data */
2391  md->start_code, /* this subexpression's code */
2392  current_subject, /* where we currently are */
2393  start_offset, /* start offset in subject */
2394  offsets, /* offset vector */
2395  offsetcount, /* size of same */
2396  workspace, /* workspace vector */
2397  wscount, /* size of same */
2398  re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2399  0, /* function recurse level */
2400  0); /* regex recurse level */
2401 
2402  /* Anything other than "no match" means we are done, always; otherwise, carry
2403  on only if not anchored. */
2404 
2405  if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2406 
2407  /* Advance to the next subject character unless we are at the end of a line
2408  and firstline is set. */
2409 
2410  if (firstline && IS_NEWLINE(current_subject)) break;
2411  current_subject++;
2412  if (utf8)
2413  {
2414  while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2415  current_subject++;
2416  }
2417  if (current_subject > end_subject) break;
2418 
2419  /* If we have just passed a CR and the newline option is CRLF or ANY, and we
2420  are now at a LF, advance the match position by one more character. */
2421 
2422  if (current_subject[-1] == '\r' &&
2423  (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
2424  current_subject < end_subject &&
2425  *current_subject == '\n')
2426  current_subject++;
2427 
2428  } /* "Bumpalong" loop */
2429 
2430 return PCRE_ERROR_NOMATCH;
2431 }
2432 
2433 /* End of pcre_dfa_exec.c */
#define TRUE
Definition: bool.h:74
#define FALSE
Definition: bool.h:70
#define PCRE_ERROR_BADUTF8_OFFSET
Definition: pcre.h:136
#define PCRE_NO_UTF8_CHECK
Definition: pcre.h:111
#define PCRE_FIRSTLINE
Definition: pcre.h:116
#define PCRE_ERROR_PARTIAL
Definition: pcre.h:137
#define PCRE_NOTBOL
Definition: pcre.h:105
#define PCRE_UTF8
Definition: pcre.h:109
#define PCRE_EXTRA_CALLOUT_DATA
Definition: pcre.h:183
#define PCRE_ERROR_DFA_UMLIMIT
Definition: pcre.h:143
#define PCRE_NEWLINE_ANY
Definition: pcre.h:121
#define PCRE_ERROR_BADNEWLINE
Definition: pcre.h:148
#define PCRE_EXTRA_STUDY_DATA
Definition: pcre.h:181
#define PCRE_ERROR_BADOPTION
Definition: pcre.h:127
#define PCRE_CASELESS
Definition: pcre.h:98
#define PCRE_EXTRA_MATCH_LIMIT
Definition: pcre.h:182
#define PCRE_EXTRA_TABLES
Definition: pcre.h:184
#define PCRE_PARTIAL
Definition: pcre.h:113
#define PCRE_DFA_SHORTEST
Definition: pcre.h:114
#define PCRE_DATA_SCOPE
Definition: pcre.h:81
#define PCRE_NOTEOL
Definition: pcre.h:106
#define PCRE_ERROR_BADMAGIC
Definition: pcre.h:128
#define PCRE_MULTILINE
Definition: pcre.h:99
#define PCRE_ERROR_DFA_WSSIZE
Definition: pcre.h:144
#define PCRE_ERROR_DFA_UITEM
Definition: pcre.h:141
#define PCRE_NOTEMPTY
Definition: pcre.h:108
#define PCRE_NEWLINE_LF
Definition: pcre.h:119
#define PCRE_DOTALL
Definition: pcre.h:100
#define PCRE_ANCHORED
Definition: pcre.h:102
#define PCRE_ERROR_NULL
Definition: pcre.h:126
#define PCRE_ERROR_BADCOUNT
Definition: pcre.h:140
#define PCRE_DFA_RESTART
Definition: pcre.h:115
#define PCRE_NEWLINE_CR
Definition: pcre.h:118
#define PCRE_SPTR
Definition: pcre.h:197
#define PCRE_ERROR_DFA_UCOND
Definition: pcre.h:142
#define PCRE_ERROR_DFA_RECURSE
Definition: pcre.h:145
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION
Definition: pcre.h:185
#define PCRE_ERROR_NOMATCH
Definition: pcre.h:125
#define PCRE_ERROR_BADUTF8
Definition: pcre.h:135
int(* pcre_callout)(pcre_callout_block *)
Definition: pcre_globals.c:79
int pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, const char *subject, int length, int start_offset, int options, int *offsets, int offsetcount, int *workspace, int wscount)
struct stateblock stateblock
static uschar toptable1[]
#define ADD_ACTIVE(x, y)
#define OP_PROP_EXTRA
Definition: pcre_dfa_exec.c:68
static int internal_dfa_exec(dfa_match_data *md, const uschar *this_start_code, const uschar *current_subject, int start_offset, int *offsets, int offsetcount, int *workspace, int wscount, int ims, int rlevel, int recursing)
#define INTS_PER_STATEBLOCK
#define OP_ANYNL_EXTRA
Definition: pcre_dfa_exec.c:70
#define ADD_NEW(x, y)
#define OP_EXTUNI_EXTRA
Definition: pcre_dfa_exec.c:69
static uschar coptable[]
Definition: pcre_dfa_exec.c:79
static uschar toptable2[]
#define SP
Definition: pcre_dfa_exec.c:56
#define ADD_NEW_DATA(x, y, z)
int BOOL
#define PT_PC
#define REQ_BYTE_MAX
#define PT_ANY
#define PT_GC
#define ctype_digit
#define PCRE_NEWLINE_BITS
#define ctype_word
#define NLTYPE_FIXED
#define PCRE_STUDY_MAPPED
#define MAGIC_NUMBER
#define ctype_space
#define NOTACHAR
#define PCRE_REQCHSET
unsigned int _pcre_ucp_othercase(const unsigned int)
int _pcre_ucp_findprop(const unsigned int, int *, int *)
#define RREF_ANY
#define DPRINTF(p)
Definition: pcre_internal.h:66
#define PT_SC
real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *, const pcre_study_data *, pcre_study_data *)
#define IS_NEWLINE(p)
#define PCRE_FIRSTSET
#define WAS_NEWLINE(p)
@ OP_ANYNL
@ OP_CHAR
@ OP_CRMINQUERY
@ OP_SBRA
@ OP_ONCE
@ OP_NOTPROP
@ OP_NOTPLUS
@ OP_TYPEMINPLUS
@ OP_TYPEQUERY
@ OP_SCOND
@ OP_ASSERTBACK
@ OP_CLASS
@ OP_TYPEPLUS
@ OP_NOT_WORDCHAR
@ OP_CRMINPLUS
@ OP_CRRANGE
@ OP_DOLL
@ OP_ASSERT_NOT
@ OP_NOT
@ OP_ASSERT
@ OP_TYPEPOSSTAR
@ OP_TYPEMINUPTO
@ OP_TYPEPOSPLUS
@ OP_POSSTAR
@ OP_NOTUPTO
@ OP_TYPESTAR
@ OP_BRAMINZERO
@ OP_CRQUERY
@ OP_ASSERTBACK_NOT
@ OP_OPT
@ OP_RREF
@ OP_DIGIT
@ OP_EXACT
@ OP_TYPEEXACT
@ OP_PLUS
@ OP_WHITESPACE
@ OP_CRMINSTAR
@ OP_NOT_WORD_BOUNDARY
@ OP_KET
@ OP_NOT_DIGIT
@ OP_CALLOUT
@ OP_CRMINRANGE
@ OP_RECURSE
@ OP_BRA
@ OP_CHARNC
@ OP_CREF
@ OP_POSUPTO
@ OP_NOTPOSUPTO
@ OP_REVERSE
@ OP_NCLASS
@ OP_KETRMIN
@ OP_COND
@ OP_MINPLUS
@ OP_TYPEPOSUPTO
@ OP_WORDCHAR
@ OP_MINQUERY
@ OP_EODN
@ OP_ALT
@ OP_UPTO
@ OP_QUERY
@ OP_PROP
@ OP_NOTPOSSTAR
@ OP_KETRMAX
@ OP_NOTMINPLUS
@ OP_BRAZERO
@ OP_ANYBYTE
@ OP_TYPEMINQUERY
@ OP_NOT_WHITESPACE
@ OP_NOTMINSTAR
@ OP_NOTSTAR
@ OP_SCBRA
@ OP_MINUPTO
@ OP_CRSTAR
@ OP_POSQUERY
@ OP_MINSTAR
@ OP_STAR
@ OP_DEF
@ OP_TYPEMINSTAR
@ OP_NOTMINUPTO
@ OP_NOTMINQUERY
@ OP_CRPLUS
@ OP_TYPEPOSQUERY
@ OP_POSPLUS
@ OP_SOD
@ OP_NOTPOSQUERY
@ OP_TYPEUPTO
@ OP_SOM
@ OP_ANY
@ OP_XCLASS
@ OP_NOTQUERY
@ OP_CBRA
@ OP_EXTUNI
@ OP_EOD
@ OP_NOTEXACT
@ OP_WORD_BOUNDARY
@ OP_NOTPOSPLUS
@ OP_CIRC
#define PT_LAMP
BOOL _pcre_xclass(int, const uschar *)
Definition: pcre_xclass.c:64
unsigned char uschar
#define fcc_offset
#define NLTYPE_ANY
#define memmove(a, b, c)
#define GETCHARLEN(c, eptr, len)
#define lcc_offset
const uschar _pcre_default_tables[]
#define REQ_CASELESS
#define ctypes_offset
#define PUBLIC_DFA_EXEC_OPTIONS
#define BACKCHAR(eptr)
#define GETCHARTEST(c, eptr)
int _pcre_valid_utf8(const uschar *, int)
#define GET2(a, n)
#define PCRE_STARTLINE
static BOOL utf8
Definition: pcregrep.c:147
static char * newline
Definition: pcregrep.c:112
static int pchars(unsigned char *p, int length, FILE *f)
Definition: pcretest.c:375
static int offset
Definition: read.c:62
int code
Definition: signal.c:116
const uschar * start_code
const uschar * end_subject
const uschar * start_subject
const uschar * tables
int current_position
Definition: pcre.h:226
int * offset_vector
Definition: pcre.h:222
void * callout_data
Definition: pcre.h:229
int next_item_length
Definition: pcre.h:232
int capture_last
Definition: pcre.h:228
int subject_length
Definition: pcre.h:224
int pattern_position
Definition: pcre.h:231
const char * subject
Definition: pcre.h:223
int callout_number
Definition: pcre.h:221
void * study_data
Definition: pcre.h:206
unsigned long int flags
Definition: pcre.h:205
const unsigned char * tables
Definition: pcre.h:209
void * callout_data
Definition: pcre.h:208
pcre_uint32 options
uschar start_bits[32]
pcre_uint32 magic_number
pcre_uint32 options
pcre_uint16 name_entry_size
pcre_uint16 name_count
pcre_uint16 name_table_offset
pcre_uint16 first_byte
pcre_uint16 req_byte
const unsigned char * tables
@ ucp_Lu
Definition: ucp.h:37
@ ucp_Lt
Definition: ucp.h:36
@ ucp_Ll
Definition: ucp.h:33
@ ucp_M
Definition: ucp.h:18