"Fossies" - the Fresh Open Source Software Archive 
Member "ngrep-1_47/regex-0.12/regex.c" (7 Sep 2017, 162115 Bytes) of package /linux/misc/ngrep-1_47.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
For more information about "regex.c" see the
Fossies "Dox" file reference documentation.
1 /* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
5
6 Copyright (C) 1993 Free Software Foundation, Inc.
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
21
22 /*
23
24 This file has been modified to replace int re_max_failures with a
25 #define. This avoids compile problems when other various libc's
26 already have that symbol. It has also been modified to be 64-bit
27 clean. --jordan
28
29 */
30
31
32 /* AIX requires this to be the first thing in the file. */
33 #if defined (_AIX) && !defined (REGEX_MALLOC)
34 #pragma alloca
35 #endif
36
37 #define _GNU_SOURCE
38
39 /* We need this for `regex.h', and perhaps for the Emacs include files. */
40 #include <sys/types.h>
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 /* The `emacs' switch turns on certain matching commands
47 that make sense only in Emacs. */
48 #ifdef emacs
49
50 #include "lisp.h"
51 #include "buffer.h"
52 #include "syntax.h"
53
54 /* Emacs uses `NULL' as a predicate. */
55 #undef NULL
56
57 #else /* not emacs */
58
59 /* We used to test for `BSTRING' here, but only GCC and Emacs define
60 `BSTRING', as far as I know, and neither of them use this code. */
61 #if HAVE_STRING_H || STDC_HEADERS
62 #include <string.h>
63 #ifndef bcmp
64 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
65 #endif
66 #ifndef bcopy
67 #define bcopy(s, d, n) memcpy ((d), (s), (n))
68 #endif
69 #ifndef bzero
70 #define bzero(s, n) memset ((s), 0, (n))
71 #endif
72 #else
73 #include <strings.h>
74 #endif
75
76 #ifdef STDC_HEADERS
77 #include <stdlib.h>
78 #else
79 char *malloc ();
80 char *realloc ();
81 #endif
82
83
84 /* Define the syntax stuff for \<, \>, etc. */
85
86 /* This must be nonzero for the wordchar and notwordchar pattern
87 commands in re_match_2. */
88 #ifndef Sword
89 #define Sword 1
90 #endif
91
92 #ifdef SYNTAX_TABLE
93
94 extern char *re_syntax_table;
95
96 #else /* not SYNTAX_TABLE */
97
98 /* How many characters in the character set. */
99 #define CHAR_SET_SIZE 256
100
101 static char re_syntax_table[CHAR_SET_SIZE];
102
103 static void
104 init_syntax_once ()
105 {
106 register int c;
107 static int done = 0;
108
109 if (done)
110 return;
111
112 bzero (re_syntax_table, sizeof re_syntax_table);
113
114 for (c = 'a'; c <= 'z'; c++)
115 re_syntax_table[c] = Sword;
116
117 for (c = 'A'; c <= 'Z'; c++)
118 re_syntax_table[c] = Sword;
119
120 for (c = '0'; c <= '9'; c++)
121 re_syntax_table[c] = Sword;
122
123 re_syntax_table['_'] = Sword;
124
125 done = 1;
126 }
127
128 #endif /* not SYNTAX_TABLE */
129
130 #define SYNTAX(c) re_syntax_table[c]
131
132 #endif /* not emacs */
133
134 /* Get the interface, including the syntax bits. */
135 #include "regex.h"
136
137 /* isalpha etc. are used for the character classes. */
138 #include <ctype.h>
139
140 #ifndef isascii
141 #define isascii(c) 1
142 #endif
143
144 #ifdef isblank
145 #define ISBLANK(c) (isascii (c) && isblank (c))
146 #else
147 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
148 #endif
149 #ifdef isgraph
150 #define ISGRAPH(c) (isascii (c) && isgraph (c))
151 #else
152 #define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c))
153 #endif
154
155 #define ISPRINT(c) (isascii (c) && isprint (c))
156 #define ISDIGIT(c) (isascii (c) && isdigit (c))
157 #define ISALNUM(c) (isascii (c) && isalnum (c))
158 #define ISALPHA(c) (isascii (c) && isalpha (c))
159 #define ISCNTRL(c) (isascii (c) && iscntrl (c))
160 #define ISLOWER(c) (isascii (c) && islower (c))
161 #define ISPUNCT(c) (isascii (c) && ispunct (c))
162 #define ISSPACE(c) (isascii (c) && isspace (c))
163 #define ISUPPER(c) (isascii (c) && isupper (c))
164 #define ISXDIGIT(c) (isascii (c) && isxdigit (c))
165
166 #ifndef NULL
167 #define NULL 0
168 #endif
169
170 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
171 since ours (we hope) works properly with all combinations of
172 machines, compilers, `char' and `unsigned char' argument types.
173 (Per Bothner suggested the basic approach.) */
174 #undef SIGN_EXTEND_CHAR
175 #if __STDC__
176 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
177 #else /* not __STDC__ */
178 /* As in Harbison and Steele. */
179 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
180 #endif
181
182 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
183 use `alloca' instead of `malloc'. This is because using malloc in
184 re_search* or re_match* could cause memory leaks when C-g is used in
185 Emacs; also, malloc is slower and causes storage fragmentation. On
186 the other hand, malloc is more portable, and easier to debug.
187
188 Because we sometimes use alloca, some routines have to be macros,
189 not functions -- `alloca'-allocated space disappears at the end of the
190 function it is called in. */
191
192 #ifdef REGEX_MALLOC
193
194 #define REGEX_ALLOCATE malloc
195 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
196
197 #else /* not REGEX_MALLOC */
198
199 /* Emacs already defines alloca, sometimes. */
200 #ifndef alloca
201
202 /* Make alloca work the best possible way. */
203 #ifdef __GNUC__
204 #define alloca __builtin_alloca
205 #else /* not __GNUC__ */
206 #if HAVE_ALLOCA_H
207 #include <alloca.h>
208 #else /* not __GNUC__ or HAVE_ALLOCA_H */
209 #ifndef _AIX /* Already did AIX, up at the top. */
210 char *alloca ();
211 #endif /* not _AIX */
212 #endif /* not HAVE_ALLOCA_H */
213 #endif /* not __GNUC__ */
214
215 #endif /* not alloca */
216
217 #define REGEX_ALLOCATE alloca
218
219 /* Assumes a `char *destination' variable. */
220 #define REGEX_REALLOCATE(source, osize, nsize) \
221 (destination = (char *) alloca (nsize), \
222 bcopy (source, destination, osize), \
223 destination)
224
225 #endif /* not REGEX_MALLOC */
226
227
228 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
229 `string1' or just past its end. This works if PTR is NULL, which is
230 a good thing. */
231 #define FIRST_STRING_P(ptr) \
232 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
233
234 /* (Re)Allocate N items of type T using malloc, or fail. */
235 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
236 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
237 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
238
239 #define BYTEWIDTH 8 /* In bits. */
240
241 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
242
243 #define MAX(a, b) ((a) > (b) ? (a) : (b))
244 #define MIN(a, b) ((a) < (b) ? (a) : (b))
245
246 typedef char boolean;
247 #define false 0
248 #define true 1
249
250 /* These are the command codes that appear in compiled regular
251 expressions. Some opcodes are followed by argument bytes. A
252 command code can specify any interpretation whatsoever for its
253 arguments. Zero bytes may appear in the compiled regular expression.
254
255 The value of `exactn' is needed in search.c (search_buffer) in Emacs.
256 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
257 `exactn' we use here must also be 1. */
258
259 typedef enum
260 {
261 no_op = 0,
262
263 /* Followed by one byte giving n, then by n literal bytes. */
264 exactn = 1,
265
266 /* Matches any (more or less) character. */
267 anychar,
268
269 /* Matches any one char belonging to specified set. First
270 following byte is number of bitmap bytes. Then come bytes
271 for a bitmap saying which chars are in. Bits in each byte
272 are ordered low-bit-first. A character is in the set if its
273 bit is 1. A character too large to have a bit in the map is
274 automatically not in the set. */
275 charset,
276
277 /* Same parameters as charset, but match any character that is
278 not one of those specified. */
279 charset_not,
280
281 /* Start remembering the text that is matched, for storing in a
282 register. Followed by one byte with the register number, in
283 the range 0 to one less than the pattern buffer's re_nsub
284 field. Then followed by one byte with the number of groups
285 inner to this one. (This last has to be part of the
286 start_memory only because we need it in the on_failure_jump
287 of re_match_2.) */
288 start_memory,
289
290 /* Stop remembering the text that is matched and store it in a
291 memory register. Followed by one byte with the register
292 number, in the range 0 to one less than `re_nsub' in the
293 pattern buffer, and one byte with the number of inner groups,
294 just like `start_memory'. (We need the number of inner
295 groups here because we don't have any easy way of finding the
296 corresponding start_memory when we're at a stop_memory.) */
297 stop_memory,
298
299 /* Match a duplicate of something remembered. Followed by one
300 byte containing the register number. */
301 duplicate,
302
303 /* Fail unless at beginning of line. */
304 begline,
305
306 /* Fail unless at end of line. */
307 endline,
308
309 /* Succeeds if at beginning of buffer (if emacs) or at beginning
310 of string to be matched (if not). */
311 begbuf,
312
313 /* Analogously, for end of buffer/string. */
314 endbuf,
315
316 /* Followed by two byte relative address to which to jump. */
317 jump,
318
319 /* Same as jump, but marks the end of an alternative. */
320 jump_past_alt,
321
322 /* Followed by two-byte relative address of place to resume at
323 in case of failure. */
324 on_failure_jump,
325
326 /* Like on_failure_jump, but pushes a placeholder instead of the
327 current string position when executed. */
328 on_failure_keep_string_jump,
329
330 /* Throw away latest failure point and then jump to following
331 two-byte relative address. */
332 pop_failure_jump,
333
334 /* Change to pop_failure_jump if know won't have to backtrack to
335 match; otherwise change to jump. This is used to jump
336 back to the beginning of a repeat. If what follows this jump
337 clearly won't match what the repeat does, such that we can be
338 sure that there is no use backtracking out of repetitions
339 already matched, then we change it to a pop_failure_jump.
340 Followed by two-byte address. */
341 maybe_pop_jump,
342
343 /* Jump to following two-byte address, and push a dummy failure
344 point. This failure point will be thrown away if an attempt
345 is made to use it for a failure. A `+' construct makes this
346 before the first repeat. Also used as an intermediary kind
347 of jump when compiling an alternative. */
348 dummy_failure_jump,
349
350 /* Push a dummy failure point and continue. Used at the end of
351 alternatives. */
352 push_dummy_failure,
353
354 /* Followed by two-byte relative address and two-byte number n.
355 After matching N times, jump to the address upon failure. */
356 succeed_n,
357
358 /* Followed by two-byte relative address, and two-byte number n.
359 Jump to the address N times, then fail. */
360 jump_n,
361
362 /* Set the following two-byte relative address to the
363 subsequent two-byte number. The address *includes* the two
364 bytes of number. */
365 set_number_at,
366
367 wordchar, /* Matches any word-constituent character. */
368 notwordchar, /* Matches any char that is not a word-constituent. */
369
370 wordbeg, /* Succeeds if at word beginning. */
371 wordend, /* Succeeds if at word end. */
372
373 wordbound, /* Succeeds if at a word boundary. */
374 notwordbound /* Succeeds if not at a word boundary. */
375
376 #ifdef emacs
377 ,before_dot, /* Succeeds if before point. */
378 at_dot, /* Succeeds if at point. */
379 after_dot, /* Succeeds if after point. */
380
381 /* Matches any character whose syntax is specified. Followed by
382 a byte which contains a syntax code, e.g., Sword. */
383 syntaxspec,
384
385 /* Matches any character whose syntax is not that specified. */
386 notsyntaxspec
387 #endif /* emacs */
388 } re_opcode_t;
389
390 /* Common operations on the compiled pattern. */
391
392 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
393
394 #define STORE_NUMBER(destination, number) \
395 do { \
396 (destination)[0] = (number) & 0377; \
397 (destination)[1] = (number) >> 8; \
398 } while (0)
399
400 /* Same as STORE_NUMBER, except increment DESTINATION to
401 the byte after where the number is stored. Therefore, DESTINATION
402 must be an lvalue. */
403
404 #define STORE_NUMBER_AND_INCR(destination, number) \
405 do { \
406 STORE_NUMBER (destination, number); \
407 (destination) += 2; \
408 } while (0)
409
410 /* Put into DESTINATION a number stored in two contiguous bytes starting
411 at SOURCE. */
412
413 #define EXTRACT_NUMBER(destination, source) \
414 do { \
415 (destination) = *(source) & 0377; \
416 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
417 } while (0)
418
419 #ifdef DEBUG
420 static void
421 extract_number (dest, source)
422 int *dest;
423 unsigned char *source;
424 {
425 int temp = SIGN_EXTEND_CHAR (*(source + 1));
426 *dest = *source & 0377;
427 *dest += temp << 8;
428 }
429
430 #ifndef EXTRACT_MACROS /* To debug the macros. */
431 #undef EXTRACT_NUMBER
432 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
433 #endif /* not EXTRACT_MACROS */
434
435 #endif /* DEBUG */
436
437 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
438 SOURCE must be an lvalue. */
439
440 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
441 do { \
442 EXTRACT_NUMBER (destination, source); \
443 (source) += 2; \
444 } while (0)
445
446 #ifdef DEBUG
447 static void
448 extract_number_and_incr (destination, source)
449 int *destination;
450 unsigned char **source;
451 {
452 extract_number (destination, *source);
453 *source += 2;
454 }
455
456 #ifndef EXTRACT_MACROS
457 #undef EXTRACT_NUMBER_AND_INCR
458 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
459 extract_number_and_incr (&dest, &src)
460 #endif /* not EXTRACT_MACROS */
461
462 #endif /* DEBUG */
463
464 /* If DEBUG is defined, Regex prints many voluminous messages about what
465 it is doing (if the variable `debug' is nonzero). If linked with the
466 main program in `iregex.c', you can enter patterns and strings
467 interactively. And if linked with the main program in `main.c' and
468 the other test files, you can run the already-written tests. */
469
470 #ifdef DEBUG
471
472 /* We use standard I/O for debugging. */
473 #include <stdio.h>
474
475 /* It is useful to test things that ``must'' be true when debugging. */
476 #include <assert.h>
477
478 static int debug = 0;
479
480 #define DEBUG_STATEMENT(e) e
481 #define DEBUG_PRINT1(x) if (debug) printf (x)
482 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
483 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
484 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
485 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
486 if (debug) print_partial_compiled_pattern (s, e)
487 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
488 if (debug) print_double_string (w, s1, sz1, s2, sz2)
489
490
491 extern void printchar ();
492
493 /* Print the fastmap in human-readable form. */
494
495 void
496 print_fastmap (fastmap)
497 char *fastmap;
498 {
499 unsigned was_a_range = 0;
500 unsigned i = 0;
501
502 while (i < (1 << BYTEWIDTH))
503 {
504 if (fastmap[i++])
505 {
506 was_a_range = 0;
507 printchar (i - 1);
508 while (i < (1 << BYTEWIDTH) && fastmap[i])
509 {
510 was_a_range = 1;
511 i++;
512 }
513 if (was_a_range)
514 {
515 printf ("-");
516 printchar (i - 1);
517 }
518 }
519 }
520 putchar ('\n');
521 }
522
523
524 /* Print a compiled pattern string in human-readable form, starting at
525 the START pointer into it and ending just before the pointer END. */
526
527 void
528 print_partial_compiled_pattern (start, end)
529 unsigned char *start;
530 unsigned char *end;
531 {
532 int mcnt, mcnt2;
533 unsigned char *p = start;
534 unsigned char *pend = end;
535
536 if (start == NULL)
537 {
538 printf ("(null)\n");
539 return;
540 }
541
542 /* Loop over pattern commands. */
543 while (p < pend)
544 {
545 switch ((re_opcode_t) *p++)
546 {
547 case no_op:
548 printf ("/no_op");
549 break;
550
551 case exactn:
552 mcnt = *p++;
553 printf ("/exactn/%d", mcnt);
554 do
555 {
556 putchar ('/');
557 printchar (*p++);
558 }
559 while (--mcnt);
560 break;
561
562 case start_memory:
563 mcnt = *p++;
564 printf ("/start_memory/%d/%d", mcnt, *p++);
565 break;
566
567 case stop_memory:
568 mcnt = *p++;
569 printf ("/stop_memory/%d/%d", mcnt, *p++);
570 break;
571
572 case duplicate:
573 printf ("/duplicate/%d", *p++);
574 break;
575
576 case anychar:
577 printf ("/anychar");
578 break;
579
580 case charset:
581 case charset_not:
582 {
583 register int c;
584
585 printf ("/charset%s",
586 (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
587
588 assert (p + *p < pend);
589
590 for (c = 0; c < *p; c++)
591 {
592 unsigned bit;
593 unsigned char map_byte = p[1 + c];
594
595 putchar ('/');
596
597 for (bit = 0; bit < BYTEWIDTH; bit++)
598 if (map_byte & (1 << bit))
599 printchar (c * BYTEWIDTH + bit);
600 }
601 p += 1 + *p;
602 break;
603 }
604
605 case begline:
606 printf ("/begline");
607 break;
608
609 case endline:
610 printf ("/endline");
611 break;
612
613 case on_failure_jump:
614 extract_number_and_incr (&mcnt, &p);
615 printf ("/on_failure_jump/0/%d", mcnt);
616 break;
617
618 case on_failure_keep_string_jump:
619 extract_number_and_incr (&mcnt, &p);
620 printf ("/on_failure_keep_string_jump/0/%d", mcnt);
621 break;
622
623 case dummy_failure_jump:
624 extract_number_and_incr (&mcnt, &p);
625 printf ("/dummy_failure_jump/0/%d", mcnt);
626 break;
627
628 case push_dummy_failure:
629 printf ("/push_dummy_failure");
630 break;
631
632 case maybe_pop_jump:
633 extract_number_and_incr (&mcnt, &p);
634 printf ("/maybe_pop_jump/0/%d", mcnt);
635 break;
636
637 case pop_failure_jump:
638 extract_number_and_incr (&mcnt, &p);
639 printf ("/pop_failure_jump/0/%d", mcnt);
640 break;
641
642 case jump_past_alt:
643 extract_number_and_incr (&mcnt, &p);
644 printf ("/jump_past_alt/0/%d", mcnt);
645 break;
646
647 case jump:
648 extract_number_and_incr (&mcnt, &p);
649 printf ("/jump/0/%d", mcnt);
650 break;
651
652 case succeed_n:
653 extract_number_and_incr (&mcnt, &p);
654 extract_number_and_incr (&mcnt2, &p);
655 printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2);
656 break;
657
658 case jump_n:
659 extract_number_and_incr (&mcnt, &p);
660 extract_number_and_incr (&mcnt2, &p);
661 printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2);
662 break;
663
664 case set_number_at:
665 extract_number_and_incr (&mcnt, &p);
666 extract_number_and_incr (&mcnt2, &p);
667 printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2);
668 break;
669
670 case wordbound:
671 printf ("/wordbound");
672 break;
673
674 case notwordbound:
675 printf ("/notwordbound");
676 break;
677
678 case wordbeg:
679 printf ("/wordbeg");
680 break;
681
682 case wordend:
683 printf ("/wordend");
684
685 #ifdef emacs
686 case before_dot:
687 printf ("/before_dot");
688 break;
689
690 case at_dot:
691 printf ("/at_dot");
692 break;
693
694 case after_dot:
695 printf ("/after_dot");
696 break;
697
698 case syntaxspec:
699 printf ("/syntaxspec");
700 mcnt = *p++;
701 printf ("/%d", mcnt);
702 break;
703
704 case notsyntaxspec:
705 printf ("/notsyntaxspec");
706 mcnt = *p++;
707 printf ("/%d", mcnt);
708 break;
709 #endif /* emacs */
710
711 case wordchar:
712 printf ("/wordchar");
713 break;
714
715 case notwordchar:
716 printf ("/notwordchar");
717 break;
718
719 case begbuf:
720 printf ("/begbuf");
721 break;
722
723 case endbuf:
724 printf ("/endbuf");
725 break;
726
727 default:
728 printf ("?%d", *(p-1));
729 }
730 }
731 printf ("/\n");
732 }
733
734
735 void
736 print_compiled_pattern (bufp)
737 struct re_pattern_buffer *bufp;
738 {
739 unsigned char *buffer = bufp->buffer;
740
741 print_partial_compiled_pattern (buffer, buffer + bufp->used);
742 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
743
744 if (bufp->fastmap_accurate && bufp->fastmap)
745 {
746 printf ("fastmap: ");
747 print_fastmap (bufp->fastmap);
748 }
749
750 printf ("re_nsub: %d\t", bufp->re_nsub);
751 printf ("regs_alloc: %d\t", bufp->regs_allocated);
752 printf ("can_be_null: %d\t", bufp->can_be_null);
753 printf ("newline_anchor: %d\n", bufp->newline_anchor);
754 printf ("no_sub: %d\t", bufp->no_sub);
755 printf ("not_bol: %d\t", bufp->not_bol);
756 printf ("not_eol: %d\t", bufp->not_eol);
757 printf ("syntax: %d\n", bufp->syntax);
758 /* Perhaps we should print the translate table? */
759 }
760
761
762 void
763 print_double_string (where, string1, size1, string2, size2)
764 const char *where;
765 const char *string1;
766 const char *string2;
767 int size1;
768 int size2;
769 {
770 unsigned this_char;
771
772 if (where == NULL)
773 printf ("(null)");
774 else
775 {
776 if (FIRST_STRING_P (where))
777 {
778 for (this_char = where - string1; this_char < size1; this_char++)
779 printchar (string1[this_char]);
780
781 where = string2;
782 }
783
784 for (this_char = where - string2; this_char < size2; this_char++)
785 printchar (string2[this_char]);
786 }
787 }
788
789 #else /* not DEBUG */
790
791 #undef assert
792 #define assert(e)
793
794 #define DEBUG_STATEMENT(e)
795 #define DEBUG_PRINT1(x)
796 #define DEBUG_PRINT2(x1, x2)
797 #define DEBUG_PRINT3(x1, x2, x3)
798 #define DEBUG_PRINT4(x1, x2, x3, x4)
799 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
800 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
801
802 #endif /* not DEBUG */
803
804 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
805 also be assigned to arbitrarily: each pattern buffer stores its own
806 syntax, so it can be changed between regex compilations. */
807 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS;
808
809
810 /* Specify the precise syntax of regexps for compilation. This provides
811 for compatibility for various utilities which historically have
812 different, incompatible syntaxes.
813
814 The argument SYNTAX is a bit mask comprised of the various bits
815 defined in regex.h. We return the old syntax. */
816
817 reg_syntax_t
818 re_set_syntax (syntax)
819 reg_syntax_t syntax;
820 {
821 reg_syntax_t ret = re_syntax_options;
822
823 re_syntax_options = syntax;
824 return ret;
825 }
826
827 /* This table gives an error message for each of the error codes listed
828 in regex.h. Obviously the order here has to be same as there. */
829
830 static const char *re_error_msg[] =
831 { NULL, /* REG_NOERROR */
832 "No match", /* REG_NOMATCH */
833 "Invalid regular expression", /* REG_BADPAT */
834 "Invalid collation character", /* REG_ECOLLATE */
835 "Invalid character class name", /* REG_ECTYPE */
836 "Trailing backslash", /* REG_EESCAPE */
837 "Invalid back reference", /* REG_ESUBREG */
838 "Unmatched [ or [^", /* REG_EBRACK */
839 "Unmatched ( or \\(", /* REG_EPAREN */
840 "Unmatched \\{", /* REG_EBRACE */
841 "Invalid content of \\{\\}", /* REG_BADBR */
842 "Invalid range end", /* REG_ERANGE */
843 "Memory exhausted", /* REG_ESPACE */
844 "Invalid preceding regular expression", /* REG_BADRPT */
845 "Premature end of regular expression", /* REG_EEND */
846 "Regular expression too big", /* REG_ESIZE */
847 "Unmatched ) or \\)", /* REG_ERPAREN */
848 };
849
850 /* Subroutine declarations and macros for regex_compile. */
851
852 static void store_op1 (), store_op2 ();
853 static void insert_op1 (), insert_op2 ();
854 static boolean at_begline_loc_p (), at_endline_loc_p ();
855 static boolean group_in_compile_stack ();
856 static reg_errcode_t compile_range ();
857
858 /* Fetch the next character in the uncompiled pattern---translating it
859 if necessary. Also cast from a signed character in the constant
860 string passed to us by the user to an unsigned char that we can use
861 as an array index (in, e.g., `translate'). */
862 #define PATFETCH(c) \
863 do {if (p == pend) return REG_EEND; \
864 c = (unsigned char) *p++; \
865 if (translate) c = translate[c]; \
866 } while (0)
867
868 /* Fetch the next character in the uncompiled pattern, with no
869 translation. */
870 #define PATFETCH_RAW(c) \
871 do {if (p == pend) return REG_EEND; \
872 c = (unsigned char) *p++; \
873 } while (0)
874
875 /* Go backwards one character in the pattern. */
876 #define PATUNFETCH p--
877
878
879 /* If `translate' is non-null, return translate[D], else just D. We
880 cast the subscript to translate because some data is declared as
881 `char *', to avoid warnings when a string constant is passed. But
882 when we use a character as a subscript we must make it unsigned. */
883 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
884
885
886 /* Macros for outputting the compiled pattern into `buffer'. */
887
888 /* If the buffer isn't allocated when it comes in, use this. */
889 #define INIT_BUF_SIZE 32
890
891 /* Make sure we have at least N more bytes of space in buffer. */
892 #define GET_BUFFER_SPACE(n) \
893 while (b - bufp->buffer + (n) > bufp->allocated) \
894 EXTEND_BUFFER ()
895
896 /* Make sure we have one more byte of buffer space and then add C to it. */
897 #define BUF_PUSH(c) \
898 do { \
899 GET_BUFFER_SPACE (1); \
900 *b++ = (unsigned char) (c); \
901 } while (0)
902
903
904 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
905 #define BUF_PUSH_2(c1, c2) \
906 do { \
907 GET_BUFFER_SPACE (2); \
908 *b++ = (unsigned char) (c1); \
909 *b++ = (unsigned char) (c2); \
910 } while (0)
911
912
913 /* As with BUF_PUSH_2, except for three bytes. */
914 #define BUF_PUSH_3(c1, c2, c3) \
915 do { \
916 GET_BUFFER_SPACE (3); \
917 *b++ = (unsigned char) (c1); \
918 *b++ = (unsigned char) (c2); \
919 *b++ = (unsigned char) (c3); \
920 } while (0)
921
922
923 /* Store a jump with opcode OP at LOC to location TO. We store a
924 relative address offset by the three bytes the jump itself occupies. */
925 #define STORE_JUMP(op, loc, to) \
926 store_op1 (op, loc, (to) - (loc) - 3)
927
928 /* Likewise, for a two-argument jump. */
929 #define STORE_JUMP2(op, loc, to, arg) \
930 store_op2 (op, loc, (to) - (loc) - 3, arg)
931
932 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
933 #define INSERT_JUMP(op, loc, to) \
934 insert_op1 (op, loc, (to) - (loc) - 3, b)
935
936 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
937 #define INSERT_JUMP2(op, loc, to, arg) \
938 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
939
940
941 /* This is not an arbitrary limit: the arguments which represent offsets
942 into the pattern are two bytes long. So if 2^16 bytes turns out to
943 be too small, many things would have to change. */
944 #define MAX_BUF_SIZE (1L << 16)
945
946
947 /* Extend the buffer by twice its current size via realloc and
948 reset the pointers that pointed into the old block to point to the
949 correct places in the new one. If extending the buffer results in it
950 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
951 #define EXTEND_BUFFER() \
952 do { \
953 unsigned char *old_buffer = bufp->buffer; \
954 if (bufp->allocated == MAX_BUF_SIZE) \
955 return REG_ESIZE; \
956 bufp->allocated <<= 1; \
957 if (bufp->allocated > MAX_BUF_SIZE) \
958 bufp->allocated = MAX_BUF_SIZE; \
959 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
960 if (bufp->buffer == NULL) \
961 return REG_ESPACE; \
962 /* If the buffer moved, move all the pointers into it. */ \
963 if (old_buffer != bufp->buffer) \
964 { \
965 b = (b - old_buffer) + bufp->buffer; \
966 begalt = (begalt - old_buffer) + bufp->buffer; \
967 if (fixup_alt_jump) \
968 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
969 if (laststart) \
970 laststart = (laststart - old_buffer) + bufp->buffer; \
971 if (pending_exact) \
972 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
973 } \
974 } while (0)
975
976
977 /* Since we have one byte reserved for the register number argument to
978 {start,stop}_memory, the maximum number of groups we can report
979 things about is what fits in that byte. */
980 #define MAX_REGNUM 255
981
982 /* But patterns can have more than `MAX_REGNUM' registers. We just
983 ignore the excess. */
984 typedef unsigned regnum_t;
985
986
987 /* Macros for the compile stack. */
988
989 /* Since offsets can go either forwards or backwards, this type needs to
990 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
991 typedef int pattern_offset_t;
992
993 typedef struct
994 {
995 pattern_offset_t begalt_offset;
996 pattern_offset_t fixup_alt_jump;
997 pattern_offset_t inner_group_offset;
998 pattern_offset_t laststart_offset;
999 regnum_t regnum;
1000 } compile_stack_elt_t;
1001
1002
1003 typedef struct
1004 {
1005 compile_stack_elt_t *stack;
1006 unsigned size;
1007 unsigned avail; /* Offset of next open position. */
1008 } compile_stack_type;
1009
1010
1011 #define INIT_COMPILE_STACK_SIZE 32
1012
1013 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1014 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1015
1016 /* The next available element. */
1017 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1018
1019
1020 /* Set the bit for character C in a list. */
1021 #define SET_LIST_BIT(c) \
1022 (b[((unsigned char) (c)) / BYTEWIDTH] \
1023 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1024
1025
1026 /* Get the next unsigned number in the uncompiled pattern. */
1027 #define GET_UNSIGNED_NUMBER(num) \
1028 { if (p != pend) \
1029 { \
1030 PATFETCH (c); \
1031 while (ISDIGIT (c)) \
1032 { \
1033 if (num < 0) \
1034 num = 0; \
1035 num = num * 10 + c - '0'; \
1036 if (p == pend) \
1037 break; \
1038 PATFETCH (c); \
1039 } \
1040 } \
1041 }
1042
1043 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1044
1045 #define IS_CHAR_CLASS(string) \
1046 (STREQ (string, "alpha") || STREQ (string, "upper") \
1047 || STREQ (string, "lower") || STREQ (string, "digit") \
1048 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1049 || STREQ (string, "space") || STREQ (string, "print") \
1050 || STREQ (string, "punct") || STREQ (string, "graph") \
1051 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1052
1053 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1054 Returns one of error codes defined in `regex.h', or zero for success.
1055
1056 Assumes the `allocated' (and perhaps `buffer') and `translate'
1057 fields are set in BUFP on entry.
1058
1059 If it succeeds, results are put in BUFP (if it returns an error, the
1060 contents of BUFP are undefined):
1061 `buffer' is the compiled pattern;
1062 `syntax' is set to SYNTAX;
1063 `used' is set to the length of the compiled pattern;
1064 `fastmap_accurate' is zero;
1065 `re_nsub' is the number of subexpressions in PATTERN;
1066 `not_bol' and `not_eol' are zero;
1067
1068 The `fastmap' and `newline_anchor' fields are neither
1069 examined nor set. */
1070
1071 static reg_errcode_t
1072 regex_compile (pattern, size, syntax, bufp)
1073 const char *pattern;
1074 int size;
1075 reg_syntax_t syntax;
1076 struct re_pattern_buffer *bufp;
1077 {
1078 /* We fetch characters from PATTERN here. Even though PATTERN is
1079 `char *' (i.e., signed), we declare these variables as unsigned, so
1080 they can be reliably used as array indices. */
1081 register unsigned char c, c1;
1082
1083 /* A random tempory spot in PATTERN. */
1084 const char *p1;
1085
1086 /* Points to the end of the buffer, where we should append. */
1087 register unsigned char *b;
1088
1089 /* Keeps track of unclosed groups. */
1090 compile_stack_type compile_stack;
1091
1092 /* Points to the current (ending) position in the pattern. */
1093 const char *p = pattern;
1094 const char *pend = pattern + size;
1095
1096 /* How to translate the characters in the pattern. */
1097 char *translate = bufp->translate;
1098
1099 /* Address of the count-byte of the most recently inserted `exactn'
1100 command. This makes it possible to tell if a new exact-match
1101 character can be added to that command or if the character requires
1102 a new `exactn' command. */
1103 unsigned char *pending_exact = 0;
1104
1105 /* Address of start of the most recently finished expression.
1106 This tells, e.g., postfix * where to find the start of its
1107 operand. Reset at the beginning of groups and alternatives. */
1108 unsigned char *laststart = 0;
1109
1110 /* Address of beginning of regexp, or inside of last group. */
1111 unsigned char *begalt;
1112
1113 /* Place in the uncompiled pattern (i.e., the {) to
1114 which to go back if the interval is invalid. */
1115 const char *beg_interval;
1116
1117 /* Address of the place where a forward jump should go to the end of
1118 the containing expression. Each alternative of an `or' -- except the
1119 last -- ends with a forward jump of this sort. */
1120 unsigned char *fixup_alt_jump = 0;
1121
1122 /* Counts open-groups as they are encountered. Remembered for the
1123 matching close-group on the compile stack, so the same register
1124 number is put in the stop_memory as the start_memory. */
1125 regnum_t regnum = 0;
1126
1127 #ifdef DEBUG
1128 DEBUG_PRINT1 ("\nCompiling pattern: ");
1129 if (debug)
1130 {
1131 unsigned debug_count;
1132
1133 for (debug_count = 0; debug_count < size; debug_count++)
1134 printchar (pattern[debug_count]);
1135 putchar ('\n');
1136 }
1137 #endif /* DEBUG */
1138
1139 /* Initialize the compile stack. */
1140 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1141 if (compile_stack.stack == NULL)
1142 return REG_ESPACE;
1143
1144 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1145 compile_stack.avail = 0;
1146
1147 /* Initialize the pattern buffer. */
1148 bufp->syntax = syntax;
1149 bufp->fastmap_accurate = 0;
1150 bufp->not_bol = bufp->not_eol = 0;
1151
1152 /* Set `used' to zero, so that if we return an error, the pattern
1153 printer (for debugging) will think there's no pattern. We reset it
1154 at the end. */
1155 bufp->used = 0;
1156
1157 /* Always count groups, whether or not bufp->no_sub is set. */
1158 bufp->re_nsub = 0;
1159
1160 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1161 /* Initialize the syntax table. */
1162 init_syntax_once ();
1163 #endif
1164
1165 if (bufp->allocated == 0)
1166 {
1167 if (bufp->buffer)
1168 { /* If zero allocated, but buffer is non-null, try to realloc
1169 enough space. This loses if buffer's address is bogus, but
1170 that is the user's responsibility. */
1171 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1172 }
1173 else
1174 { /* Caller did not allocate a buffer. Do it for them. */
1175 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1176 }
1177 if (!bufp->buffer) return REG_ESPACE;
1178
1179 bufp->allocated = INIT_BUF_SIZE;
1180 }
1181
1182 begalt = b = bufp->buffer;
1183
1184 /* Loop through the uncompiled pattern until we're at the end. */
1185 while (p != pend)
1186 {
1187 PATFETCH (c);
1188
1189 switch (c)
1190 {
1191 case '^':
1192 {
1193 if ( /* If at start of pattern, it's an operator. */
1194 p == pattern + 1
1195 /* If context independent, it's an operator. */
1196 || syntax & RE_CONTEXT_INDEP_ANCHORS
1197 /* Otherwise, depends on what's come before. */
1198 || at_begline_loc_p (pattern, p, syntax))
1199 BUF_PUSH (begline);
1200 else
1201 goto normal_char;
1202 }
1203 break;
1204
1205
1206 case '$':
1207 {
1208 if ( /* If at end of pattern, it's an operator. */
1209 p == pend
1210 /* If context independent, it's an operator. */
1211 || syntax & RE_CONTEXT_INDEP_ANCHORS
1212 /* Otherwise, depends on what's next. */
1213 || at_endline_loc_p (p, pend, syntax))
1214 BUF_PUSH (endline);
1215 else
1216 goto normal_char;
1217 }
1218 break;
1219
1220
1221 case '+':
1222 case '?':
1223 if ((syntax & RE_BK_PLUS_QM)
1224 || (syntax & RE_LIMITED_OPS))
1225 goto normal_char;
1226 handle_plus:
1227 case '*':
1228 /* If there is no previous pattern... */
1229 if (!laststart)
1230 {
1231 if (syntax & RE_CONTEXT_INVALID_OPS)
1232 return REG_BADRPT;
1233 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1234 goto normal_char;
1235 }
1236
1237 {
1238 /* Are we optimizing this jump? */
1239 boolean keep_string_p = false;
1240
1241 /* 1 means zero (many) matches is allowed. */
1242 char zero_times_ok = 0, many_times_ok = 0;
1243
1244 /* If there is a sequence of repetition chars, collapse it
1245 down to just one (the right one). We can't combine
1246 interval operators with these because of, e.g., `a{2}*',
1247 which should only match an even number of `a's. */
1248
1249 for (;;)
1250 {
1251 zero_times_ok |= c != '+';
1252 many_times_ok |= c != '?';
1253
1254 if (p == pend)
1255 break;
1256
1257 PATFETCH (c);
1258
1259 if (c == '*'
1260 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
1261 ;
1262
1263 else if (syntax & RE_BK_PLUS_QM && c == '\\')
1264 {
1265 if (p == pend) return REG_EESCAPE;
1266
1267 PATFETCH (c1);
1268 if (!(c1 == '+' || c1 == '?'))
1269 {
1270 PATUNFETCH;
1271 PATUNFETCH;
1272 break;
1273 }
1274
1275 c = c1;
1276 }
1277 else
1278 {
1279 PATUNFETCH;
1280 break;
1281 }
1282
1283 /* If we get here, we found another repeat character. */
1284 }
1285
1286 /* Star, etc. applied to an empty pattern is equivalent
1287 to an empty pattern. */
1288 if (!laststart)
1289 break;
1290
1291 /* Now we know whether or not zero matches is allowed
1292 and also whether or not two or more matches is allowed. */
1293 if (many_times_ok)
1294 { /* More than one repetition is allowed, so put in at the
1295 end a backward relative jump from `b' to before the next
1296 jump we're going to put in below (which jumps from
1297 laststart to after this jump).
1298
1299 But if we are at the `*' in the exact sequence `.*\n',
1300 insert an unconditional jump backwards to the .,
1301 instead of the beginning of the loop. This way we only
1302 push a failure point once, instead of every time
1303 through the loop. */
1304 assert (p - 1 > pattern);
1305
1306 /* Allocate the space for the jump. */
1307 GET_BUFFER_SPACE (3);
1308
1309 /* We know we are not at the first character of the pattern,
1310 because laststart was nonzero. And we've already
1311 incremented `p', by the way, to be the character after
1312 the `*'. Do we have to do something analogous here
1313 for null bytes, because of RE_DOT_NOT_NULL? */
1314 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
1315 && zero_times_ok
1316 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
1317 && !(syntax & RE_DOT_NEWLINE))
1318 { /* We have .*\n. */
1319 STORE_JUMP (jump, b, laststart);
1320 keep_string_p = true;
1321 }
1322 else
1323 /* Anything else. */
1324 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
1325
1326 /* We've added more stuff to the buffer. */
1327 b += 3;
1328 }
1329
1330 /* On failure, jump from laststart to b + 3, which will be the
1331 end of the buffer after this jump is inserted. */
1332 GET_BUFFER_SPACE (3);
1333 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
1334 : on_failure_jump,
1335 laststart, b + 3);
1336 pending_exact = 0;
1337 b += 3;
1338
1339 if (!zero_times_ok)
1340 {
1341 /* At least one repetition is required, so insert a
1342 `dummy_failure_jump' before the initial
1343 `on_failure_jump' instruction of the loop. This
1344 effects a skip over that instruction the first time
1345 we hit that loop. */
1346 GET_BUFFER_SPACE (3);
1347 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
1348 b += 3;
1349 }
1350 }
1351 break;
1352
1353
1354 case '.':
1355 laststart = b;
1356 BUF_PUSH (anychar);
1357 break;
1358
1359
1360 case '[':
1361 {
1362 boolean had_char_class = false;
1363
1364 if (p == pend) return REG_EBRACK;
1365
1366 /* Ensure that we have enough space to push a charset: the
1367 opcode, the length count, and the bitset; 34 bytes in all. */
1368 GET_BUFFER_SPACE (34);
1369
1370 laststart = b;
1371
1372 /* We test `*p == '^' twice, instead of using an if
1373 statement, so we only need one BUF_PUSH. */
1374 BUF_PUSH (*p == '^' ? charset_not : charset);
1375 if (*p == '^')
1376 p++;
1377
1378 /* Remember the first position in the bracket expression. */
1379 p1 = p;
1380
1381 /* Push the number of bytes in the bitmap. */
1382 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
1383
1384 /* Clear the whole map. */
1385 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
1386
1387 /* charset_not matches newline according to a syntax bit. */
1388 if ((re_opcode_t) b[-2] == charset_not
1389 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
1390 SET_LIST_BIT ('\n');
1391
1392 /* Read in characters and ranges, setting map bits. */
1393 for (;;)
1394 {
1395 if (p == pend) return REG_EBRACK;
1396
1397 PATFETCH (c);
1398
1399 /* \ might escape characters inside [...] and [^...]. */
1400 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
1401 {
1402 if (p == pend) return REG_EESCAPE;
1403
1404 PATFETCH (c1);
1405 SET_LIST_BIT (c1);
1406 continue;
1407 }
1408
1409 /* Could be the end of the bracket expression. If it's
1410 not (i.e., when the bracket expression is `[]' so
1411 far), the ']' character bit gets set way below. */
1412 if (c == ']' && p != p1 + 1)
1413 break;
1414
1415 /* Look ahead to see if it's a range when the last thing
1416 was a character class. */
1417 if (had_char_class && c == '-' && *p != ']')
1418 return REG_ERANGE;
1419
1420 /* Look ahead to see if it's a range when the last thing
1421 was a character: if this is a hyphen not at the
1422 beginning or the end of a list, then it's the range
1423 operator. */
1424 if (c == '-'
1425 && !(p - 2 >= pattern && p[-2] == '[')
1426 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
1427 && *p != ']')
1428 {
1429 reg_errcode_t ret
1430 = compile_range (&p, pend, translate, syntax, b);
1431 if (ret != REG_NOERROR) return ret;
1432 }
1433
1434 else if (p[0] == '-' && p[1] != ']')
1435 { /* This handles ranges made up of characters only. */
1436 reg_errcode_t ret;
1437
1438 /* Move past the `-'. */
1439 PATFETCH (c1);
1440
1441 ret = compile_range (&p, pend, translate, syntax, b);
1442 if (ret != REG_NOERROR) return ret;
1443 }
1444
1445 /* See if we're at the beginning of a possible character
1446 class. */
1447
1448 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
1449 { /* Leave room for the null. */
1450 char str[CHAR_CLASS_MAX_LENGTH + 1];
1451
1452 PATFETCH (c);
1453 c1 = 0;
1454
1455 /* If pattern is `[[:'. */
1456 if (p == pend) return REG_EBRACK;
1457
1458 for (;;)
1459 {
1460 PATFETCH (c);
1461 if (c == ':' || c == ']' || p == pend
1462 || c1 == CHAR_CLASS_MAX_LENGTH)
1463 break;
1464 str[c1++] = c;
1465 }
1466 str[c1] = '\0';
1467
1468 /* If isn't a word bracketed by `[:' and:`]':
1469 undo the ending character, the letters, and leave
1470 the leading `:' and `[' (but set bits for them). */
1471 if (c == ':' && *p == ']')
1472 {
1473 int ch;
1474 boolean is_alnum = STREQ (str, "alnum");
1475 boolean is_alpha = STREQ (str, "alpha");
1476 boolean is_blank = STREQ (str, "blank");
1477 boolean is_cntrl = STREQ (str, "cntrl");
1478 boolean is_digit = STREQ (str, "digit");
1479 boolean is_graph = STREQ (str, "graph");
1480 boolean is_lower = STREQ (str, "lower");
1481 boolean is_print = STREQ (str, "print");
1482 boolean is_punct = STREQ (str, "punct");
1483 boolean is_space = STREQ (str, "space");
1484 boolean is_upper = STREQ (str, "upper");
1485 boolean is_xdigit = STREQ (str, "xdigit");
1486
1487 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE;
1488
1489 /* Throw away the ] at the end of the character
1490 class. */
1491 PATFETCH (c);
1492
1493 if (p == pend) return REG_EBRACK;
1494
1495 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
1496 {
1497 if ( (is_alnum && ISALNUM (ch))
1498 || (is_alpha && ISALPHA (ch))
1499 || (is_blank && ISBLANK (ch))
1500 || (is_cntrl && ISCNTRL (ch))
1501 || (is_digit && ISDIGIT (ch))
1502 || (is_graph && ISGRAPH (ch))
1503 || (is_lower && ISLOWER (ch))
1504 || (is_print && ISPRINT (ch))
1505 || (is_punct && ISPUNCT (ch))
1506 || (is_space && ISSPACE (ch))
1507 || (is_upper && ISUPPER (ch))
1508 || (is_xdigit && ISXDIGIT (ch)))
1509 SET_LIST_BIT (ch);
1510 }
1511 had_char_class = true;
1512 }
1513 else
1514 {
1515 c1++;
1516 while (c1--)
1517 PATUNFETCH;
1518 SET_LIST_BIT ('[');
1519 SET_LIST_BIT (':');
1520 had_char_class = false;
1521 }
1522 }
1523 else
1524 {
1525 had_char_class = false;
1526 SET_LIST_BIT (c);
1527 }
1528 }
1529
1530 /* Discard any (non)matching list bytes that are all 0 at the
1531 end of the map. Decrease the map-length byte too. */
1532 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
1533 b[-1]--;
1534 b += b[-1];
1535 }
1536 break;
1537
1538
1539 case '(':
1540 if (syntax & RE_NO_BK_PARENS)
1541 goto handle_open;
1542 else
1543 goto normal_char;
1544
1545
1546 case ')':
1547 if (syntax & RE_NO_BK_PARENS)
1548 goto handle_close;
1549 else
1550 goto normal_char;
1551
1552
1553 case '\n':
1554 if (syntax & RE_NEWLINE_ALT)
1555 goto handle_alt;
1556 else
1557 goto normal_char;
1558
1559
1560 case '|':
1561 if (syntax & RE_NO_BK_VBAR)
1562 goto handle_alt;
1563 else
1564 goto normal_char;
1565
1566
1567 case '{':
1568 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
1569 goto handle_interval;
1570 else
1571 goto normal_char;
1572
1573
1574 case '\\':
1575 if (p == pend) return REG_EESCAPE;
1576
1577 /* Do not translate the character after the \, so that we can
1578 distinguish, e.g., \B from \b, even if we normally would
1579 translate, e.g., B to b. */
1580 PATFETCH_RAW (c);
1581
1582 switch (c)
1583 {
1584 case '(':
1585 if (syntax & RE_NO_BK_PARENS)
1586 goto normal_backslash;
1587
1588 handle_open:
1589 bufp->re_nsub++;
1590 regnum++;
1591
1592 if (COMPILE_STACK_FULL)
1593 {
1594 RETALLOC (compile_stack.stack, compile_stack.size << 1,
1595 compile_stack_elt_t);
1596 if (compile_stack.stack == NULL) return REG_ESPACE;
1597
1598 compile_stack.size <<= 1;
1599 }
1600
1601 /* These are the values to restore when we hit end of this
1602 group. They are all relative offsets, so that if the
1603 whole pattern moves because of realloc, they will still
1604 be valid. */
1605 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
1606 COMPILE_STACK_TOP.fixup_alt_jump
1607 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
1608 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
1609 COMPILE_STACK_TOP.regnum = regnum;
1610
1611 /* We will eventually replace the 0 with the number of
1612 groups inner to this one. But do not push a
1613 start_memory for groups beyond the last one we can
1614 represent in the compiled pattern. */
1615 if (regnum <= MAX_REGNUM)
1616 {
1617 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
1618 BUF_PUSH_3 (start_memory, regnum, 0);
1619 }
1620
1621 compile_stack.avail++;
1622
1623 fixup_alt_jump = 0;
1624 laststart = 0;
1625 begalt = b;
1626 /* If we've reached MAX_REGNUM groups, then this open
1627 won't actually generate any code, so we'll have to
1628 clear pending_exact explicitly. */
1629 pending_exact = 0;
1630 break;
1631
1632
1633 case ')':
1634 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
1635
1636 if (COMPILE_STACK_EMPTY)
1637 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1638 goto normal_backslash;
1639 else
1640 return REG_ERPAREN;
1641
1642 handle_close:
1643 if (fixup_alt_jump)
1644 { /* Push a dummy failure point at the end of the
1645 alternative for a possible future
1646 `pop_failure_jump' to pop. See comments at
1647 `push_dummy_failure' in `re_match_2'. */
1648 BUF_PUSH (push_dummy_failure);
1649
1650 /* We allocated space for this jump when we assigned
1651 to `fixup_alt_jump', in the `handle_alt' case below. */
1652 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
1653 }
1654
1655 /* See similar code for backslashed left paren above. */
1656 if (COMPILE_STACK_EMPTY)
1657 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1658 goto normal_char;
1659 else
1660 return REG_ERPAREN;
1661
1662 /* Since we just checked for an empty stack above, this
1663 ``can't happen''. */
1664 assert (compile_stack.avail != 0);
1665 {
1666 /* We don't just want to restore into `regnum', because
1667 later groups should continue to be numbered higher,
1668 as in `(ab)c(de)' -- the second group is #2. */
1669 regnum_t this_group_regnum;
1670
1671 compile_stack.avail--;
1672 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
1673 fixup_alt_jump
1674 = COMPILE_STACK_TOP.fixup_alt_jump
1675 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
1676 : 0;
1677 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
1678 this_group_regnum = COMPILE_STACK_TOP.regnum;
1679 /* If we've reached MAX_REGNUM groups, then this open
1680 won't actually generate any code, so we'll have to
1681 clear pending_exact explicitly. */
1682 pending_exact = 0;
1683
1684 /* We're at the end of the group, so now we know how many
1685 groups were inside this one. */
1686 if (this_group_regnum <= MAX_REGNUM)
1687 {
1688 unsigned char *inner_group_loc
1689 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
1690
1691 *inner_group_loc = regnum - this_group_regnum;
1692 BUF_PUSH_3 (stop_memory, this_group_regnum,
1693 regnum - this_group_regnum);
1694 }
1695 }
1696 break;
1697
1698
1699 case '|': /* `\|'. */
1700 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
1701 goto normal_backslash;
1702 handle_alt:
1703 if (syntax & RE_LIMITED_OPS)
1704 goto normal_char;
1705
1706 /* Insert before the previous alternative a jump which
1707 jumps to this alternative if the former fails. */
1708 GET_BUFFER_SPACE (3);
1709 INSERT_JUMP (on_failure_jump, begalt, b + 6);
1710 pending_exact = 0;
1711 b += 3;
1712
1713 /* The alternative before this one has a jump after it
1714 which gets executed if it gets matched. Adjust that
1715 jump so it will jump to this alternative's analogous
1716 jump (put in below, which in turn will jump to the next
1717 (if any) alternative's such jump, etc.). The last such
1718 jump jumps to the correct final destination. A picture:
1719 _____ _____
1720 | | | |
1721 | v | v
1722 a | b | c
1723
1724 If we are at `b', then fixup_alt_jump right now points to a
1725 three-byte space after `a'. We'll put in the jump, set
1726 fixup_alt_jump to right after `b', and leave behind three
1727 bytes which we'll fill in when we get to after `c'. */
1728
1729 if (fixup_alt_jump)
1730 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
1731
1732 /* Mark and leave space for a jump after this alternative,
1733 to be filled in later either by next alternative or
1734 when know we're at the end of a series of alternatives. */
1735 fixup_alt_jump = b;
1736 GET_BUFFER_SPACE (3);
1737 b += 3;
1738
1739 laststart = 0;
1740 begalt = b;
1741 break;
1742
1743
1744 case '{':
1745 /* If \{ is a literal. */
1746 if (!(syntax & RE_INTERVALS)
1747 /* If we're at `\{' and it's not the open-interval
1748 operator. */
1749 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1750 || (p - 2 == pattern && p == pend))
1751 goto normal_backslash;
1752
1753 handle_interval:
1754 {
1755 /* If got here, then the syntax allows intervals. */
1756
1757 /* At least (most) this many matches must be made. */
1758 int lower_bound = -1, upper_bound = -1;
1759
1760 beg_interval = p - 1;
1761
1762 if (p == pend)
1763 {
1764 if (syntax & RE_NO_BK_BRACES)
1765 goto unfetch_interval;
1766 else
1767 return REG_EBRACE;
1768 }
1769
1770 GET_UNSIGNED_NUMBER (lower_bound);
1771
1772 if (c == ',')
1773 {
1774 GET_UNSIGNED_NUMBER (upper_bound);
1775 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1776 }
1777 else
1778 /* Interval such as `{1}' => match exactly once. */
1779 upper_bound = lower_bound;
1780
1781 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
1782 || lower_bound > upper_bound)
1783 {
1784 if (syntax & RE_NO_BK_BRACES)
1785 goto unfetch_interval;
1786 else
1787 return REG_BADBR;
1788 }
1789
1790 if (!(syntax & RE_NO_BK_BRACES))
1791 {
1792 if (c != '\\') return REG_EBRACE;
1793
1794 PATFETCH (c);
1795 }
1796
1797 if (c != '}')
1798 {
1799 if (syntax & RE_NO_BK_BRACES)
1800 goto unfetch_interval;
1801 else
1802 return REG_BADBR;
1803 }
1804
1805 /* We just parsed a valid interval. */
1806
1807 /* If it's invalid to have no preceding re. */
1808 if (!laststart)
1809 {
1810 if (syntax & RE_CONTEXT_INVALID_OPS)
1811 return REG_BADRPT;
1812 else if (syntax & RE_CONTEXT_INDEP_OPS)
1813 laststart = b;
1814 else
1815 goto unfetch_interval;
1816 }
1817
1818 /* If the upper bound is zero, don't want to succeed at
1819 all; jump from `laststart' to `b + 3', which will be
1820 the end of the buffer after we insert the jump. */
1821 if (upper_bound == 0)
1822 {
1823 GET_BUFFER_SPACE (3);
1824 INSERT_JUMP (jump, laststart, b + 3);
1825 b += 3;
1826 }
1827
1828 /* Otherwise, we have a nontrivial interval. When
1829 we're all done, the pattern will look like:
1830 set_number_at <jump count> <upper bound>
1831 set_number_at <succeed_n count> <lower bound>
1832 succeed_n <after jump addr> <succed_n count>
1833 <body of loop>
1834 jump_n <succeed_n addr> <jump count>
1835 (The upper bound and `jump_n' are omitted if
1836 `upper_bound' is 1, though.) */
1837 else
1838 { /* If the upper bound is > 1, we need to insert
1839 more at the end of the loop. */
1840 unsigned nbytes = 10 + (upper_bound > 1) * 10;
1841
1842 GET_BUFFER_SPACE (nbytes);
1843
1844 /* Initialize lower bound of the `succeed_n', even
1845 though it will be set during matching by its
1846 attendant `set_number_at' (inserted next),
1847 because `re_compile_fastmap' needs to know.
1848 Jump to the `jump_n' we might insert below. */
1849 INSERT_JUMP2 (succeed_n, laststart,
1850 b + 5 + (upper_bound > 1) * 5,
1851 lower_bound);
1852 b += 5;
1853
1854 /* Code to initialize the lower bound. Insert
1855 before the `succeed_n'. The `5' is the last two
1856 bytes of this `set_number_at', plus 3 bytes of
1857 the following `succeed_n'. */
1858 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
1859 b += 5;
1860
1861 if (upper_bound > 1)
1862 { /* More than one repetition is allowed, so
1863 append a backward jump to the `succeed_n'
1864 that starts this interval.
1865
1866 When we've reached this during matching,
1867 we'll have matched the interval once, so
1868 jump back only `upper_bound - 1' times. */
1869 STORE_JUMP2 (jump_n, b, laststart + 5,
1870 upper_bound - 1);
1871 b += 5;
1872
1873 /* The location we want to set is the second
1874 parameter of the `jump_n'; that is `b-2' as
1875 an absolute address. `laststart' will be
1876 the `set_number_at' we're about to insert;
1877 `laststart+3' the number to set, the source
1878 for the relative address. But we are
1879 inserting into the middle of the pattern --
1880 so everything is getting moved up by 5.
1881 Conclusion: (b - 2) - (laststart + 3) + 5,
1882 i.e., b - laststart.
1883
1884 We insert this at the beginning of the loop
1885 so that if we fail during matching, we'll
1886 reinitialize the bounds. */
1887 insert_op2 (set_number_at, laststart, b - laststart,
1888 upper_bound - 1, b);
1889 b += 5;
1890 }
1891 }
1892 pending_exact = 0;
1893 beg_interval = NULL;
1894 }
1895 break;
1896
1897 unfetch_interval:
1898 /* If an invalid interval, match the characters as literals. */
1899 assert (beg_interval);
1900 p = beg_interval;
1901 beg_interval = NULL;
1902
1903 /* normal_char and normal_backslash need `c'. */
1904 PATFETCH (c);
1905
1906 if (!(syntax & RE_NO_BK_BRACES))
1907 {
1908 if (p > pattern && p[-1] == '\\')
1909 goto normal_backslash;
1910 }
1911 goto normal_char;
1912
1913 #ifdef emacs
1914 /* There is no way to specify the before_dot and after_dot
1915 operators. rms says this is ok. --karl */
1916 case '=':
1917 BUF_PUSH (at_dot);
1918 break;
1919
1920 case 's':
1921 laststart = b;
1922 PATFETCH (c);
1923 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
1924 break;
1925
1926 case 'S':
1927 laststart = b;
1928 PATFETCH (c);
1929 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
1930 break;
1931 #endif /* emacs */
1932
1933
1934 case 'w':
1935 laststart = b;
1936 BUF_PUSH (wordchar);
1937 break;
1938
1939
1940 case 'W':
1941 laststart = b;
1942 BUF_PUSH (notwordchar);
1943 break;
1944
1945
1946 case '<':
1947 BUF_PUSH (wordbeg);
1948 break;
1949
1950 case '>':
1951 BUF_PUSH (wordend);
1952 break;
1953
1954 case 'b':
1955 BUF_PUSH (wordbound);
1956 break;
1957
1958 case 'B':
1959 BUF_PUSH (notwordbound);
1960 break;
1961
1962 case '`':
1963 BUF_PUSH (begbuf);
1964 break;
1965
1966 case '\'':
1967 BUF_PUSH (endbuf);
1968 break;
1969
1970 case '1': case '2': case '3': case '4': case '5':
1971 case '6': case '7': case '8': case '9':
1972 if (syntax & RE_NO_BK_REFS)
1973 goto normal_char;
1974
1975 c1 = c - '0';
1976
1977 if (c1 > regnum)
1978 return REG_ESUBREG;
1979
1980 /* Can't back reference to a subexpression if inside of it. */
1981 if (group_in_compile_stack (compile_stack, c1))
1982 goto normal_char;
1983
1984 laststart = b;
1985 BUF_PUSH_2 (duplicate, c1);
1986 break;
1987
1988
1989 case '+':
1990 case '?':
1991 if (syntax & RE_BK_PLUS_QM)
1992 goto handle_plus;
1993 else
1994 goto normal_backslash;
1995
1996 default:
1997 normal_backslash:
1998 /* You might think it would be useful for \ to mean
1999 not to translate; but if we don't translate it
2000 it will never match anything. */
2001 c = TRANSLATE (c);
2002 goto normal_char;
2003 }
2004 break;
2005
2006
2007 default:
2008 /* Expects the character in `c'. */
2009 normal_char:
2010 /* If no exactn currently being built. */
2011 if (!pending_exact
2012
2013 /* If last exactn not at current position. */
2014 || pending_exact + *pending_exact + 1 != b
2015
2016 /* We have only one byte following the exactn for the count. */
2017 || *pending_exact == (1 << BYTEWIDTH) - 1
2018
2019 /* If followed by a repetition operator. */
2020 || *p == '*' || *p == '^'
2021 || ((syntax & RE_BK_PLUS_QM)
2022 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
2023 : (*p == '+' || *p == '?'))
2024 || ((syntax & RE_INTERVALS)
2025 && ((syntax & RE_NO_BK_BRACES)
2026 ? *p == '{'
2027 : (p[0] == '\\' && p[1] == '{'))))
2028 {
2029 /* Start building a new exactn. */
2030
2031 laststart = b;
2032
2033 BUF_PUSH_2 (exactn, 0);
2034 pending_exact = b - 1;
2035 }
2036
2037 BUF_PUSH (c);
2038 (*pending_exact)++;
2039 break;
2040 } /* switch (c) */
2041 } /* while p != pend */
2042
2043
2044 /* Through the pattern now. */
2045
2046 if (fixup_alt_jump)
2047 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2048
2049 if (!COMPILE_STACK_EMPTY)
2050 return REG_EPAREN;
2051
2052 free (compile_stack.stack);
2053
2054 /* We have succeeded; set the length of the buffer. */
2055 bufp->used = b - bufp->buffer;
2056
2057 #ifdef DEBUG
2058 if (debug)
2059 {
2060 DEBUG_PRINT1 ("\nCompiled pattern: ");
2061 print_compiled_pattern (bufp);
2062 }
2063 #endif /* DEBUG */
2064
2065 return REG_NOERROR;
2066 } /* regex_compile */
2067
2068 /* Subroutines for `regex_compile'. */
2069
2070 /* Store OP at LOC followed by two-byte integer parameter ARG. */
2071
2072 static void
2073 store_op1 (op, loc, arg)
2074 re_opcode_t op;
2075 unsigned char *loc;
2076 int arg;
2077 {
2078 *loc = (unsigned char) op;
2079 STORE_NUMBER (loc + 1, arg);
2080 }
2081
2082
2083 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2084
2085 static void
2086 store_op2 (op, loc, arg1, arg2)
2087 re_opcode_t op;
2088 unsigned char *loc;
2089 int arg1, arg2;
2090 {
2091 *loc = (unsigned char) op;
2092 STORE_NUMBER (loc + 1, arg1);
2093 STORE_NUMBER (loc + 3, arg2);
2094 }
2095
2096
2097 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2098 for OP followed by two-byte integer parameter ARG. */
2099
2100 static void
2101 insert_op1 (op, loc, arg, end)
2102 re_opcode_t op;
2103 unsigned char *loc;
2104 int arg;
2105 unsigned char *end;
2106 {
2107 register unsigned char *pfrom = end;
2108 register unsigned char *pto = end + 3;
2109
2110 while (pfrom != loc)
2111 *--pto = *--pfrom;
2112
2113 store_op1 (op, loc, arg);
2114 }
2115
2116
2117 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2118
2119 static void
2120 insert_op2 (op, loc, arg1, arg2, end)
2121 re_opcode_t op;
2122 unsigned char *loc;
2123 int arg1, arg2;
2124 unsigned char *end;
2125 {
2126 register unsigned char *pfrom = end;
2127 register unsigned char *pto = end + 5;
2128
2129 while (pfrom != loc)
2130 *--pto = *--pfrom;
2131
2132 store_op2 (op, loc, arg1, arg2);
2133 }
2134
2135
2136 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
2137 after an alternative or a begin-subexpression. We assume there is at
2138 least one character before the ^. */
2139
2140 static boolean
2141 at_begline_loc_p (pattern, p, syntax)
2142 const char *pattern, *p;
2143 reg_syntax_t syntax;
2144 {
2145 const char *prev = p - 2;
2146 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
2147
2148 return
2149 /* After a subexpression? */
2150 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
2151 /* After an alternative? */
2152 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
2153 }
2154
2155
2156 /* The dual of at_begline_loc_p. This one is for $. We assume there is
2157 at least one character after the $, i.e., `P < PEND'. */
2158
2159 static boolean
2160 at_endline_loc_p (p, pend, syntax)
2161 const char *p, *pend;
2162 int syntax;
2163 {
2164 const char *next = p;
2165 boolean next_backslash = *next == '\\';
2166 const char *next_next = p + 1 < pend ? p + 1 : NULL;
2167
2168 return
2169 /* Before a subexpression? */
2170 (syntax & RE_NO_BK_PARENS ? *next == ')'
2171 : next_backslash && next_next && *next_next == ')')
2172 /* Before an alternative? */
2173 || (syntax & RE_NO_BK_VBAR ? *next == '|'
2174 : next_backslash && next_next && *next_next == '|');
2175 }
2176
2177
2178 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2179 false if it's not. */
2180
2181 static boolean
2182 group_in_compile_stack (compile_stack, regnum)
2183 compile_stack_type compile_stack;
2184 regnum_t regnum;
2185 {
2186 int this_element;
2187
2188 for (this_element = compile_stack.avail - 1;
2189 this_element >= 0;
2190 this_element--)
2191 if (compile_stack.stack[this_element].regnum == regnum)
2192 return true;
2193
2194 return false;
2195 }
2196
2197
2198 /* Read the ending character of a range (in a bracket expression) from the
2199 uncompiled pattern *P_PTR (which ends at PEND). We assume the
2200 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
2201 Then we set the translation of all bits between the starting and
2202 ending characters (inclusive) in the compiled pattern B.
2203
2204 Return an error code.
2205
2206 We use these short variable names so we can use the same macros as
2207 `regex_compile' itself. */
2208
2209 static reg_errcode_t
2210 compile_range (p_ptr, pend, translate, syntax, b)
2211 const char **p_ptr, *pend;
2212 char *translate;
2213 reg_syntax_t syntax;
2214 unsigned char *b;
2215 {
2216 unsigned this_char;
2217
2218 const char *p = *p_ptr;
2219 int range_start, range_end;
2220
2221 if (p == pend)
2222 return REG_ERANGE;
2223
2224 /* Even though the pattern is a signed `char *', we need to fetch
2225 with unsigned char *'s; if the high bit of the pattern character
2226 is set, the range endpoints will be negative if we fetch using a
2227 signed char *.
2228
2229 We also want to fetch the endpoints without translating them; the
2230 appropriate translation is done in the bit-setting loop below. */
2231 range_start = ((unsigned char *) p)[-2];
2232 range_end = ((unsigned char *) p)[0];
2233
2234 /* Have to increment the pointer into the pattern string, so the
2235 caller isn't still at the ending character. */
2236 (*p_ptr)++;
2237
2238 /* If the start is after the end, the range is empty. */
2239 if (range_start > range_end)
2240 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
2241
2242 /* Here we see why `this_char' has to be larger than an `unsigned
2243 char' -- the range is inclusive, so if `range_end' == 0xff
2244 (assuming 8-bit characters), we would otherwise go into an infinite
2245 loop, since all characters <= 0xff. */
2246 for (this_char = range_start; this_char <= range_end; this_char++)
2247 {
2248 SET_LIST_BIT (TRANSLATE (this_char));
2249 }
2250
2251 return REG_NOERROR;
2252 }
2253
2254 /* Failure stack declarations and macros; both re_compile_fastmap and
2255 re_match_2 use a failure stack. These have to be macros because of
2256 REGEX_ALLOCATE. */
2257
2258
2259 /* Number of failure points for which to initially allocate space
2260 when matching. If this number is exceeded, we allocate more
2261 space, so it is not a hard limit. */
2262 #ifndef INIT_FAILURE_ALLOC
2263 #define INIT_FAILURE_ALLOC 5
2264 #endif
2265
2266 /* Roughly the maximum number of failure points on the stack. Would be
2267 exactly that if always used MAX_FAILURE_SPACE each time we failed.
2268 This is a variable only so users of regex can assign to it; we never
2269 change it ourselves. */
2270
2271 /* change for rh glibc re_max_failures symbol collision - jpr5 */
2272 #define RE_MAX_FAILURES 2000
2273
2274 typedef const unsigned char *fail_stack_elt_t;
2275
2276 typedef struct
2277 {
2278 fail_stack_elt_t *stack;
2279 unsigned long size;
2280 unsigned long avail; /* Offset of next open position. */
2281 } fail_stack_type;
2282
2283 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
2284 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
2285 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
2286 #define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail])
2287
2288
2289 /* Initialize `fail_stack'. Do `return -2' if the alloc fails. */
2290
2291 #define INIT_FAIL_STACK() \
2292 do { \
2293 fail_stack.stack = (fail_stack_elt_t *) \
2294 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
2295 \
2296 if (fail_stack.stack == NULL) \
2297 return -2; \
2298 \
2299 fail_stack.size = INIT_FAILURE_ALLOC; \
2300 fail_stack.avail = 0; \
2301 } while (0)
2302
2303
2304 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
2305
2306 Return 1 if succeeds, and 0 if either ran out of memory
2307 allocating space for it or it was already too large.
2308
2309 REGEX_REALLOCATE requires `destination' be declared. */
2310
2311 /* re_max_failures -> #define RE_MAX_FAILURES */
2312 #define DOUBLE_FAIL_STACK(fail_stack) \
2313 ((fail_stack).size > RE_MAX_FAILURES * MAX_FAILURE_ITEMS \
2314 ? 0 \
2315 : ((fail_stack).stack = (fail_stack_elt_t *) \
2316 REGEX_REALLOCATE ((fail_stack).stack, \
2317 (fail_stack).size * sizeof (fail_stack_elt_t), \
2318 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
2319 \
2320 (fail_stack).stack == NULL \
2321 ? 0 \
2322 : ((fail_stack).size <<= 1, \
2323 1)))
2324
2325
2326 /* Push PATTERN_OP on FAIL_STACK.
2327
2328 Return 1 if was able to do so and 0 if ran out of memory allocating
2329 space to do so. */
2330 #define PUSH_PATTERN_OP(pattern_op, fail_stack) \
2331 ((FAIL_STACK_FULL () \
2332 && !DOUBLE_FAIL_STACK (fail_stack)) \
2333 ? 0 \
2334 : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \
2335 1))
2336
2337 /* This pushes an item onto the failure stack. Must be a four-byte
2338 value. Assumes the variable `fail_stack'. Probably should only
2339 be called from within `PUSH_FAILURE_POINT'. */
2340 #define PUSH_FAILURE_ITEM(item) \
2341 fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item
2342
2343 /* The complement operation. Assumes `fail_stack' is nonempty. */
2344 #define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail]
2345
2346 /* Used to omit pushing failure point id's when we're not debugging. */
2347 #ifdef DEBUG
2348 #define DEBUG_PUSH PUSH_FAILURE_ITEM
2349 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM ()
2350 #else
2351 #define DEBUG_PUSH(item)
2352 #define DEBUG_POP(item_addr)
2353 #endif
2354
2355
2356 /* Push the information about the state we will need
2357 if we ever fail back to it.
2358
2359 Requires variables fail_stack, regstart, regend, reg_info, and
2360 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
2361 declared.
2362
2363 Does `return FAILURE_CODE' if runs out of memory. */
2364
2365 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
2366 do { \
2367 char *destination; \
2368 /* Must be int, so when we don't save any registers, the arithmetic \
2369 of 0 + -1 isn't done as unsigned. */ \
2370 int this_reg; \
2371 \
2372 DEBUG_STATEMENT (failure_id++); \
2373 DEBUG_STATEMENT (nfailure_points_pushed++); \
2374 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
2375 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
2376 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
2377 \
2378 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
2379 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
2380 \
2381 /* Ensure we have enough space allocated for what we will push. */ \
2382 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
2383 { \
2384 if (!DOUBLE_FAIL_STACK (fail_stack)) \
2385 return failure_code; \
2386 \
2387 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
2388 (fail_stack).size); \
2389 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
2390 } \
2391 \
2392 /* Push the info, starting with the registers. */ \
2393 DEBUG_PRINT1 ("\n"); \
2394 \
2395 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
2396 this_reg++) \
2397 { \
2398 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
2399 DEBUG_STATEMENT (num_regs_pushed++); \
2400 \
2401 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2402 PUSH_FAILURE_ITEM (regstart[this_reg]); \
2403 \
2404 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2405 PUSH_FAILURE_ITEM (regend[this_reg]); \
2406 \
2407 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
2408 DEBUG_PRINT2 (" match_null=%d", \
2409 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
2410 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
2411 DEBUG_PRINT2 (" matched_something=%d", \
2412 MATCHED_SOMETHING (reg_info[this_reg])); \
2413 DEBUG_PRINT2 (" ever_matched=%d", \
2414 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
2415 DEBUG_PRINT1 ("\n"); \
2416 PUSH_FAILURE_ITEM (reg_info[this_reg].word); \
2417 } \
2418 \
2419 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
2420 PUSH_FAILURE_ITEM (lowest_active_reg); \
2421 \
2422 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
2423 PUSH_FAILURE_ITEM (highest_active_reg); \
2424 \
2425 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
2426 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
2427 PUSH_FAILURE_ITEM (pattern_place); \
2428 \
2429 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
2430 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
2431 size2); \
2432 DEBUG_PRINT1 ("'\n"); \
2433 PUSH_FAILURE_ITEM (string_place); \
2434 \
2435 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
2436 DEBUG_PUSH (failure_id); \
2437 } while (0)
2438
2439 /* This is the number of items that are pushed and popped on the stack
2440 for each register. */
2441 #define NUM_REG_ITEMS 3
2442
2443 /* Individual items aside from the registers. */
2444 #ifdef DEBUG
2445 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
2446 #else
2447 #define NUM_NONREG_ITEMS 4
2448 #endif
2449
2450 /* We push at most this many items on the stack. */
2451 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
2452
2453 /* We actually push this many items. */
2454 #define NUM_FAILURE_ITEMS \
2455 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
2456 + NUM_NONREG_ITEMS)
2457
2458 /* How many items can still be added to the stack without overflowing it. */
2459 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
2460
2461
2462 /* Pops what PUSH_FAIL_STACK pushes.
2463
2464 We restore into the parameters, all of which should be lvalues:
2465 STR -- the saved data position.
2466 PAT -- the saved pattern position.
2467 LOW_REG, HIGH_REG -- the highest and lowest active registers.
2468 REGSTART, REGEND -- arrays of string positions.
2469 REG_INFO -- array of information about each subexpression.
2470
2471 Also assumes the variables `fail_stack' and (if debugging), `bufp',
2472 `pend', `string1', `size1', `string2', and `size2'. */
2473
2474 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
2475 { \
2476 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
2477 int this_reg; \
2478 const unsigned char *string_temp; \
2479 \
2480 assert (!FAIL_STACK_EMPTY ()); \
2481 \
2482 /* Remove failure points and point to how many regs pushed. */ \
2483 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
2484 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
2485 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
2486 \
2487 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
2488 \
2489 DEBUG_POP (&failure_id); \
2490 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
2491 \
2492 /* If the saved string location is NULL, it came from an \
2493 on_failure_keep_string_jump opcode, and we want to throw away the \
2494 saved NULL, thus retaining our current position in the string. */ \
2495 string_temp = POP_FAILURE_ITEM (); \
2496 if (string_temp != NULL) \
2497 str = (const char *) string_temp; \
2498 \
2499 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
2500 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
2501 DEBUG_PRINT1 ("'\n"); \
2502 \
2503 pat = (unsigned char *) POP_FAILURE_ITEM (); \
2504 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
2505 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
2506 \
2507 /* Restore register info. */ \
2508 high_reg = (unsigned long) POP_FAILURE_ITEM (); \
2509 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
2510 \
2511 low_reg = (unsigned long) POP_FAILURE_ITEM (); \
2512 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
2513 \
2514 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
2515 { \
2516 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
2517 \
2518 reg_info[this_reg].word = POP_FAILURE_ITEM (); \
2519 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
2520 \
2521 regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2522 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2523 \
2524 regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2525 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2526 } \
2527 \
2528 DEBUG_STATEMENT (nfailure_points_popped++); \
2529 } /* POP_FAILURE_POINT */
2530
2531 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
2532 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
2533 characters can start a string that matches the pattern. This fastmap
2534 is used by re_search to skip quickly over impossible starting points.
2535
2536 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2537 area as BUFP->fastmap.
2538
2539 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
2540 the pattern buffer.
2541
2542 Returns 0 if we succeed, -2 if an internal error. */
2543
2544 int
2545 re_compile_fastmap (bufp)
2546 struct re_pattern_buffer *bufp;
2547 {
2548 int j, k;
2549 fail_stack_type fail_stack;
2550 #ifndef REGEX_MALLOC
2551 char *destination;
2552 #endif
2553 /* We don't push any register information onto the failure stack. */
2554 unsigned num_regs = 0;
2555
2556 register char *fastmap = bufp->fastmap;
2557 unsigned char *pattern = bufp->buffer;
2558 unsigned long size = bufp->used;
2559 const unsigned char *p = pattern;
2560 register unsigned char *pend = pattern + size;
2561
2562 /* Assume that each path through the pattern can be null until
2563 proven otherwise. We set this false at the bottom of switch
2564 statement, to which we get only if a particular path doesn't
2565 match the empty string. */
2566 boolean path_can_be_null = true;
2567
2568 /* We aren't doing a `succeed_n' to begin with. */
2569 boolean succeed_n_p = false;
2570
2571 assert (fastmap != NULL && p != NULL);
2572
2573 INIT_FAIL_STACK ();
2574 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
2575 bufp->fastmap_accurate = 1; /* It will be when we're done. */
2576 bufp->can_be_null = 0;
2577
2578 while (p != pend || !FAIL_STACK_EMPTY ())
2579 {
2580 if (p == pend)
2581 {
2582 bufp->can_be_null |= path_can_be_null;
2583
2584 /* Reset for next path. */
2585 path_can_be_null = true;
2586
2587 p = fail_stack.stack[--fail_stack.avail];
2588 }
2589
2590 /* We should never be about to go beyond the end of the pattern. */
2591 assert (p < pend);
2592
2593 #ifdef SWITCH_ENUM_BUG
2594 switch ((int) ((re_opcode_t) *p++))
2595 #else
2596 switch ((re_opcode_t) *p++)
2597 #endif
2598 {
2599
2600 /* I guess the idea here is to simply not bother with a fastmap
2601 if a backreference is used, since it's too hard to figure out
2602 the fastmap for the corresponding group. Setting
2603 `can_be_null' stops `re_search_2' from using the fastmap, so
2604 that is all we do. */
2605 case duplicate:
2606 bufp->can_be_null = 1;
2607 return 0;
2608
2609
2610 /* Following are the cases which match a character. These end
2611 with `break'. */
2612
2613 case exactn:
2614 fastmap[p[1]] = 1;
2615 break;
2616
2617
2618 case charset:
2619 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2620 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
2621 fastmap[j] = 1;
2622 break;
2623
2624
2625 case charset_not:
2626 /* Chars beyond end of map must be allowed. */
2627 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
2628 fastmap[j] = 1;
2629
2630 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2631 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
2632 fastmap[j] = 1;
2633 break;
2634
2635
2636 case wordchar:
2637 for (j = 0; j < (1 << BYTEWIDTH); j++)
2638 if (SYNTAX (j) == Sword)
2639 fastmap[j] = 1;
2640 break;
2641
2642
2643 case notwordchar:
2644 for (j = 0; j < (1 << BYTEWIDTH); j++)
2645 if (SYNTAX (j) != Sword)
2646 fastmap[j] = 1;
2647 break;
2648
2649
2650 case anychar:
2651 /* `.' matches anything ... */
2652 for (j = 0; j < (1 << BYTEWIDTH); j++)
2653 fastmap[j] = 1;
2654
2655 /* ... except perhaps newline. */
2656 if (!(bufp->syntax & RE_DOT_NEWLINE))
2657 fastmap['\n'] = 0;
2658
2659 /* Return if we have already set `can_be_null'; if we have,
2660 then the fastmap is irrelevant. Something's wrong here. */
2661 else if (bufp->can_be_null)
2662 return 0;
2663
2664 /* Otherwise, have to check alternative paths. */
2665 break;
2666
2667
2668 #ifdef emacs
2669 case syntaxspec:
2670 k = *p++;
2671 for (j = 0; j < (1 << BYTEWIDTH); j++)
2672 if (SYNTAX (j) == (enum syntaxcode) k)
2673 fastmap[j] = 1;
2674 break;
2675
2676
2677 case notsyntaxspec:
2678 k = *p++;
2679 for (j = 0; j < (1 << BYTEWIDTH); j++)
2680 if (SYNTAX (j) != (enum syntaxcode) k)
2681 fastmap[j] = 1;
2682 break;
2683
2684
2685 /* All cases after this match the empty string. These end with
2686 `continue'. */
2687
2688
2689 case before_dot:
2690 case at_dot:
2691 case after_dot:
2692 continue;
2693 #endif /* not emacs */
2694
2695
2696 case no_op:
2697 case begline:
2698 case endline:
2699 case begbuf:
2700 case endbuf:
2701 case wordbound:
2702 case notwordbound:
2703 case wordbeg:
2704 case wordend:
2705 case push_dummy_failure:
2706 continue;
2707
2708
2709 case jump_n:
2710 case pop_failure_jump:
2711 case maybe_pop_jump:
2712 case jump:
2713 case jump_past_alt:
2714 case dummy_failure_jump:
2715 EXTRACT_NUMBER_AND_INCR (j, p);
2716 p += j;
2717 if (j > 0)
2718 continue;
2719
2720 /* Jump backward implies we just went through the body of a
2721 loop and matched nothing. Opcode jumped to should be
2722 `on_failure_jump' or `succeed_n'. Just treat it like an
2723 ordinary jump. For a * loop, it has pushed its failure
2724 point already; if so, discard that as redundant. */
2725 if ((re_opcode_t) *p != on_failure_jump
2726 && (re_opcode_t) *p != succeed_n)
2727 continue;
2728
2729 p++;
2730 EXTRACT_NUMBER_AND_INCR (j, p);
2731 p += j;
2732
2733 /* If what's on the stack is where we are now, pop it. */
2734 if (!FAIL_STACK_EMPTY ()
2735 && fail_stack.stack[fail_stack.avail - 1] == p)
2736 fail_stack.avail--;
2737
2738 continue;
2739
2740
2741 case on_failure_jump:
2742 case on_failure_keep_string_jump:
2743 handle_on_failure_jump:
2744 EXTRACT_NUMBER_AND_INCR (j, p);
2745
2746 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
2747 end of the pattern. We don't want to push such a point,
2748 since when we restore it above, entering the switch will
2749 increment `p' past the end of the pattern. We don't need
2750 to push such a point since we obviously won't find any more
2751 fastmap entries beyond `pend'. Such a pattern can match
2752 the null string, though. */
2753 if (p + j < pend)
2754 {
2755 if (!PUSH_PATTERN_OP (p + j, fail_stack))
2756 return -2;
2757 }
2758 else
2759 bufp->can_be_null = 1;
2760
2761 if (succeed_n_p)
2762 {
2763 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
2764 succeed_n_p = false;
2765 }
2766
2767 continue;
2768
2769
2770 case succeed_n:
2771 /* Get to the number of times to succeed. */
2772 p += 2;
2773
2774 /* Increment p past the n for when k != 0. */
2775 EXTRACT_NUMBER_AND_INCR (k, p);
2776 if (k == 0)
2777 {
2778 p -= 4;
2779 succeed_n_p = true; /* Spaghetti code alert. */
2780 goto handle_on_failure_jump;
2781 }
2782 continue;
2783
2784
2785 case set_number_at:
2786 p += 4;
2787 continue;
2788
2789
2790 case start_memory:
2791 case stop_memory:
2792 p += 2;
2793 continue;
2794
2795
2796 default:
2797 abort (); /* We have listed all the cases. */
2798 } /* switch *p++ */
2799
2800 /* Getting here means we have found the possible starting
2801 characters for one path of the pattern -- and that the empty
2802 string does not match. We need not follow this path further.
2803 Instead, look at the next alternative (remembered on the
2804 stack), or quit if no more. The test at the top of the loop
2805 does these things. */
2806 path_can_be_null = false;
2807 p = pend;
2808 } /* while p */
2809
2810 /* Set `can_be_null' for the last path (also the first path, if the
2811 pattern is empty). */
2812 bufp->can_be_null |= path_can_be_null;
2813 return 0;
2814 } /* re_compile_fastmap */
2815
2816 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
2817 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
2818 this memory for recording register information. STARTS and ENDS
2819 must be allocated using the malloc library routine, and must each
2820 be at least NUM_REGS * sizeof (regoff_t) bytes long.
2821
2822 If NUM_REGS == 0, then subsequent matches should allocate their own
2823 register data.
2824
2825 Unless this function is called, the first search or match using
2826 PATTERN_BUFFER will allocate its own register data, without
2827 freeing the old data. */
2828
2829 void
2830 re_set_registers (bufp, regs, num_regs, starts, ends)
2831 struct re_pattern_buffer *bufp;
2832 struct re_registers *regs;
2833 unsigned num_regs;
2834 regoff_t *starts, *ends;
2835 {
2836 if (num_regs)
2837 {
2838 bufp->regs_allocated = REGS_REALLOCATE;
2839 regs->num_regs = num_regs;
2840 regs->start = starts;
2841 regs->end = ends;
2842 }
2843 else
2844 {
2845 bufp->regs_allocated = REGS_UNALLOCATED;
2846 regs->num_regs = 0;
2847 regs->start = regs->end = (regoff_t) 0;
2848 }
2849 }
2850
2851 /* Searching routines. */
2852
2853 /* Like re_search_2, below, but only one string is specified, and
2854 doesn't let you say where to stop matching. */
2855
2856 int
2857 re_search (bufp, string, size, startpos, range, regs)
2858 struct re_pattern_buffer *bufp;
2859 const char *string;
2860 int size, startpos, range;
2861 struct re_registers *regs;
2862 {
2863 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
2864 regs, size);
2865 }
2866
2867
2868 /* Using the compiled pattern in BUFP->buffer, first tries to match the
2869 virtual concatenation of STRING1 and STRING2, starting first at index
2870 STARTPOS, then at STARTPOS + 1, and so on.
2871
2872 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
2873
2874 RANGE is how far to scan while trying to match. RANGE = 0 means try
2875 only at STARTPOS; in general, the last start tried is STARTPOS +
2876 RANGE.
2877
2878 In REGS, return the indices of the virtual concatenation of STRING1
2879 and STRING2 that matched the entire BUFP->buffer and its contained
2880 subexpressions.
2881
2882 Do not consider matching one past the index STOP in the virtual
2883 concatenation of STRING1 and STRING2.
2884
2885 We return either the position in the strings at which the match was
2886 found, -1 if no match, or -2 if error (such as failure
2887 stack overflow). */
2888
2889 int
2890 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
2891 struct re_pattern_buffer *bufp;
2892 const char *string1, *string2;
2893 int size1, size2;
2894 int startpos;
2895 int range;
2896 struct re_registers *regs;
2897 int stop;
2898 {
2899 int val;
2900 register char *fastmap = bufp->fastmap;
2901 register char *translate = bufp->translate;
2902 int total_size = size1 + size2;
2903 int endpos = startpos + range;
2904
2905 /* Check for out-of-range STARTPOS. */
2906 if (startpos < 0 || startpos > total_size)
2907 return -1;
2908
2909 /* Fix up RANGE if it might eventually take us outside
2910 the virtual concatenation of STRING1 and STRING2. */
2911 if (endpos < -1)
2912 range = -1 - startpos;
2913 else if (endpos > total_size)
2914 range = total_size - startpos;
2915
2916 /* If the search isn't to be a backwards one, don't waste time in a
2917 search for a pattern that must be anchored. */
2918 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
2919 {
2920 if (startpos > 0)
2921 return -1;
2922 else
2923 range = 1;
2924 }
2925
2926 /* Update the fastmap now if not correct already. */
2927 if (fastmap && !bufp->fastmap_accurate)
2928 if (re_compile_fastmap (bufp) == -2)
2929 return -2;
2930
2931 /* Loop through the string, looking for a place to start matching. */
2932 for (;;)
2933 {
2934 /* If a fastmap is supplied, skip quickly over characters that
2935 cannot be the start of a match. If the pattern can match the
2936 null string, however, we don't need to skip characters; we want
2937 the first null string. */
2938 if (fastmap && startpos < total_size && !bufp->can_be_null)
2939 {
2940 if (range > 0) /* Searching forwards. */
2941 {
2942 register const char *d;
2943 register int lim = 0;
2944 int irange = range;
2945
2946 if (startpos < size1 && startpos + range >= size1)
2947 lim = range - (size1 - startpos);
2948
2949 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
2950
2951 /* Written out as an if-else to avoid testing `translate'
2952 inside the loop. */
2953 if (translate)
2954 while (range > lim
2955 && !fastmap[(unsigned char)
2956 translate[(unsigned char) *d++]])
2957 range--;
2958 else
2959 while (range > lim && !fastmap[(unsigned char) *d++])
2960 range--;
2961
2962 startpos += irange - range;
2963 }
2964 else /* Searching backwards. */
2965 {
2966 register char c = (size1 == 0 || startpos >= size1
2967 ? string2[startpos - size1]
2968 : string1[startpos]);
2969
2970 if (!fastmap[(unsigned char) TRANSLATE (c)])
2971 goto advance;
2972 }
2973 }
2974
2975 /* If can't match the null string, and that's all we have left, fail. */
2976 if (range >= 0 && startpos == total_size && fastmap
2977 && !bufp->can_be_null)
2978 return -1;
2979
2980 val = re_match_2 (bufp, string1, size1, string2, size2,
2981 startpos, regs, stop);
2982 if (val >= 0)
2983 return startpos;
2984
2985 if (val == -2)
2986 return -2;
2987
2988 advance:
2989 if (!range)
2990 break;
2991 else if (range > 0)
2992 {
2993 range--;
2994 startpos++;
2995 }
2996 else
2997 {
2998 range++;
2999 startpos--;
3000 }
3001 }
3002 return -1;
3003 } /* re_search_2 */
3004
3005 /* Declarations and macros for re_match_2. */
3006
3007 static int bcmp_translate ();
3008 static boolean alt_match_null_string_p (),
3009 common_op_match_null_string_p (),
3010 group_match_null_string_p ();
3011
3012 /* Structure for per-register (a.k.a. per-group) information.
3013 This must not be longer than one word, because we push this value
3014 onto the failure stack. Other register information, such as the
3015 starting and ending positions (which are addresses), and the list of
3016 inner groups (which is a bits list) are maintained in separate
3017 variables.
3018
3019 We are making a (strictly speaking) nonportable assumption here: that
3020 the compiler will pack our bit fields into something that fits into
3021 the type of `word', i.e., is something that fits into one item on the
3022 failure stack. */
3023 typedef union
3024 {
3025 fail_stack_elt_t word;
3026 struct
3027 {
3028 /* This field is one if this group can match the empty string,
3029 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
3030 #define MATCH_NULL_UNSET_VALUE 3
3031 unsigned match_null_string_p : 2;
3032 unsigned is_active : 1;
3033 unsigned matched_something : 1;
3034 unsigned ever_matched_something : 1;
3035 } bits;
3036 } register_info_type;
3037
3038 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
3039 #define IS_ACTIVE(R) ((R).bits.is_active)
3040 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
3041 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
3042
3043
3044 /* Call this when have matched a real character; it sets `matched' flags
3045 for the subexpressions which we are currently inside. Also records
3046 that those subexprs have matched. */
3047 #define SET_REGS_MATCHED() \
3048 do \
3049 { \
3050 unsigned r; \
3051 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
3052 { \
3053 MATCHED_SOMETHING (reg_info[r]) \
3054 = EVER_MATCHED_SOMETHING (reg_info[r]) \
3055 = 1; \
3056 } \
3057 } \
3058 while (0)
3059
3060
3061 /* This converts PTR, a pointer into one of the search strings `string1'
3062 and `string2' into an offset from the beginning of that string. */
3063 #define POINTER_TO_OFFSET(ptr) \
3064 (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1)
3065
3066 /* Registers are set to a sentinel when they haven't yet matched. */
3067 #define REG_UNSET_VALUE ((char *) -1)
3068 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
3069
3070
3071 /* Macros for dealing with the split strings in re_match_2. */
3072
3073 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3074
3075 /* Call before fetching a character with *d. This switches over to
3076 string2 if necessary. */
3077 #define PREFETCH() \
3078 while (d == dend) \
3079 { \
3080 /* End of string2 => fail. */ \
3081 if (dend == end_match_2) \
3082 goto fail; \
3083 /* End of string1 => advance to string2. */ \
3084 d = string2; \
3085 dend = end_match_2; \
3086 }
3087
3088
3089 /* Test if at very beginning or at very end of the virtual concatenation
3090 of `string1' and `string2'. If only one string, it's `string2'. */
3091 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3092 #define AT_STRINGS_END(d) ((d) == end2)
3093
3094
3095 /* Test if D points to a character which is word-constituent. We have
3096 two special cases to check for: if past the end of string1, look at
3097 the first character in string2; and if before the beginning of
3098 string2, look at the last character in string1. */
3099 #define WORDCHAR_P(d) \
3100 (SYNTAX ((d) == end1 ? *string2 \
3101 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3102 == Sword)
3103
3104 /* Test if the character before D and the one at D differ with respect
3105 to being word-constituent. */
3106 #define AT_WORD_BOUNDARY(d) \
3107 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3108 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3109
3110
3111 /* Free everything we malloc. */
3112 #ifdef REGEX_MALLOC
3113 #define FREE_VAR(var) if (var) free (var); var = NULL
3114 #define FREE_VARIABLES() \
3115 do { \
3116 FREE_VAR (fail_stack.stack); \
3117 FREE_VAR (regstart); \
3118 FREE_VAR (regend); \
3119 FREE_VAR (old_regstart); \
3120 FREE_VAR (old_regend); \
3121 FREE_VAR (best_regstart); \
3122 FREE_VAR (best_regend); \
3123 FREE_VAR (reg_info); \
3124 FREE_VAR (reg_dummy); \
3125 FREE_VAR (reg_info_dummy); \
3126 } while (0)
3127 #else /* not REGEX_MALLOC */
3128 /* Some MIPS systems (at least) want this to free alloca'd storage. */
3129 #define FREE_VARIABLES() alloca (0)
3130 #endif /* not REGEX_MALLOC */
3131
3132
3133 /* These values must meet several constraints. They must not be valid
3134 register values; since we have a limit of 255 registers (because
3135 we use only one byte in the pattern for the register number), we can
3136 use numbers larger than 255. They must differ by 1, because of
3137 NUM_FAILURE_ITEMS above. And the value for the lowest register must
3138 be larger than the value for the highest register, so we do not try
3139 to actually save any registers when none are active. */
3140 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3141 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3142
3143 /* Matching routines. */
3144
3145 #ifndef emacs /* Emacs never uses this. */
3146 /* re_match is like re_match_2 except it takes only a single string. */
3147
3148 int
3149 re_match (bufp, string, size, pos, regs)
3150 struct re_pattern_buffer *bufp;
3151 const char *string;
3152 int size, pos;
3153 struct re_registers *regs;
3154 {
3155 return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size);
3156 }
3157 #endif /* not emacs */
3158
3159
3160 /* re_match_2 matches the compiled pattern in BUFP against the
3161 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3162 and SIZE2, respectively). We start matching at POS, and stop
3163 matching at STOP.
3164
3165 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
3166 store offsets for the substring each group matched in REGS. See the
3167 documentation for exactly how many groups we fill.
3168
3169 We return -1 if no match, -2 if an internal error (such as the
3170 failure stack overflowing). Otherwise, we return the length of the
3171 matched substring. */
3172
3173 int
3174 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
3175 struct re_pattern_buffer *bufp;
3176 const char *string1, *string2;
3177 int size1, size2;
3178 int pos;
3179 struct re_registers *regs;
3180 int stop;
3181 {
3182 /* General temporaries. */
3183 int mcnt;
3184 unsigned char *p1;
3185
3186 /* Just past the end of the corresponding string. */
3187 const char *end1, *end2;
3188
3189 /* Pointers into string1 and string2, just past the last characters in
3190 each to consider matching. */
3191 const char *end_match_1, *end_match_2;
3192
3193 /* Where we are in the data, and the end of the current string. */
3194 const char *d, *dend;
3195
3196 /* Where we are in the pattern, and the end of the pattern. */
3197 unsigned char *p = bufp->buffer;
3198 register unsigned char *pend = p + bufp->used;
3199
3200 /* We use this to map every character in the string. */
3201 char *translate = bufp->translate;
3202
3203 /* Failure point stack. Each place that can handle a failure further
3204 down the line pushes a failure point on this stack. It consists of
3205 restart, regend, and reg_info for all registers corresponding to
3206 the subexpressions we're currently inside, plus the number of such
3207 registers, and, finally, two char *'s. The first char * is where
3208 to resume scanning the pattern; the second one is where to resume
3209 scanning the strings. If the latter is zero, the failure point is
3210 a ``dummy''; if a failure happens and the failure point is a dummy,
3211 it gets discarded and the next next one is tried. */
3212 fail_stack_type fail_stack;
3213 #ifdef DEBUG
3214 static unsigned failure_id = 0;
3215 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3216 #endif
3217
3218 /* We fill all the registers internally, independent of what we
3219 return, for use in backreferences. The number here includes
3220 an element for register zero. */
3221 unsigned num_regs = bufp->re_nsub + 1;
3222
3223 /* The currently active registers. */
3224 unsigned long lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3225 unsigned long highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3226
3227 /* Information on the contents of registers. These are pointers into
3228 the input strings; they record just what was matched (on this
3229 attempt) by a subexpression part of the pattern, that is, the
3230 regnum-th regstart pointer points to where in the pattern we began
3231 matching and the regnum-th regend points to right after where we
3232 stopped matching the regnum-th subexpression. (The zeroth register
3233 keeps track of what the whole pattern matches.) */
3234 const char **regstart, **regend;
3235
3236 /* If a group that's operated upon by a repetition operator fails to
3237 match anything, then the register for its start will need to be
3238 restored because it will have been set to wherever in the string we
3239 are when we last see its open-group operator. Similarly for a
3240 register's end. */
3241 const char **old_regstart, **old_regend;
3242
3243 /* The is_active field of reg_info helps us keep track of which (possibly
3244 nested) subexpressions we are currently in. The matched_something
3245 field of reg_info[reg_num] helps us tell whether or not we have
3246 matched any of the pattern so far this time through the reg_num-th
3247 subexpression. These two fields get reset each time through any
3248 loop their register is in. */
3249 register_info_type *reg_info;
3250
3251 /* The following record the register info as found in the above
3252 variables when we find a match better than any we've seen before.
3253 This happens as we backtrack through the failure points, which in
3254 turn happens only if we have not yet matched the entire string. */
3255 unsigned best_regs_set = false;
3256 const char **best_regstart, **best_regend;
3257
3258 /* Logically, this is `best_regend[0]'. But we don't want to have to
3259 allocate space for that if we're not allocating space for anything
3260 else (see below). Also, we never need info about register 0 for
3261 any of the other register vectors, and it seems rather a kludge to
3262 treat `best_regend' differently than the rest. So we keep track of
3263 the end of the best match so far in a separate variable. We
3264 initialize this to NULL so that when we backtrack the first time
3265 and need to test it, it's not garbage. */
3266 const char *match_end = NULL;
3267
3268 /* Used when we pop values we don't care about. */
3269 const char **reg_dummy;
3270 register_info_type *reg_info_dummy;
3271
3272 #ifdef DEBUG
3273 /* Counts the total number of registers pushed. */
3274 unsigned num_regs_pushed = 0;
3275 #endif
3276
3277 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3278
3279 INIT_FAIL_STACK ();
3280
3281 /* Do not bother to initialize all the register variables if there are
3282 no groups in the pattern, as it takes a fair amount of time. If
3283 there are groups, we include space for register 0 (the whole
3284 pattern), even though we never use it, since it simplifies the
3285 array indexing. We should fix this. */
3286 if (bufp->re_nsub)
3287 {
3288 regstart = REGEX_TALLOC (num_regs, const char *);
3289 regend = REGEX_TALLOC (num_regs, const char *);
3290 old_regstart = REGEX_TALLOC (num_regs, const char *);
3291 old_regend = REGEX_TALLOC (num_regs, const char *);
3292 best_regstart = REGEX_TALLOC (num_regs, const char *);
3293 best_regend = REGEX_TALLOC (num_regs, const char *);
3294 reg_info = REGEX_TALLOC (num_regs, register_info_type);
3295 reg_dummy = REGEX_TALLOC (num_regs, const char *);
3296 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3297
3298 if (!(regstart && regend && old_regstart && old_regend && reg_info
3299 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
3300 {
3301 FREE_VARIABLES ();
3302 return -2;
3303 }
3304 }
3305 #ifdef REGEX_MALLOC
3306 else
3307 {
3308 /* We must initialize all our variables to NULL, so that
3309 `FREE_VARIABLES' doesn't try to free them. */
3310 regstart = regend = old_regstart = old_regend = best_regstart
3311 = best_regend = reg_dummy = NULL;
3312 reg_info = reg_info_dummy = (register_info_type *) NULL;
3313 }
3314 #endif /* REGEX_MALLOC */
3315
3316 /* The starting position is bogus. */
3317 if (pos < 0 || pos > size1 + size2)
3318 {
3319 FREE_VARIABLES ();
3320 return -1;
3321 }
3322
3323 /* Initialize subexpression text positions to -1 to mark ones that no
3324 start_memory/stop_memory has been seen for. Also initialize the
3325 register information struct. */
3326 for (mcnt = 1; mcnt < num_regs; mcnt++)
3327 {
3328 regstart[mcnt] = regend[mcnt]
3329 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3330
3331 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3332 IS_ACTIVE (reg_info[mcnt]) = 0;
3333 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3334 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3335 }
3336
3337 /* We move `string1' into `string2' if the latter's empty -- but not if
3338 `string1' is null. */
3339 if (size2 == 0 && string1 != NULL)
3340 {
3341 string2 = string1;
3342 size2 = size1;
3343 string1 = 0;
3344 size1 = 0;
3345 }
3346 end1 = string1 + size1;
3347 end2 = string2 + size2;
3348
3349 /* Compute where to stop matching, within the two strings. */
3350 if (stop <= size1)
3351 {
3352 end_match_1 = string1 + stop;
3353 end_match_2 = string2;
3354 }
3355 else
3356 {
3357 end_match_1 = end1;
3358 end_match_2 = string2 + stop - size1;
3359 }
3360
3361 /* `p' scans through the pattern as `d' scans through the data.
3362 `dend' is the end of the input string that `d' points within. `d'
3363 is advanced into the following input string whenever necessary, but
3364 this happens before fetching; therefore, at the beginning of the
3365 loop, `d' can be pointing at the end of a string, but it cannot
3366 equal `string2'. */
3367 if (size1 > 0 && pos <= size1)
3368 {
3369 d = string1 + pos;
3370 dend = end_match_1;
3371 }
3372 else
3373 {
3374 d = string2 + pos - size1;
3375 dend = end_match_2;
3376 }
3377
3378 DEBUG_PRINT1 ("The compiled pattern is: ");
3379 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
3380 DEBUG_PRINT1 ("The string to match is: `");
3381 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
3382 DEBUG_PRINT1 ("'\n");
3383
3384 /* This loops over pattern commands. It exits by returning from the
3385 function if the match is complete, or it drops through if the match
3386 fails at this starting point in the input data. */
3387 for (;;)
3388 {
3389 DEBUG_PRINT2 ("\n0x%x: ", p);
3390
3391 if (p == pend)
3392 { /* End of pattern means we might have succeeded. */
3393 DEBUG_PRINT1 ("end of pattern ... ");
3394
3395 /* If we haven't matched the entire string, and we want the
3396 longest match, try backtracking. */
3397 if (d != end_match_2)
3398 {
3399 DEBUG_PRINT1 ("backtracking.\n");
3400
3401 if (!FAIL_STACK_EMPTY ())
3402 { /* More failure points to try. */
3403 boolean same_str_p = (FIRST_STRING_P (match_end)
3404 == MATCHING_IN_FIRST_STRING);
3405
3406 /* If exceeds best match so far, save it. */
3407 if (!best_regs_set
3408 || (same_str_p && d > match_end)
3409 || (!same_str_p && !MATCHING_IN_FIRST_STRING))
3410 {
3411 best_regs_set = true;
3412 match_end = d;
3413
3414 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
3415
3416 for (mcnt = 1; mcnt < num_regs; mcnt++)
3417 {
3418 best_regstart[mcnt] = regstart[mcnt];
3419 best_regend[mcnt] = regend[mcnt];
3420 }
3421 }
3422 goto fail;
3423 }
3424
3425 /* If no failure points, don't restore garbage. */
3426 else if (best_regs_set)
3427 {
3428 restore_best_regs:
3429 /* Restore best match. It may happen that `dend ==
3430 end_match_1' while the restored d is in string2.
3431 For example, the pattern `x.*y.*z' against the
3432 strings `x-' and `y-z-', if the two strings are
3433 not consecutive in memory. */
3434 DEBUG_PRINT1 ("Restoring best registers.\n");
3435
3436 d = match_end;
3437 dend = ((d >= string1 && d <= end1)
3438 ? end_match_1 : end_match_2);
3439
3440 for (mcnt = 1; mcnt < num_regs; mcnt++)
3441 {
3442 regstart[mcnt] = best_regstart[mcnt];
3443 regend[mcnt] = best_regend[mcnt];
3444 }
3445 }
3446 } /* d != end_match_2 */
3447
3448 DEBUG_PRINT1 ("Accepting match.\n");
3449
3450 /* If caller wants register contents data back, do it. */
3451 if (regs && !bufp->no_sub)
3452 {
3453 /* Have the register data arrays been allocated? */
3454 if (bufp->regs_allocated == REGS_UNALLOCATED)
3455 { /* No. So allocate them with malloc. We need one
3456 extra element beyond `num_regs' for the `-1' marker
3457 GNU code uses. */
3458 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
3459 regs->start = TALLOC (regs->num_regs, regoff_t);
3460 regs->end = TALLOC (regs->num_regs, regoff_t);
3461 if (regs->start == NULL || regs->end == NULL)
3462 return -2;
3463 bufp->regs_allocated = REGS_REALLOCATE;
3464 }
3465 else if (bufp->regs_allocated == REGS_REALLOCATE)
3466 { /* Yes. If we need more elements than were already
3467 allocated, reallocate them. If we need fewer, just
3468 leave it alone. */
3469 if (regs->num_regs < num_regs + 1)
3470 {
3471 regs->num_regs = num_regs + 1;
3472 RETALLOC (regs->start, regs->num_regs, regoff_t);
3473 RETALLOC (regs->end, regs->num_regs, regoff_t);
3474 if (regs->start == NULL || regs->end == NULL)
3475 return -2;
3476 }
3477 }
3478 else
3479 assert (bufp->regs_allocated == REGS_FIXED);
3480
3481 /* Convert the pointer data in `regstart' and `regend' to
3482 indices. Register zero has to be set differently,
3483 since we haven't kept track of any info for it. */
3484 if (regs->num_regs > 0)
3485 {
3486 regs->start[0] = pos;
3487 regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1
3488 : d - string2 + size1);
3489 }
3490
3491 /* Go through the first `min (num_regs, regs->num_regs)'
3492 registers, since that is all we initialized. */
3493 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
3494 {
3495 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
3496 regs->start[mcnt] = regs->end[mcnt] = -1;
3497 else
3498 {
3499 regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]);
3500 regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]);
3501 }
3502 }
3503
3504 /* If the regs structure we return has more elements than
3505 were in the pattern, set the extra elements to -1. If
3506 we (re)allocated the registers, this is the case,
3507 because we always allocate enough to have at least one
3508 -1 at the end. */
3509 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
3510 regs->start[mcnt] = regs->end[mcnt] = -1;
3511 } /* regs && !bufp->no_sub */
3512
3513 FREE_VARIABLES ();
3514 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
3515 nfailure_points_pushed, nfailure_points_popped,
3516 nfailure_points_pushed - nfailure_points_popped);
3517 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
3518
3519 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
3520 ? string1
3521 : string2 - size1);
3522
3523 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
3524
3525 return mcnt;
3526 }
3527
3528 /* Otherwise match next pattern command. */
3529 #ifdef SWITCH_ENUM_BUG
3530 switch ((int) ((re_opcode_t) *p++))
3531 #else
3532 switch ((re_opcode_t) *p++)
3533 #endif
3534 {
3535 /* Ignore these. Used to ignore the n of succeed_n's which
3536 currently have n == 0. */
3537 case no_op:
3538 DEBUG_PRINT1 ("EXECUTING no_op.\n");
3539 break;
3540
3541
3542 /* Match the next n pattern characters exactly. The following
3543 byte in the pattern defines n, and the n bytes after that
3544 are the characters to match. */
3545 case exactn:
3546 mcnt = *p++;
3547 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
3548
3549 /* This is written out as an if-else so we don't waste time
3550 testing `translate' inside the loop. */
3551 if (translate)
3552 {
3553 do
3554 {
3555 PREFETCH ();
3556 if (translate[(unsigned char) *d++] != (char) *p++)
3557 goto fail;
3558 }
3559 while (--mcnt);
3560 }
3561 else
3562 {
3563 do
3564 {
3565 PREFETCH ();
3566 if (*d++ != (char) *p++) goto fail;
3567 }
3568 while (--mcnt);
3569 }
3570 SET_REGS_MATCHED ();
3571 break;
3572
3573
3574 /* Match any character except possibly a newline or a null. */
3575 case anychar:
3576 DEBUG_PRINT1 ("EXECUTING anychar.\n");
3577
3578 PREFETCH ();
3579
3580 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
3581 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
3582 goto fail;
3583
3584 SET_REGS_MATCHED ();
3585 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
3586 d++;
3587 break;
3588
3589
3590 case charset:
3591 case charset_not:
3592 {
3593 register unsigned char c;
3594 boolean not = (re_opcode_t) *(p - 1) == charset_not;
3595
3596 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
3597
3598 PREFETCH ();
3599 c = TRANSLATE (*d); /* The character to match. */
3600
3601 /* Cast to `unsigned' instead of `unsigned char' in case the
3602 bit list is a full 32 bytes long. */
3603 if (c < (unsigned) (*p * BYTEWIDTH)
3604 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3605 not = !not;
3606
3607 p += 1 + *p;
3608
3609 if (!not) goto fail;
3610
3611 SET_REGS_MATCHED ();
3612 d++;
3613 break;
3614 }
3615
3616
3617 /* The beginning of a group is represented by start_memory.
3618 The arguments are the register number in the next byte, and the
3619 number of groups inner to this one in the next. The text
3620 matched within the group is recorded (in the internal
3621 registers data structure) under the register number. */
3622 case start_memory:
3623 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
3624
3625 /* Find out if this group can match the empty string. */
3626 p1 = p; /* To send to group_match_null_string_p. */
3627
3628 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
3629 REG_MATCH_NULL_STRING_P (reg_info[*p])
3630 = group_match_null_string_p (&p1, pend, reg_info);
3631
3632 /* Save the position in the string where we were the last time
3633 we were at this open-group operator in case the group is
3634 operated upon by a repetition operator, e.g., with `(a*)*b'
3635 against `ab'; then we want to ignore where we are now in
3636 the string in case this attempt to match fails. */
3637 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3638 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
3639 : regstart[*p];
3640 DEBUG_PRINT2 (" old_regstart: %d\n",
3641 POINTER_TO_OFFSET (old_regstart[*p]));
3642
3643 regstart[*p] = d;
3644 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
3645
3646 IS_ACTIVE (reg_info[*p]) = 1;
3647 MATCHED_SOMETHING (reg_info[*p]) = 0;
3648
3649 /* This is the new highest active register. */
3650 highest_active_reg = *p;
3651
3652 /* If nothing was active before, this is the new lowest active
3653 register. */
3654 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3655 lowest_active_reg = *p;
3656
3657 /* Move past the register number and inner group count. */
3658 p += 2;
3659 break;
3660
3661
3662 /* The stop_memory opcode represents the end of a group. Its
3663 arguments are the same as start_memory's: the register
3664 number, and the number of inner groups. */
3665 case stop_memory:
3666 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
3667
3668 /* We need to save the string position the last time we were at
3669 this close-group operator in case the group is operated
3670 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
3671 against `aba'; then we want to ignore where we are now in
3672 the string in case this attempt to match fails. */
3673 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3674 ? REG_UNSET (regend[*p]) ? d : regend[*p]
3675 : regend[*p];
3676 DEBUG_PRINT2 (" old_regend: %d\n",
3677 POINTER_TO_OFFSET (old_regend[*p]));
3678
3679 regend[*p] = d;
3680 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
3681
3682 /* This register isn't active anymore. */
3683 IS_ACTIVE (reg_info[*p]) = 0;
3684
3685 /* If this was the only register active, nothing is active
3686 anymore. */
3687 if (lowest_active_reg == highest_active_reg)
3688 {
3689 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3690 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3691 }
3692 else
3693 { /* We must scan for the new highest active register, since
3694 it isn't necessarily one less than now: consider
3695 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
3696 new highest active register is 1. */
3697 unsigned char r = *p - 1;
3698 while (r > 0 && !IS_ACTIVE (reg_info[r]))
3699 r--;
3700
3701 /* If we end up at register zero, that means that we saved
3702 the registers as the result of an `on_failure_jump', not
3703 a `start_memory', and we jumped to past the innermost
3704 `stop_memory'. For example, in ((.)*) we save
3705 registers 1 and 2 as a result of the *, but when we pop
3706 back to the second ), we are at the stop_memory 1.
3707 Thus, nothing is active. */
3708 if (r == 0)
3709 {
3710 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3711 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3712 }
3713 else
3714 highest_active_reg = r;
3715 }
3716
3717 /* If just failed to match something this time around with a
3718 group that's operated on by a repetition operator, try to
3719 force exit from the ``loop'', and restore the register
3720 information for this group that we had before trying this
3721 last match. */
3722 if ((!MATCHED_SOMETHING (reg_info[*p])
3723 || (re_opcode_t) p[-3] == start_memory)
3724 && (p + 2) < pend)
3725 {
3726 boolean is_a_jump_n = false;
3727
3728 p1 = p + 2;
3729 mcnt = 0;
3730 switch ((re_opcode_t) *p1++)
3731 {
3732 case jump_n:
3733 is_a_jump_n = true;
3734 case pop_failure_jump:
3735 case maybe_pop_jump:
3736 case jump:
3737 case dummy_failure_jump:
3738 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3739 if (is_a_jump_n)
3740 p1 += 2;
3741 break;
3742
3743 default:
3744 /* do nothing */ ;
3745 }
3746 p1 += mcnt;
3747
3748 /* If the next operation is a jump backwards in the pattern
3749 to an on_failure_jump right before the start_memory
3750 corresponding to this stop_memory, exit from the loop
3751 by forcing a failure after pushing on the stack the
3752 on_failure_jump's jump in the pattern, and d. */
3753 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
3754 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
3755 {
3756 /* If this group ever matched anything, then restore
3757 what its registers were before trying this last
3758 failed match, e.g., with `(a*)*b' against `ab' for
3759 regstart[1], and, e.g., with `((a*)*(b*)*)*'
3760 against `aba' for regend[3].
3761
3762 Also restore the registers for inner groups for,
3763 e.g., `((a*)(b*))*' against `aba' (register 3 would
3764 otherwise get trashed). */
3765
3766 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
3767 {
3768 unsigned r;
3769
3770 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
3771
3772 /* Restore this and inner groups' (if any) registers. */
3773 for (r = *p; r < *p + *(p + 1); r++)
3774 {
3775 regstart[r] = old_regstart[r];
3776
3777 /* xx why this test? */
3778 if ((long) old_regend[r] >= (long) regstart[r])
3779 regend[r] = old_regend[r];
3780 }
3781 }
3782 p1++;
3783 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3784 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
3785
3786 goto fail;
3787 }
3788 }
3789
3790 /* Move past the register number and the inner group count. */
3791 p += 2;
3792 break;
3793
3794
3795 /* \<digit> has been turned into a `duplicate' command which is
3796 followed by the numeric value of <digit> as the register number. */
3797 case duplicate:
3798 {
3799 register const char *d2, *dend2;
3800 int regno = *p++; /* Get which register to match against. */
3801 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
3802
3803 /* Can't back reference a group which we've never matched. */
3804 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
3805 goto fail;
3806
3807 /* Where in input to try to start matching. */
3808 d2 = regstart[regno];
3809
3810 /* Where to stop matching; if both the place to start and
3811 the place to stop matching are in the same string, then
3812 set to the place to stop, otherwise, for now have to use
3813 the end of the first string. */
3814
3815 dend2 = ((FIRST_STRING_P (regstart[regno])
3816 == FIRST_STRING_P (regend[regno]))
3817 ? regend[regno] : end_match_1);
3818 for (;;)
3819 {
3820 /* If necessary, advance to next segment in register
3821 contents. */
3822 while (d2 == dend2)
3823 {
3824 if (dend2 == end_match_2) break;
3825 if (dend2 == regend[regno]) break;
3826
3827 /* End of string1 => advance to string2. */
3828 d2 = string2;
3829 dend2 = regend[regno];
3830 }
3831 /* At end of register contents => success */
3832 if (d2 == dend2) break;
3833
3834 /* If necessary, advance to next segment in data. */
3835 PREFETCH ();
3836
3837 /* How many characters left in this segment to match. */
3838 mcnt = dend - d;
3839
3840 /* Want how many consecutive characters we can match in
3841 one shot, so, if necessary, adjust the count. */
3842 if (mcnt > dend2 - d2)
3843 mcnt = dend2 - d2;
3844
3845 /* Compare that many; failure if mismatch, else move
3846 past them. */
3847 if (translate
3848 ? bcmp_translate (d, d2, mcnt, translate)
3849 : bcmp (d, d2, mcnt))
3850 goto fail;
3851 d += mcnt, d2 += mcnt;
3852 }
3853 }
3854 break;
3855
3856
3857 /* begline matches the empty string at the beginning of the string
3858 (unless `not_bol' is set in `bufp'), and, if
3859 `newline_anchor' is set, after newlines. */
3860 case begline:
3861 DEBUG_PRINT1 ("EXECUTING begline.\n");
3862
3863 if (AT_STRINGS_BEG (d))
3864 {
3865 if (!bufp->not_bol) break;
3866 }
3867 else if (d[-1] == '\n' && bufp->newline_anchor)
3868 {
3869 break;
3870 }
3871 /* In all other cases, we fail. */
3872 goto fail;
3873
3874
3875 /* endline is the dual of begline. */
3876 case endline:
3877 DEBUG_PRINT1 ("EXECUTING endline.\n");
3878
3879 if (AT_STRINGS_END (d))
3880 {
3881 if (!bufp->not_eol) break;
3882 }
3883
3884 /* We have to ``prefetch'' the next character. */
3885 else if ((d == end1 ? *string2 : *d) == '\n'
3886 && bufp->newline_anchor)
3887 {
3888 break;
3889 }
3890 goto fail;
3891
3892
3893 /* Match at the very beginning of the data. */
3894 case begbuf:
3895 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
3896 if (AT_STRINGS_BEG (d))
3897 break;
3898 goto fail;
3899
3900
3901 /* Match at the very end of the data. */
3902 case endbuf:
3903 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
3904 if (AT_STRINGS_END (d))
3905 break;
3906 goto fail;
3907
3908
3909 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
3910 pushes NULL as the value for the string on the stack. Then
3911 `pop_failure_point' will keep the current value for the
3912 string, instead of restoring it. To see why, consider
3913 matching `foo\nbar' against `.*\n'. The .* matches the foo;
3914 then the . fails against the \n. But the next thing we want
3915 to do is match the \n against the \n; if we restored the
3916 string value, we would be back at the foo.
3917
3918 Because this is used only in specific cases, we don't need to
3919 check all the things that `on_failure_jump' does, to make
3920 sure the right things get saved on the stack. Hence we don't
3921 share its code. The only reason to push anything on the
3922 stack at all is that otherwise we would have to change
3923 `anychar's code to do something besides goto fail in this
3924 case; that seems worse than this. */
3925 case on_failure_keep_string_jump:
3926 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
3927
3928 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3929 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
3930
3931 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
3932 break;
3933
3934
3935 /* Uses of on_failure_jump:
3936
3937 Each alternative starts with an on_failure_jump that points
3938 to the beginning of the next alternative. Each alternative
3939 except the last ends with a jump that in effect jumps past
3940 the rest of the alternatives. (They really jump to the
3941 ending jump of the following alternative, because tensioning
3942 these jumps is a hassle.)
3943
3944 Repeats start with an on_failure_jump that points past both
3945 the repetition text and either the following jump or
3946 pop_failure_jump back to this on_failure_jump. */
3947 case on_failure_jump:
3948 on_failure:
3949 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
3950
3951 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3952 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
3953
3954 /* If this on_failure_jump comes right before a group (i.e.,
3955 the original * applied to a group), save the information
3956 for that group and all inner ones, so that if we fail back
3957 to this point, the group's information will be correct.
3958 For example, in \(a*\)*\1, we need the preceding group,
3959 and in \(\(a*\)b*\)\2, we need the inner group. */
3960
3961 /* We can't use `p' to check ahead because we push
3962 a failure point to `p + mcnt' after we do this. */
3963 p1 = p;
3964
3965 /* We need to skip no_op's before we look for the
3966 start_memory in case this on_failure_jump is happening as
3967 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
3968 against aba. */
3969 while (p1 < pend && (re_opcode_t) *p1 == no_op)
3970 p1++;
3971
3972 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
3973 {
3974 /* We have a new highest active register now. This will
3975 get reset at the start_memory we are about to get to,
3976 but we will have saved all the registers relevant to
3977 this repetition op, as described above. */
3978 highest_active_reg = *(p1 + 1) + *(p1 + 2);
3979 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3980 lowest_active_reg = *(p1 + 1);
3981 }
3982
3983 DEBUG_PRINT1 (":\n");
3984 PUSH_FAILURE_POINT (p + mcnt, d, -2);
3985 break;
3986
3987
3988 /* A smart repeat ends with `maybe_pop_jump'.
3989 We change it to either `pop_failure_jump' or `jump'. */
3990 case maybe_pop_jump:
3991 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3992 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
3993 {
3994 register unsigned char *p2 = p;
3995
3996 /* Compare the beginning of the repeat with what in the
3997 pattern follows its end. If we can establish that there
3998 is nothing that they would both match, i.e., that we
3999 would have to backtrack because of (as in, e.g., `a*a')
4000 then we can change to pop_failure_jump, because we'll
4001 never have to backtrack.
4002
4003 This is not true in the case of alternatives: in
4004 `(a|ab)*' we do need to backtrack to the `ab' alternative
4005 (e.g., if the string was `ab'). But instead of trying to
4006 detect that here, the alternative has put on a dummy
4007 failure point which is what we will end up popping. */
4008
4009 /* Skip over open/close-group commands. */
4010 while (p2 + 2 < pend
4011 && ((re_opcode_t) *p2 == stop_memory
4012 || (re_opcode_t) *p2 == start_memory))
4013 p2 += 3; /* Skip over args, too. */
4014
4015 /* If we're at the end of the pattern, we can change. */
4016 if (p2 == pend)
4017 {
4018 /* Consider what happens when matching ":\(.*\)"
4019 against ":/". I don't really understand this code
4020 yet. */
4021 p[-3] = (unsigned char) pop_failure_jump;
4022 DEBUG_PRINT1
4023 (" End of pattern: change to `pop_failure_jump'.\n");
4024 }
4025
4026 else if ((re_opcode_t) *p2 == exactn
4027 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4028 {
4029 register unsigned char c
4030 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4031 p1 = p + mcnt;
4032
4033 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4034 to the `maybe_finalize_jump' of this case. Examine what
4035 follows. */
4036 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4037 {
4038 p[-3] = (unsigned char) pop_failure_jump;
4039 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4040 c, p1[5]);
4041 }
4042
4043 else if ((re_opcode_t) p1[3] == charset
4044 || (re_opcode_t) p1[3] == charset_not)
4045 {
4046 int not = (re_opcode_t) p1[3] == charset_not;
4047
4048 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4049 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4050 not = !not;
4051
4052 /* `not' is equal to 1 if c would match, which means
4053 that we can't change to pop_failure_jump. */
4054 if (!not)
4055 {
4056 p[-3] = (unsigned char) pop_failure_jump;
4057 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4058 }
4059 }
4060 }
4061 }
4062 p -= 2; /* Point at relative address again. */
4063 if ((re_opcode_t) p[-1] != pop_failure_jump)
4064 {
4065 p[-1] = (unsigned char) jump;
4066 DEBUG_PRINT1 (" Match => jump.\n");
4067 goto unconditional_jump;
4068 }
4069 /* Note fall through. */
4070
4071
4072 /* The end of a simple repeat has a pop_failure_jump back to
4073 its matching on_failure_jump, where the latter will push a
4074 failure point. The pop_failure_jump takes off failure
4075 points put on by this pop_failure_jump's matching
4076 on_failure_jump; we got through the pattern to here from the
4077 matching on_failure_jump, so didn't fail. */
4078 case pop_failure_jump:
4079 {
4080 /* We need to pass separate storage for the lowest and
4081 highest registers, even though we don't care about the
4082 actual values. Otherwise, we will restore only one
4083 register from the stack, since lowest will == highest in
4084 `pop_failure_point'. */
4085 unsigned dummy_low_reg, dummy_high_reg;
4086 unsigned char *pdummy;
4087 const char *sdummy;
4088
4089 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4090 POP_FAILURE_POINT (sdummy, pdummy,
4091 dummy_low_reg, dummy_high_reg,
4092 reg_dummy, reg_dummy, reg_info_dummy);
4093 }
4094 /* Note fall through. */
4095
4096
4097 /* Unconditionally jump (without popping any failure points). */
4098 case jump:
4099 unconditional_jump:
4100 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4101 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4102 p += mcnt; /* Do the jump. */
4103 DEBUG_PRINT2 ("(to 0x%x).\n", p);
4104 break;
4105
4106
4107 /* We need this opcode so we can detect where alternatives end
4108 in `group_match_null_string_p' et al. */
4109 case jump_past_alt:
4110 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4111 goto unconditional_jump;
4112
4113
4114 /* Normally, the on_failure_jump pushes a failure point, which
4115 then gets popped at pop_failure_jump. We will end up at
4116 pop_failure_jump, also, and with a pattern of, say, `a+', we
4117 are skipping over the on_failure_jump, so we have to push
4118 something meaningless for pop_failure_jump to pop. */
4119 case dummy_failure_jump:
4120 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4121 /* It doesn't matter what we push for the string here. What
4122 the code at `fail' tests is the value for the pattern. */
4123 PUSH_FAILURE_POINT (0, 0, -2);
4124 goto unconditional_jump;
4125
4126
4127 /* At the end of an alternative, we need to push a dummy failure
4128 point in case we are followed by a `pop_failure_jump', because
4129 we don't want the failure point for the alternative to be
4130 popped. For example, matching `(a|ab)*' against `aab'
4131 requires that we match the `ab' alternative. */
4132 case push_dummy_failure:
4133 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4134 /* See comments just above at `dummy_failure_jump' about the
4135 two zeroes. */
4136 PUSH_FAILURE_POINT (0, 0, -2);
4137 break;
4138
4139 /* Have to succeed matching what follows at least n times.
4140 After that, handle like `on_failure_jump'. */
4141 case succeed_n:
4142 EXTRACT_NUMBER (mcnt, p + 2);
4143 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4144
4145 assert (mcnt >= 0);
4146 /* Originally, this is how many times we HAVE to succeed. */
4147 if (mcnt > 0)
4148 {
4149 mcnt--;
4150 p += 2;
4151 STORE_NUMBER_AND_INCR (p, mcnt);
4152 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
4153 }
4154 else if (mcnt == 0)
4155 {
4156 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
4157 p[2] = (unsigned char) no_op;
4158 p[3] = (unsigned char) no_op;
4159 goto on_failure;
4160 }
4161 break;
4162
4163 case jump_n:
4164 EXTRACT_NUMBER (mcnt, p + 2);
4165 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4166
4167 /* Originally, this is how many times we CAN jump. */
4168 if (mcnt)
4169 {
4170 mcnt--;
4171 STORE_NUMBER (p + 2, mcnt);
4172 goto unconditional_jump;
4173 }
4174 /* If don't have to jump any more, skip over the rest of command. */
4175 else
4176 p += 4;
4177 break;
4178
4179 case set_number_at:
4180 {
4181 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4182
4183 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4184 p1 = p + mcnt;
4185 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4186 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
4187 STORE_NUMBER (p1, mcnt);
4188 break;
4189 }
4190
4191 case wordbound:
4192 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
4193 if (AT_WORD_BOUNDARY (d))
4194 break;
4195 goto fail;
4196
4197 case notwordbound:
4198 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
4199 if (AT_WORD_BOUNDARY (d))
4200 goto fail;
4201 break;
4202
4203 case wordbeg:
4204 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
4205 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
4206 break;
4207 goto fail;
4208
4209 case wordend:
4210 DEBUG_PRINT1 ("EXECUTING wordend.\n");
4211 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
4212 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
4213 break;
4214 goto fail;
4215
4216 #ifdef emacs
4217 #ifdef emacs19
4218 case before_dot:
4219 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
4220 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
4221 goto fail;
4222 break;
4223
4224 case at_dot:
4225 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4226 if (PTR_CHAR_POS ((unsigned char *) d) != point)
4227 goto fail;
4228 break;
4229
4230 case after_dot:
4231 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
4232 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
4233 goto fail;
4234 break;
4235 #else /* not emacs19 */
4236 case at_dot:
4237 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4238 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point)
4239 goto fail;
4240 break;
4241 #endif /* not emacs19 */
4242
4243 case syntaxspec:
4244 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
4245 mcnt = *p++;
4246 goto matchsyntax;
4247
4248 case wordchar:
4249 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
4250 mcnt = (int) Sword;
4251 matchsyntax:
4252 PREFETCH ();
4253 if (SYNTAX (*d++) != (enum syntaxcode) mcnt)
4254 goto fail;
4255 SET_REGS_MATCHED ();
4256 break;
4257
4258 case notsyntaxspec:
4259 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
4260 mcnt = *p++;
4261 goto matchnotsyntax;
4262
4263 case notwordchar:
4264 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
4265 mcnt = (int) Sword;
4266 matchnotsyntax:
4267 PREFETCH ();
4268 if (SYNTAX (*d++) == (enum syntaxcode) mcnt)
4269 goto fail;
4270 SET_REGS_MATCHED ();
4271 break;
4272
4273 #else /* not emacs */
4274 case wordchar:
4275 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
4276 PREFETCH ();
4277 if (!WORDCHAR_P (d))
4278 goto fail;
4279 SET_REGS_MATCHED ();
4280 d++;
4281 break;
4282
4283 case notwordchar:
4284 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
4285 PREFETCH ();
4286 if (WORDCHAR_P (d))
4287 goto fail;
4288 SET_REGS_MATCHED ();
4289 d++;
4290 break;
4291 #endif /* not emacs */
4292
4293 default:
4294 abort ();
4295 }
4296 continue; /* Successfully executed one pattern command; keep going. */
4297
4298
4299 /* We goto here if a matching operation fails. */
4300 fail:
4301 if (!FAIL_STACK_EMPTY ())
4302 { /* A restart point is known. Restore to that state. */
4303 DEBUG_PRINT1 ("\nFAIL:\n");
4304 POP_FAILURE_POINT (d, p,
4305 lowest_active_reg, highest_active_reg,
4306 regstart, regend, reg_info);
4307
4308 /* If this failure point is a dummy, try the next one. */
4309 if (!p)
4310 goto fail;
4311
4312 /* If we failed to the end of the pattern, don't examine *p. */
4313 assert (p <= pend);
4314 if (p < pend)
4315 {
4316 boolean is_a_jump_n = false;
4317
4318 /* If failed to a backwards jump that's part of a repetition
4319 loop, need to pop this failure point and use the next one. */
4320 switch ((re_opcode_t) *p)
4321 {
4322 case jump_n:
4323 is_a_jump_n = true;
4324 case maybe_pop_jump:
4325 case pop_failure_jump:
4326 case jump:
4327 p1 = p + 1;
4328 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4329 p1 += mcnt;
4330
4331 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
4332 || (!is_a_jump_n
4333 && (re_opcode_t) *p1 == on_failure_jump))
4334 goto fail;
4335 break;
4336 default:
4337 /* do nothing */ ;
4338 }
4339 }
4340
4341 if (d >= string1 && d <= end1)
4342 dend = end_match_1;
4343 }
4344 else
4345 break; /* Matching at this starting point really fails. */
4346 } /* for (;;) */
4347
4348 if (best_regs_set)
4349 goto restore_best_regs;
4350
4351 FREE_VARIABLES ();
4352
4353 return -1; /* Failure to match. */
4354 } /* re_match_2 */
4355
4356 /* Subroutine definitions for re_match_2. */
4357
4358
4359 /* We are passed P pointing to a register number after a start_memory.
4360
4361 Return true if the pattern up to the corresponding stop_memory can
4362 match the empty string, and false otherwise.
4363
4364 If we find the matching stop_memory, sets P to point to one past its number.
4365 Otherwise, sets P to an undefined byte less than or equal to END.
4366
4367 We don't handle duplicates properly (yet). */
4368
4369 static boolean
4370 group_match_null_string_p (p, end, reg_info)
4371 unsigned char **p, *end;
4372 register_info_type *reg_info;
4373 {
4374 int mcnt;
4375 /* Point to after the args to the start_memory. */
4376 unsigned char *p1 = *p + 2;
4377
4378 while (p1 < end)
4379 {
4380 /* Skip over opcodes that can match nothing, and return true or
4381 false, as appropriate, when we get to one that can't, or to the
4382 matching stop_memory. */
4383
4384 switch ((re_opcode_t) *p1)
4385 {
4386 /* Could be either a loop or a series of alternatives. */
4387 case on_failure_jump:
4388 p1++;
4389 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4390
4391 /* If the next operation is not a jump backwards in the
4392 pattern. */
4393
4394 if (mcnt >= 0)
4395 {
4396 /* Go through the on_failure_jumps of the alternatives,
4397 seeing if any of the alternatives cannot match nothing.
4398 The last alternative starts with only a jump,
4399 whereas the rest start with on_failure_jump and end
4400 with a jump, e.g., here is the pattern for `a|b|c':
4401
4402 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
4403 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
4404 /exactn/1/c
4405
4406 So, we have to first go through the first (n-1)
4407 alternatives and then deal with the last one separately. */
4408
4409
4410 /* Deal with the first (n-1) alternatives, which start
4411 with an on_failure_jump (see above) that jumps to right
4412 past a jump_past_alt. */
4413
4414 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
4415 {
4416 /* `mcnt' holds how many bytes long the alternative
4417 is, including the ending `jump_past_alt' and
4418 its number. */
4419
4420 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
4421 reg_info))
4422 return false;
4423
4424 /* Move to right after this alternative, including the
4425 jump_past_alt. */
4426 p1 += mcnt;
4427
4428 /* Break if it's the beginning of an n-th alternative
4429 that doesn't begin with an on_failure_jump. */
4430 if ((re_opcode_t) *p1 != on_failure_jump)
4431 break;
4432
4433 /* Still have to check that it's not an n-th
4434 alternative that starts with an on_failure_jump. */
4435 p1++;
4436 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4437 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
4438 {
4439 /* Get to the beginning of the n-th alternative. */
4440 p1 -= 3;
4441 break;
4442 }
4443 }
4444
4445 /* Deal with the last alternative: go back and get number
4446 of the `jump_past_alt' just before it. `mcnt' contains
4447 the length of the alternative. */
4448 EXTRACT_NUMBER (mcnt, p1 - 2);
4449
4450 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
4451 return false;
4452
4453 p1 += mcnt; /* Get past the n-th alternative. */
4454 } /* if mcnt > 0 */
4455 break;
4456
4457
4458 case stop_memory:
4459 assert (p1[1] == **p);
4460 *p = p1 + 2;
4461 return true;
4462
4463
4464 default:
4465 if (!common_op_match_null_string_p (&p1, end, reg_info))
4466 return false;
4467 }
4468 } /* while p1 < end */
4469
4470 return false;
4471 } /* group_match_null_string_p */
4472
4473
4474 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
4475 It expects P to be the first byte of a single alternative and END one
4476 byte past the last. The alternative can contain groups. */
4477
4478 static boolean
4479 alt_match_null_string_p (p, end, reg_info)
4480 unsigned char *p, *end;
4481 register_info_type *reg_info;
4482 {
4483 int mcnt;
4484 unsigned char *p1 = p;
4485
4486 while (p1 < end)
4487 {
4488 /* Skip over opcodes that can match nothing, and break when we get
4489 to one that can't. */
4490
4491 switch ((re_opcode_t) *p1)
4492 {
4493 /* It's a loop. */
4494 case on_failure_jump:
4495 p1++;
4496 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4497 p1 += mcnt;
4498 break;
4499
4500 default:
4501 if (!common_op_match_null_string_p (&p1, end, reg_info))
4502 return false;
4503 }
4504 } /* while p1 < end */
4505
4506 return true;
4507 } /* alt_match_null_string_p */
4508
4509
4510 /* Deals with the ops common to group_match_null_string_p and
4511 alt_match_null_string_p.
4512
4513 Sets P to one after the op and its arguments, if any. */
4514
4515 static boolean
4516 common_op_match_null_string_p (p, end, reg_info)
4517 unsigned char **p, *end;
4518 register_info_type *reg_info;
4519 {
4520 int mcnt;
4521 boolean ret;
4522 int reg_no;
4523 unsigned char *p1 = *p;
4524
4525 switch ((re_opcode_t) *p1++)
4526 {
4527 case no_op:
4528 case begline:
4529 case endline:
4530 case begbuf:
4531 case endbuf:
4532 case wordbeg:
4533 case wordend:
4534 case wordbound:
4535 case notwordbound:
4536 #ifdef emacs
4537 case before_dot:
4538 case at_dot:
4539 case after_dot:
4540 #endif
4541 break;
4542
4543 case start_memory:
4544 reg_no = *p1;
4545 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
4546 ret = group_match_null_string_p (&p1, end, reg_info);
4547
4548 /* Have to set this here in case we're checking a group which
4549 contains a group and a back reference to it. */
4550
4551 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
4552 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
4553
4554 if (!ret)
4555 return false;
4556 break;
4557
4558 /* If this is an optimized succeed_n for zero times, make the jump. */
4559 case jump:
4560 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4561 if (mcnt >= 0)
4562 p1 += mcnt;
4563 else
4564 return false;
4565 break;
4566
4567 case succeed_n:
4568 /* Get to the number of times to succeed. */
4569 p1 += 2;
4570 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4571
4572 if (mcnt == 0)
4573 {
4574 p1 -= 4;
4575 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4576 p1 += mcnt;
4577 }
4578 else
4579 return false;
4580 break;
4581
4582 case duplicate:
4583 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
4584 return false;
4585 break;
4586
4587 case set_number_at:
4588 p1 += 4;
4589
4590 default:
4591 /* All other opcodes mean we cannot match the empty string. */
4592 return false;
4593 }
4594
4595 *p = p1;
4596 return true;
4597 } /* common_op_match_null_string_p */
4598
4599
4600 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
4601 bytes; nonzero otherwise. */
4602
4603 static int
4604 bcmp_translate (s1, s2, len, translate)
4605 unsigned char *s1, *s2;
4606 register int len;
4607 char *translate;
4608 {
4609 register unsigned char *p1 = s1, *p2 = s2;
4610 while (len)
4611 {
4612 if (translate[*p1++] != translate[*p2++]) return 1;
4613 len--;
4614 }
4615 return 0;
4616 }
4617
4618 /* Entry points for GNU code. */
4619
4620 /* re_compile_pattern is the GNU regular expression compiler: it
4621 compiles PATTERN (of length SIZE) and puts the result in BUFP.
4622 Returns 0 if the pattern was valid, otherwise an error string.
4623
4624 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
4625 are set in BUFP on entry.
4626
4627 We call regex_compile to do the actual compilation. */
4628
4629 const char *
4630 re_compile_pattern (pattern, length, bufp)
4631 const char *pattern;
4632 int length;
4633 struct re_pattern_buffer *bufp;
4634 {
4635 reg_errcode_t ret;
4636
4637 /* GNU code is written to assume at least RE_NREGS registers will be set
4638 (and at least one extra will be -1). */
4639 bufp->regs_allocated = REGS_UNALLOCATED;
4640
4641 /* And GNU code determines whether or not to get register information
4642 by passing null for the REGS argument to re_match, etc., not by
4643 setting no_sub. */
4644 bufp->no_sub = 0;
4645
4646 /* Match anchors at newline. */
4647 bufp->newline_anchor = 1;
4648
4649 ret = regex_compile (pattern, length, re_syntax_options, bufp);
4650
4651 return re_error_msg[(int) ret];
4652 }
4653
4654 /* Entry points compatible with 4.2 BSD regex library. We don't define
4655 them if this is an Emacs or POSIX compilation. */
4656
4657 #if !defined (emacs) && !defined (_POSIX_SOURCE)
4658
4659 /* BSD has one and only one pattern buffer. */
4660 static struct re_pattern_buffer re_comp_buf;
4661
4662 char *
4663 re_comp (s)
4664 const char *s;
4665 {
4666 reg_errcode_t ret;
4667
4668 if (!s)
4669 {
4670 if (!re_comp_buf.buffer)
4671 return "No previous regular expression";
4672 return 0;
4673 }
4674
4675 if (!re_comp_buf.buffer)
4676 {
4677 re_comp_buf.buffer = (unsigned char *) malloc (200);
4678 if (re_comp_buf.buffer == NULL)
4679 return "Memory exhausted";
4680 re_comp_buf.allocated = 200;
4681
4682 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
4683 if (re_comp_buf.fastmap == NULL)
4684 return "Memory exhausted";
4685 }
4686
4687 /* Since `re_exec' always passes NULL for the `regs' argument, we
4688 don't need to initialize the pattern buffer fields which affect it. */
4689
4690 /* Match anchors at newlines. */
4691 re_comp_buf.newline_anchor = 1;
4692
4693 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
4694
4695 /* Yes, we're discarding `const' here. */
4696 return (char *) re_error_msg[(int) ret];
4697 }
4698
4699
4700 int
4701 re_exec (s)
4702 const char *s;
4703 {
4704 const int len = strlen (s);
4705 return
4706 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
4707 }
4708 #endif /* not emacs and not _POSIX_SOURCE */
4709
4710 /* POSIX.2 functions. Don't define these for Emacs. */
4711
4712 #ifndef emacs
4713
4714 /* regcomp takes a regular expression as a string and compiles it.
4715
4716 PREG is a regex_t *. We do not expect any fields to be initialized,
4717 since POSIX says we shouldn't. Thus, we set
4718
4719 `buffer' to the compiled pattern;
4720 `used' to the length of the compiled pattern;
4721 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
4722 REG_EXTENDED bit in CFLAGS is set; otherwise, to
4723 RE_SYNTAX_POSIX_BASIC;
4724 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
4725 `fastmap' and `fastmap_accurate' to zero;
4726 `re_nsub' to the number of subexpressions in PATTERN.
4727
4728 PATTERN is the address of the pattern string.
4729
4730 CFLAGS is a series of bits which affect compilation.
4731
4732 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
4733 use POSIX basic syntax.
4734
4735 If REG_NEWLINE is set, then . and [^...] don't match newline.
4736 Also, regexec will try a match beginning after every newline.
4737
4738 If REG_ICASE is set, then we considers upper- and lowercase
4739 versions of letters to be equivalent when matching.
4740
4741 If REG_NOSUB is set, then when PREG is passed to regexec, that
4742 routine will report only success or failure, and nothing about the
4743 registers.
4744
4745 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
4746 the return codes and their meanings.) */
4747
4748 int
4749 regcomp (preg, pattern, cflags)
4750 regex_t *preg;
4751 const char *pattern;
4752 int cflags;
4753 {
4754 reg_errcode_t ret;
4755 unsigned syntax
4756 = (cflags & REG_EXTENDED) ?
4757 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
4758
4759 /* regex_compile will allocate the space for the compiled pattern. */
4760 preg->buffer = 0;
4761 preg->allocated = 0;
4762
4763 /* Don't bother to use a fastmap when searching. This simplifies the
4764 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
4765 characters after newlines into the fastmap. This way, we just try
4766 every character. */
4767 preg->fastmap = 0;
4768
4769 if (cflags & REG_ICASE)
4770 {
4771 unsigned i;
4772
4773 preg->translate = (char *) malloc (CHAR_SET_SIZE);
4774 if (preg->translate == NULL)
4775 return (int) REG_ESPACE;
4776
4777 /* Map uppercase characters to corresponding lowercase ones. */
4778 for (i = 0; i < CHAR_SET_SIZE; i++)
4779 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
4780 }
4781 else
4782 preg->translate = NULL;
4783
4784 /* If REG_NEWLINE is set, newlines are treated differently. */
4785 if (cflags & REG_NEWLINE)
4786 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
4787 syntax &= ~RE_DOT_NEWLINE;
4788 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
4789 /* It also changes the matching behavior. */
4790 preg->newline_anchor = 1;
4791 }
4792 else
4793 preg->newline_anchor = 0;
4794
4795 preg->no_sub = !!(cflags & REG_NOSUB);
4796
4797 /* POSIX says a null character in the pattern terminates it, so we
4798 can use strlen here in compiling the pattern. */
4799 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
4800
4801 /* POSIX doesn't distinguish between an unmatched open-group and an
4802 unmatched close-group: both are REG_EPAREN. */
4803 if (ret == REG_ERPAREN) ret = REG_EPAREN;
4804
4805 return (int) ret;
4806 }
4807
4808
4809 /* regexec searches for a given pattern, specified by PREG, in the
4810 string STRING.
4811
4812 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
4813 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
4814 least NMATCH elements, and we set them to the offsets of the
4815 corresponding matched substrings.
4816
4817 EFLAGS specifies `execution flags' which affect matching: if
4818 REG_NOTBOL is set, then ^ does not match at the beginning of the
4819 string; if REG_NOTEOL is set, then $ does not match at the end.
4820
4821 We return 0 if we find a match and REG_NOMATCH if not. */
4822
4823 int
4824 regexec (preg, string, nmatch, pmatch, eflags)
4825 const regex_t *preg;
4826 const char *string;
4827 size_t nmatch;
4828 regmatch_t pmatch[];
4829 int eflags;
4830 {
4831 int ret;
4832 struct re_registers regs;
4833 regex_t private_preg;
4834 int len = strlen (string);
4835 boolean want_reg_info = !preg->no_sub && nmatch > 0;
4836
4837 private_preg = *preg;
4838
4839 private_preg.not_bol = !!(eflags & REG_NOTBOL);
4840 private_preg.not_eol = !!(eflags & REG_NOTEOL);
4841
4842 /* The user has told us exactly how many registers to return
4843 information about, via `nmatch'. We have to pass that on to the
4844 matching routines. */
4845 private_preg.regs_allocated = REGS_FIXED;
4846
4847 if (want_reg_info)
4848 {
4849 regs.num_regs = nmatch;
4850 regs.start = TALLOC (nmatch, regoff_t);
4851 regs.end = TALLOC (nmatch, regoff_t);
4852 if (regs.start == NULL || regs.end == NULL)
4853 return (int) REG_NOMATCH;
4854 }
4855
4856 /* Perform the searching operation. */
4857 ret = re_search (&private_preg, string, len,
4858 /* start: */ 0, /* range: */ len,
4859 want_reg_info ? ®s : (struct re_registers *) 0);
4860
4861 /* Copy the register information to the POSIX structure. */
4862 if (want_reg_info)
4863 {
4864 if (ret >= 0)
4865 {
4866 unsigned r;
4867
4868 for (r = 0; r < nmatch; r++)
4869 {
4870 pmatch[r].rm_so = regs.start[r];
4871 pmatch[r].rm_eo = regs.end[r];
4872 }
4873 }
4874
4875 /* If we needed the temporary register info, free the space now. */
4876 free (regs.start);
4877 free (regs.end);
4878 }
4879
4880 /* We want zero return to mean success, unlike `re_search'. */
4881 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
4882 }
4883
4884
4885 /* Returns a message corresponding to an error code, ERRCODE, returned
4886 from either regcomp or regexec. We don't use PREG here. */
4887
4888 size_t
4889 regerror (errcode, preg, errbuf, errbuf_size)
4890 int errcode;
4891 const regex_t *preg;
4892 char *errbuf;
4893 size_t errbuf_size;
4894 {
4895 const char *msg;
4896 size_t msg_size;
4897
4898 if (errcode < 0
4899 || errcode >= (sizeof (re_error_msg) / sizeof (re_error_msg[0])))
4900 /* Only error codes returned by the rest of the code should be passed
4901 to this routine. If we are given anything else, or if other regex
4902 code generates an invalid error code, then the program has a bug.
4903 Dump core so we can fix it. */
4904 abort ();
4905
4906 msg = re_error_msg[errcode];
4907
4908 /* POSIX doesn't require that we do anything in this case, but why
4909 not be nice. */
4910 if (! msg)
4911 msg = "Success";
4912
4913 msg_size = strlen (msg) + 1; /* Includes the null. */
4914
4915 if (errbuf_size != 0)
4916 {
4917 if (msg_size > errbuf_size)
4918 {
4919 strncpy (errbuf, msg, errbuf_size - 1);
4920 errbuf[errbuf_size - 1] = 0;
4921 }
4922 else
4923 strcpy (errbuf, msg);
4924 }
4925
4926 return msg_size;
4927 }
4928
4929
4930 /* Free dynamically allocated space used by PREG. */
4931
4932 void
4933 regfree (preg)
4934 regex_t *preg;
4935 {
4936 if (preg->buffer != NULL)
4937 free (preg->buffer);
4938 preg->buffer = NULL;
4939
4940 preg->allocated = 0;
4941 preg->used = 0;
4942
4943 if (preg->fastmap != NULL)
4944 free (preg->fastmap);
4945 preg->fastmap = NULL;
4946 preg->fastmap_accurate = 0;
4947
4948 if (preg->translate != NULL)
4949 free (preg->translate);
4950 preg->translate = NULL;
4951 }
4952
4953 #endif /* not emacs */
4954
4955 /*
4956 Local variables:
4957 make-backup-files: t
4958 version-control: t
4959 trim-versions-without-asking: nil
4960 End:
4961 */