"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "support/dfa.c" between
gawk-5.0.1.tar.xz and gawk-5.1.0.tar.xz

About: GNU awk - pattern scanning and processing language.

dfa.c  (gawk-5.0.1.tar.xz):dfa.c  (gawk-5.1.0.tar.xz)
/* dfa.c - deterministic extended regexp routines for GNU /* dfa.c - deterministic extended regexp routines for GNU
Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2019 Free Software Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2020 Free Software
Foundation, Inc. Foundation, Inc.
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option) the Free Software Foundation; either version 3, or (at your option)
any later version. any later version.
This program is distributed in the hope that it will be useful, This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
skipping to change at line 27 skipping to change at line 27
Foundation, Inc., Foundation, Inc.,
51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */ 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
/* Written June, 1988 by Mike Haertel /* Written June, 1988 by Mike Haertel
Modified July, 1988 by Arthur David Olson to assist BMG speedups */ Modified July, 1988 by Arthur David Olson to assist BMG speedups */
#include <config.h> #include <config.h>
#include <assert.h> #include <assert.h>
#include <ctype.h> #include <ctype.h>
#ifndef VMS
#include <stdint.h> #include <stdint.h>
#else
#define SIZE_MAX __INT32_MAX
#define PTRDIFF_MAX __INT32_MAX
#endif
#include <stdio.h> #include <stdio.h>
#ifndef VMS
#include <sys/types.h>
#else
#include <stddef.h>
#endif
#include <stdlib.h> #include <stdlib.h>
#include <limits.h> #include <limits.h>
#include <string.h> #include <string.h>
#if HAVE_SETLOCALE
#include <locale.h>
#endif
#include "dfa.h" // gets stdbool.h for us #include "dfa.h"
#include "flexmember.h"
/* Another name for ptrdiff_t, for sizes of objects and nonnegative
indexes into objects. It is signed to help catch integer overflow.
It has its own name because it is for nonnegative values only. */
typedef ptrdiff_t idx_t;
static idx_t const IDX_MAX = PTRDIFF_MAX;
static bool static bool
streq (char const *a, char const *b) streq (char const *a, char const *b)
{ {
return strcmp (a, b) == 0; return strcmp (a, b) == 0;
} }
static bool static bool
isasciidigit (char c) isasciidigit (char c)
{ {
return '0' <= c && c <= '9'; return '0' <= c && c <= '9';
} }
/* Gawk doesn't use Gnulib, so don't assume that setlocale is present. */
#ifndef LC_ALL
# define setlocale(category, locale) NULL
#endif
#include "gettext.h" #include "gettext.h"
#define _(str) gettext (str) #define _(str) gettext (str)
#include <wchar.h> #include <wchar.h>
#include "intprops.h" #include "intprops.h"
#include "xalloc.h" #include "xalloc.h"
#include "localeinfo.h" #include "localeinfo.h"
#ifndef FALLTHROUGH #ifndef FALLTHROUGH
skipping to change at line 87 skipping to change at line 75
# define FALLTHROUGH ((void) 0) # define FALLTHROUGH ((void) 0)
# else # else
# define FALLTHROUGH __attribute__ ((__fallthrough__)) # define FALLTHROUGH __attribute__ ((__fallthrough__))
# endif # endif
#endif #endif
#ifndef MIN #ifndef MIN
# define MIN(a,b) ((a) < (b) ? (a) : (b)) # define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif #endif
#if defined(__DJGPP__)
#include "mbsupport.h"
#endif
#ifdef GAWK
static int
is_blank (int c)
{
return (c == ' ' || c == '\t');
}
#endif /* GAWK */
/* HPUX defines these as macros in sys/param.h. */ /* HPUX defines these as macros in sys/param.h. */
#ifdef setbit #ifdef setbit
# undef setbit # undef setbit
#endif #endif
#ifdef clrbit #ifdef clrbit
# undef clrbit # undef clrbit
#endif #endif
/* For code that does not use Gnulib’s isblank module. */
#if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
# define isblank dfa_isblank
static int
isblank (int c)
{
return c == ' ' || c == '\t';
}
#endif
/* First integer value that is greater than any character code. */ /* First integer value that is greater than any character code. */
enum { NOTCHAR = 1 << CHAR_BIT }; enum { NOTCHAR = 1 << CHAR_BIT };
#ifdef UINT_LEAST64_MAX
/* Number of bits used in a charclass word. */
enum { CHARCLASS_WORD_BITS = 64 };
/* This represents part of a character class. It must be unsigned and /* This represents part of a character class. It must be unsigned and
at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */ at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
typedef unsigned long int charclass_word; typedef uint_least64_t charclass_word;
/* Part of a charclass initializer that represents 64 bits' worth of a
charclass, where LO and HI are the low and high-order 32 bits of
the 64-bit quantity. */
# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
/* CHARCLASS_WORD_BITS is the number of bits used in a charclass word. #else
CHARCLASS_PAIR (LO, HI) is part of a charclass initializer, and /* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
represents 64 bits' worth of a charclass, where LO and HI are the
low and high-order 32 bits of the 64-bit quantity. */
#if ULONG_MAX >> 31 >> 31 < 3
enum { CHARCLASS_WORD_BITS = 32 }; enum { CHARCLASS_WORD_BITS = 32 };
typedef unsigned long charclass_word;
# define CHARCLASS_PAIR(lo, hi) lo, hi # define CHARCLASS_PAIR(lo, hi) lo, hi
#else
enum { CHARCLASS_WORD_BITS = 64 };
# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
#endif #endif
/* An initializer for a charclass whose 32-bit words are A through H. */ /* An initializer for a charclass whose 32-bit words are A through H. */
#define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \ #define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
{{ \ {{ \
CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \ CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \ CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
}} }}
/* The maximum useful value of a charclass_word; all used bits are 1. */ /* The maximum useful value of a charclass_word; all used bits are 1. */
skipping to change at line 247 skipping to change at line 238
ENDWORD_CONSTRAINT = 0202, ENDWORD_CONSTRAINT = 0202,
LIMWORD_CONSTRAINT = 0252, LIMWORD_CONSTRAINT = 0252,
NOTLIMWORD_CONSTRAINT = 0525 NOTLIMWORD_CONSTRAINT = 0525
}; };
/* The regexp is parsed into an array of tokens in postfix form. Some tokens /* The regexp is parsed into an array of tokens in postfix form. Some tokens
are operators and others are terminal symbols. Most (but not all) of these are operators and others are terminal symbols. Most (but not all) of these
codes are returned by the lexical analyzer. */ codes are returned by the lexical analyzer. */
typedef ptrdiff_t token; typedef ptrdiff_t token;
static ptrdiff_t const TOKEN_MAX = PTRDIFF_MAX; static token const TOKEN_MAX = PTRDIFF_MAX;
/* States are indexed by state_num values. These are normally /* States are indexed by state_num values. These are normally
nonnegative but -1 is used as a special value. */ nonnegative but -1 is used as a special value. */
typedef ptrdiff_t state_num; typedef ptrdiff_t state_num;
/* Predefined token values. */ /* Predefined token values. */
enum enum
{ {
END = -1, /* END is a terminal symbol that matches the END = -1, /* END is a terminal symbol that matches the
end of input; any value of END or less in end of input; any value of END or less in
skipping to change at line 357 skipping to change at line 348
terminal symbol that matches any of a terminal symbol that matches any of a
class of characters. */ class of characters. */
}; };
/* States of the recognizer correspond to sets of positions in the parse /* States of the recognizer correspond to sets of positions in the parse
tree, together with the constraints under which they may be matched. tree, together with the constraints under which they may be matched.
So a position is encoded as an index into the parse tree together with So a position is encoded as an index into the parse tree together with
a constraint. */ a constraint. */
typedef struct typedef struct
{ {
size_t index; /* Index into the parse array. */ idx_t index; /* Index into the parse array. */
unsigned int constraint; /* Constraint for matching this position. */ unsigned int constraint; /* Constraint for matching this position. */
} position; } position;
/* Sets of positions are stored as arrays. */ /* Sets of positions are stored as arrays. */
typedef struct typedef struct
{ {
position *elems; /* Elements of this position set. */ position *elems; /* Elements of this position set. */
ptrdiff_t nelem; /* Number of elements in this set. */ idx_t nelem; /* Number of elements in this set. */
ptrdiff_t alloc; /* Number of elements allocated in ELEMS. */ idx_t alloc; /* Number of elements allocated in ELEMS. */
} position_set; } position_set;
/* A state of the dfa consists of a set of positions, some flags, /* A state of the dfa consists of a set of positions, some flags,
and the token value of the lowest-numbered position of the state that and the token value of the lowest-numbered position of the state that
contains an END token. */ contains an END token. */
typedef struct typedef struct
{ {
size_t hash; /* Hash of the positions of this state. */ size_t hash; /* Hash of the positions of this state. */
position_set elems; /* Positions this state could match. */ position_set elems; /* Positions this state could match. */
unsigned char context; /* Context from previous state. */ unsigned char context; /* Context from previous state. */
skipping to change at line 398 skipping to change at line 389
for the initial state setup. */ for the initial state setup. */
enum { MAX_TRCOUNT = 1024 }; enum { MAX_TRCOUNT = 1024 };
/* A bracket operator. /* A bracket operator.
e.g., [a-c], [[:alpha:]], etc. */ e.g., [a-c], [[:alpha:]], etc. */
struct mb_char_classes struct mb_char_classes
{ {
ptrdiff_t cset; ptrdiff_t cset;
bool invert; bool invert;
wchar_t *chars; /* Normal characters. */ wchar_t *chars; /* Normal characters. */
ptrdiff_t nchars; idx_t nchars;
ptrdiff_t nchars_alloc; idx_t nchars_alloc;
}; };
struct regex_syntax struct regex_syntax
{ {
/* Syntax bits controlling the behavior of the lexical analyzer. */ /* Syntax bits controlling the behavior of the lexical analyzer. */
reg_syntax_t syntax_bits; reg_syntax_t syntax_bits;
bool syntax_bits_set; bool syntax_bits_set;
/* Flag for case-folding letters into sets. */ /* Flag for case-folding letters into sets. */
bool case_fold; bool case_fold;
skipping to change at line 439 skipping to change at line 430
charclass newline; charclass newline;
}; };
/* Lexical analyzer. All the dross that deals with the obnoxious /* Lexical analyzer. All the dross that deals with the obnoxious
GNU Regex syntax bits is located here. The poor, suffering GNU Regex syntax bits is located here. The poor, suffering
reader is referred to the GNU Regex documentation for the reader is referred to the GNU Regex documentation for the
meaning of the @#%!@#%^!@ syntax bits. */ meaning of the @#%!@#%^!@ syntax bits. */
struct lexer_state struct lexer_state
{ {
char const *ptr; /* Pointer to next input character. */ char const *ptr; /* Pointer to next input character. */
size_t left; /* Number of characters remaining. */ idx_t left; /* Number of characters remaining. */
token lasttok; /* Previous token returned; initially END. */ token lasttok; /* Previous token returned; initially END. */
size_t parens; /* Count of outstanding left parens. */ idx_t parens; /* Count of outstanding left parens. */
int minrep, maxrep; /* Repeat counts for {m,n}. */ int minrep, maxrep; /* Repeat counts for {m,n}. */
/* Wide character representation of the current multibyte character, /* Wide character representation of the current multibyte character,
or WEOF if there was an encoding error. Used only if or WEOF if there was an encoding error. Used only if
MB_CUR_MAX > 1. */ MB_CUR_MAX > 1. */
wint_t wctok; wint_t wctok;
/* Length of the multibyte representation of wctok. */
int cur_mb_len;
/* The most recently analyzed multibyte bracket expression. */ /* The most recently analyzed multibyte bracket expression. */
struct mb_char_classes brack; struct mb_char_classes brack;
/* We're separated from beginning or (, | only by zero-width characters. */ /* We're separated from beginning or (, | only by zero-width characters. */
bool laststart; bool laststart;
}; };
/* Recursive descent parser for regular expressions. */ /* Recursive descent parser for regular expressions. */
struct parser_state struct parser_state
{ {
token tok; /* Lookahead token. */ token tok; /* Lookahead token. */
size_t depth; /* Current depth of a hypothetical stack idx_t depth; /* Current depth of a hypothetical stack
holding deferred productions. This is holding deferred productions. This is
used to determine the depth that will be used to determine the depth that will be
required of the real stack later on in required of the real stack later on in
dfaanalyze. */ dfaanalyze. */
}; };
/* A compiled regular expression. */ /* A compiled regular expression. */
struct dfa struct dfa
{ {
/* Syntax configuration */
struct regex_syntax syntax;
/* Fields filled by the scanner. */ /* Fields filled by the scanner. */
charclass *charclasses; /* Array of character sets for CSET tokens. */ charclass *charclasses; /* Array of character sets for CSET tokens. */
ptrdiff_t cindex; /* Index for adding new charclasses. */ idx_t cindex; /* Index for adding new charclasses. */
ptrdiff_t calloc; /* Number of charclasses allocated. */ idx_t calloc; /* Number of charclasses allocated. */
size_t canychar; /* Index of anychar class, or (size_t) -1. */ ptrdiff_t canychar; /* Index of anychar class, or -1. */
/* Scanner state */ /* Scanner state */
struct lexer_state lex; struct lexer_state lex;
/* Parser state */ /* Parser state */
struct parser_state parse; struct parser_state parse;
/* Fields filled by the parser. */ /* Fields filled by the parser. */
token *tokens; /* Postfix parse array. */ token *tokens; /* Postfix parse array. */
size_t tindex; /* Index for adding new tokens. */ idx_t tindex; /* Index for adding new tokens. */
size_t talloc; /* Number of tokens currently allocated. */ idx_t talloc; /* Number of tokens currently allocated.
size_t depth; /* Depth required of an evaluation stack */
idx_t depth; /* Depth required of an evaluation stack
used for depth-first traversal of the used for depth-first traversal of the
parse tree. */ parse tree. */
size_t nleaves; /* Number of leaves on the parse tree. */ idx_t nleaves; /* Number of leaves on the parse tree. */
size_t nregexps; /* Count of parallel regexps being built idx_t nregexps; /* Count of parallel regexps being built
with dfaparse. */ with dfaparse. */
bool fast; /* The DFA is fast. */ bool fast; /* The DFA is fast. */
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */ token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
mbstate_t mbs; /* Multibyte conversion state. */ mbstate_t mbs; /* Multibyte conversion state. */
/* The following are valid only if MB_CUR_MAX > 1. */ /* The following are valid only if MB_CUR_MAX > 1. */
/* The value of multibyte_prop[i] is defined by following rule. /* The value of multibyte_prop[i] is defined by following rule.
if tokens[i] < NOTCHAR if tokens[i] < NOTCHAR
bit 0 : tokens[i] is the first byte of a character, including bit 0 : tokens[i] is the first byte of a character, including
single-byte characters. single-byte characters.
bit 1 : tokens[i] is the last byte of a character, including bit 1 : tokens[i] is the last byte of a character, including
single-byte characters. single-byte characters.
skipping to change at line 527 skipping to change at line 512
= 3 , 1 , 0 , 2 , 3 = 3 , 1 , 0 , 2 , 3
*/ */
char *multibyte_prop; char *multibyte_prop;
/* Fields filled by the superset. */ /* Fields filled by the superset. */
struct dfa *superset; /* Hint of the dfa. */ struct dfa *superset; /* Hint of the dfa. */
/* Fields filled by the state builder. */ /* Fields filled by the state builder. */
dfa_state *states; /* States of the dfa. */ dfa_state *states; /* States of the dfa. */
state_num sindex; /* Index for adding new states. */ state_num sindex; /* Index for adding new states. */
ptrdiff_t salloc; /* Number of states currently allocated. */ idx_t salloc; /* Number of states currently allocated. */
/* Fields filled by the parse tree->NFA conversion. */ /* Fields filled by the parse tree->NFA conversion. */
position_set *follows; /* Array of follow sets, indexed by position position_set *follows; /* Array of follow sets, indexed by position
index. The follow of a position is the set index. The follow of a position is the set
of positions containing characters that of positions containing characters that
could conceivably follow a character could conceivably follow a character
matching the given position in a string matching the given position in a string
matching the regexp. Allocated to the matching the regexp. Allocated to the
maximum possible position index. */ maximum possible position index. */
bool searchflag; /* We are supposed to build a searching bool searchflag; /* We are supposed to build a searching
skipping to change at line 592 skipping to change at line 577
state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
context in multibyte locales, in which we context in multibyte locales, in which we
do not distinguish between their contexts, do not distinguish between their contexts,
as not supported word. */ as not supported word. */
position_set mb_follows; /* Follow set added by ANYCHAR on demand. */ position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
state_num **mb_trans; /* Transition tables for states with state_num **mb_trans; /* Transition tables for states with
ANYCHAR. */ ANYCHAR. */
state_num mb_trcount; /* Number of transition tables for states with state_num mb_trcount; /* Number of transition tables for states with
ANYCHAR that have actually been built. */ ANYCHAR that have actually been built. */
/* Syntax configuration. This is near the end so that dfacopysyntax
can memset up to here. */
struct regex_syntax syntax;
/* Information derived from the locale. This is at the end so that /* Information derived from the locale. This is at the end so that
a quick memset need not clear it specially. */ a quick memset need not clear it specially. */
/* dfaexec implementation. */ /* dfaexec implementation. */
char *(*dfaexec) (struct dfa *, char const *, char *, char *(*dfaexec) (struct dfa *, char const *, char *,
bool, size_t *, bool *); bool, ptrdiff_t *, bool *);
/* The locale is simple, like the C locale. These locales can be
processed more efficiently, as they are single-byte, their native
character set is in collating-sequence order, and they do not
have multi-character collating elements. */
bool simple_locale;
/* Other cached information derived from the locale. */ /* Other cached information derived from the locale. */
struct localeinfo localeinfo; struct localeinfo localeinfo;
}; };
/* User access to dfa internals. */ /* User access to dfa internals. */
/* S could possibly be an accepting state of R. */ /* S could possibly be an accepting state of R. */
static bool static bool
accepting (state_num s, struct dfa const *r) accepting (state_num s, struct dfa const *r)
skipping to change at line 644 skipping to change at line 627
* PWC points to wint_t, not to wchar_t. * PWC points to wint_t, not to wchar_t.
* The last arg is a dfa *D instead of merely a multibyte conversion * The last arg is a dfa *D instead of merely a multibyte conversion
state D->mbs. state D->mbs.
* N must be at least 1. * N must be at least 1.
* S[N - 1] must be a sentinel byte. * S[N - 1] must be a sentinel byte.
* Shift encodings are not supported. * Shift encodings are not supported.
* The return value is always in the range 1..N. * The return value is always in the range 1..N.
* D->mbs is always valid afterwards. * D->mbs is always valid afterwards.
* *PWC is always set to something. */ * *PWC is always set to something. */
static size_t static int
mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d) mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
{ {
unsigned char uc = s[0]; unsigned char uc = s[0];
wint_t wc = d->localeinfo.sbctowc[uc]; wint_t wc = d->localeinfo.sbctowc[uc];
if (wc == WEOF) if (wc == WEOF)
{ {
wchar_t wch; wchar_t wch;
size_t nbytes = mbrtowc (&wch, s, n, &d->mbs); size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
if (0 < nbytes && nbytes < (size_t) -2) if (0 < nbytes && nbytes < (size_t) -2)
skipping to change at line 823 skipping to change at line 806
nonnegative. If NITEMS_MAX is -1, it is treated as if it were nonnegative. If NITEMS_MAX is -1, it is treated as if it were
infinity. infinity.
If PA is null, then allocate a new array instead of reallocating If PA is null, then allocate a new array instead of reallocating
the old one. the old one.
Thus, to grow an array A without saving its old contents, do Thus, to grow an array A without saving its old contents, do
{ free (A); A = xpalloc (NULL, &AITEMS, ...); }. */ { free (A); A = xpalloc (NULL, &AITEMS, ...); }. */
static void * static void *
xpalloc (void *pa, ptrdiff_t *nitems, ptrdiff_t nitems_incr_min, xpalloc (void *pa, idx_t *nitems, idx_t nitems_incr_min,
ptrdiff_t nitems_max, ptrdiff_t item_size) ptrdiff_t nitems_max, idx_t item_size)
{ {
ptrdiff_t n0 = *nitems; idx_t n0 = *nitems;
/* The approximate size to use for initial small allocation /* The approximate size to use for initial small allocation
requests. This is the largest "small" request for the GNU C requests. This is the largest "small" request for the GNU C
library malloc. */ library malloc. */
enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 }; enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 };
/* If the array is tiny, grow it to about (but no greater than) /* If the array is tiny, grow it to about (but no greater than)
DEFAULT_MXFAST bytes. Otherwise, grow it by about 50%. DEFAULT_MXFAST bytes. Otherwise, grow it by about 50%.
Adjust the growth according to three constraints: NITEMS_INCR_MIN, Adjust the growth according to three constraints: NITEMS_INCR_MIN,
NITEMS_MAX, and what the C language can represent safely. */ NITEMS_MAX, and what the C language can represent safely. */
ptrdiff_t n, nbytes; idx_t n, nbytes;
if (INT_ADD_WRAPV (n0, n0 >> 1, &n)) if (INT_ADD_WRAPV (n0, n0 >> 1, &n))
n = PTRDIFF_MAX; n = IDX_MAX;
if (0 <= nitems_max && nitems_max < n) if (0 <= nitems_max && nitems_max < n)
n = nitems_max; n = nitems_max;
ptrdiff_t adjusted_nbytes idx_t adjusted_nbytes
= ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes) = ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes)
? MIN (PTRDIFF_MAX, SIZE_MAX) ? MIN (IDX_MAX, SIZE_MAX)
: nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0); : nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0);
if (adjusted_nbytes) if (adjusted_nbytes)
{ {
n = adjusted_nbytes / item_size; n = adjusted_nbytes / item_size;
nbytes = adjusted_nbytes - adjusted_nbytes % item_size; nbytes = adjusted_nbytes - adjusted_nbytes % item_size;
} }
if (! pa) if (! pa)
*nitems = 0; *nitems = 0;
if (n - n0 < nitems_incr_min if (n - n0 < nitems_incr_min
skipping to change at line 876 skipping to change at line 859
/* Ensure that the array addressed by PA holds at least I + 1 items. /* Ensure that the array addressed by PA holds at least I + 1 items.
Either return PA, or reallocate the array and return its new address. Either return PA, or reallocate the array and return its new address.
Although PA may be null, the returned value is never null. Although PA may be null, the returned value is never null.
The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
is updated on reallocation. If PA is null, *NITEMS must be zero. is updated on reallocation. If PA is null, *NITEMS must be zero.
Do not allocate more than NITEMS_MAX items total; -1 means no limit. Do not allocate more than NITEMS_MAX items total; -1 means no limit.
ITEM_SIZE is the size of one item; it must be positive. ITEM_SIZE is the size of one item; it must be positive.
Avoid O(N**2) behavior on arrays growing linearly. */ Avoid O(N**2) behavior on arrays growing linearly. */
static void * static void *
maybe_realloc (void *pa, ptrdiff_t i, ptrdiff_t *nitems, maybe_realloc (void *pa, idx_t i, idx_t *nitems,
ptrdiff_t nitems_max, ptrdiff_t item_size) ptrdiff_t nitems_max, idx_t item_size)
{ {
if (i < *nitems) if (i < *nitems)
return pa; return pa;
return xpalloc (pa, nitems, 1, nitems_max, item_size); return xpalloc (pa, nitems, 1, nitems_max, item_size);
} }
/* In DFA D, find the index of charclass S, or allocate a new one. */ /* In DFA D, find the index of charclass S, or allocate a new one. */
static ptrdiff_t static idx_t
charclass_index (struct dfa *d, charclass *s) charclass_index (struct dfa *d, charclass const *s)
{ {
ptrdiff_t i; idx_t i;
for (i = 0; i < d->cindex; ++i) for (i = 0; i < d->cindex; ++i)
if (equal (s, &d->charclasses[i])) if (equal (s, &d->charclasses[i]))
return i; return i;
d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc, d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
TOKEN_MAX - CSET, sizeof *d->charclasses); TOKEN_MAX - CSET, sizeof *d->charclasses);
++d->cindex; ++d->cindex;
d->charclasses[i] = *s; d->charclasses[i] = *s;
return i; return i;
} }
skipping to change at line 916 skipping to change at line 899
static int static int
char_context (struct dfa const *dfa, unsigned char c) char_context (struct dfa const *dfa, unsigned char c)
{ {
if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor) if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
return CTX_NEWLINE; return CTX_NEWLINE;
if (unibyte_word_constituent (dfa, c)) if (unibyte_word_constituent (dfa, c))
return CTX_LETTER; return CTX_LETTER;
return CTX_NONE; return CTX_NONE;
} }
/* Copy the syntax settings from one dfa instance to another.
Saves considerable computation time if compiling many regular expressions
based on the same setting. */
void
dfacopysyntax (struct dfa *to, const struct dfa *from)
{
to->dfaexec = from->dfaexec;
to->simple_locale = from->simple_locale;
to->localeinfo = from->localeinfo;
to->fast = from->fast;
to->canychar = from->canychar;
to->lex.cur_mb_len = from->lex.cur_mb_len;
to->syntax = from->syntax;
}
/* Set a bit in the charclass for the given wchar_t. Do nothing if WC /* Set a bit in the charclass for the given wchar_t. Do nothing if WC
is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1, is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
this may happen when folding case in weird Turkish locales where this may happen when folding case in weird Turkish locales where
dotless i/dotted I are not included in the chosen character set. dotless i/dotted I are not included in the chosen character set.
Return whether a bit was set in the charclass. */ Return whether a bit was set in the charclass. */
static bool static bool
setbit_wc (wint_t wc, charclass *c) setbit_wc (wint_t wc, charclass *c)
{ {
int b = wctob (wc); int b = wctob (wc);
if (b < 0) if (b < 0)
skipping to change at line 960 skipping to change at line 926
MB_CUR_MAX must be 1. */ MB_CUR_MAX must be 1. */
static void static void
setbit_case_fold_c (int b, charclass *c) setbit_case_fold_c (int b, charclass *c)
{ {
int ub = toupper (b); int ub = toupper (b);
for (int i = 0; i < NOTCHAR; i++) for (int i = 0; i < NOTCHAR; i++)
if (toupper (i) == ub) if (toupper (i) == ub)
setbit (i, c); setbit (i, c);
} }
/* Return true if the locale compatible with the C locale. */
static bool
using_simple_locale (bool multibyte)
{
/* The native character set is known to be compatible with
the C locale. The following test isn't perfect, but it's good
enough in practice, as only ASCII and EBCDIC are in common use
and this test correctly accepts ASCII and rejects EBCDIC. */
enum { native_c_charset =
('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
&& '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
&& '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
&& '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
&& '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
&& '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
&& 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
&& '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
&& '}' == 125 && '~' == 126)
};
if (!native_c_charset || multibyte)
return false;
else
{
/* Treat C and POSIX locales as being compatible. Also, treat
errors as compatible, as these are invariably from stubs. */
char const *loc = setlocale (LC_ALL, NULL);
return !loc || streq (loc, "C") || streq (loc, "POSIX");
}
}
/* Fetch the next lexical input character from the pattern. There /* Fetch the next lexical input character from the pattern. There
must at least one byte of pattern input. Set DFA->lex.wctok to the must at least one byte of pattern input. Set DFA->lex.wctok to the
value of the character or to WEOF depending on whether the input is value of the character or to WEOF depending on whether the input is
a valid multibyte character (possibly of length 1). Then return a valid multibyte character (possibly of length 1). Then return
the next input byte value, except return EOF if the input is a the next input byte value, except return EOF if the input is a
multibyte character of length greater than 1. */ multibyte character of length greater than 1. */
static int static int
fetch_wc (struct dfa *dfa) fetch_wc (struct dfa *dfa)
{ {
size_t nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left, int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
dfa); dfa);
dfa->lex.cur_mb_len = nbytes;
int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF; int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
dfa->lex.ptr += nbytes; dfa->lex.ptr += nbytes;
dfa->lex.left -= nbytes; dfa->lex.left -= nbytes;
return c; return c;
} }
/* If there is no more input, report an error about unbalanced brackets. /* If there is no more input, report an error about unbalanced brackets.
Otherwise, behave as with fetch_wc (DFA). */ Otherwise, behave as with fetch_wc (DFA). */
static int static int
bracket_fetch_wc (struct dfa *dfa) bracket_fetch_wc (struct dfa *dfa)
skipping to change at line 1045 skipping to change at line 978
{"upper", isupper, false}, {"upper", isupper, false},
{"lower", islower, false}, {"lower", islower, false},
{"digit", isdigit, true}, {"digit", isdigit, true},
{"xdigit", isxdigit, false}, {"xdigit", isxdigit, false},
{"space", isspace, false}, {"space", isspace, false},
{"punct", ispunct, false}, {"punct", ispunct, false},
{"alnum", isalnum, false}, {"alnum", isalnum, false},
{"print", isprint, false}, {"print", isprint, false},
{"graph", isgraph, false}, {"graph", isgraph, false},
{"cntrl", iscntrl, false}, {"cntrl", iscntrl, false},
{"blank", is_blank, false}, {"blank", isblank, false},
{NULL, NULL, false} {NULL, NULL, false}
}; };
static const struct dfa_ctype *_GL_ATTRIBUTE_PURE static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
find_pred (const char *str) find_pred (const char *str)
{ {
for (unsigned int i = 0; prednames[i].name; ++i) for (int i = 0; prednames[i].name; i++)
if (streq (str, prednames[i].name)) if (streq (str, prednames[i].name))
return &prednames[i]; return &prednames[i];
return NULL; return NULL;
} }
/* Parse a bracket expression, which possibly includes multibyte /* Parse a bracket expression, which possibly includes multibyte
characters. */ characters. */
static token static token
parse_bracket_exp (struct dfa *dfa) parse_bracket_exp (struct dfa *dfa)
{ {
skipping to change at line 1082 skipping to change at line 1015
int colon_warning_state; int colon_warning_state;
dfa->lex.brack.nchars = 0; dfa->lex.brack.nchars = 0;
charclass ccl; charclass ccl;
zeroset (&ccl); zeroset (&ccl);
int c = bracket_fetch_wc (dfa); int c = bracket_fetch_wc (dfa);
bool invert = c == '^'; bool invert = c == '^';
if (invert) if (invert)
{ {
c = bracket_fetch_wc (dfa); c = bracket_fetch_wc (dfa);
known_bracket_exp = dfa->simple_locale; known_bracket_exp = dfa->localeinfo.simple;
} }
wint_t wc = dfa->lex.wctok; wint_t wc = dfa->lex.wctok;
int c1; int c1;
wint_t wc1; wint_t wc1;
colon_warning_state = (c == ':'); colon_warning_state = (c == ':');
do do
{ {
c1 = NOTCHAR; /* Mark c1 as not initialized. */ c1 = NOTCHAR; /* Mark c1 as not initialized. */
colon_warning_state &= ~2; colon_warning_state &= ~2;
skipping to change at line 1107 skipping to change at line 1040
if (c == '[') if (c == '[')
{ {
c1 = bracket_fetch_wc (dfa); c1 = bracket_fetch_wc (dfa);
wc1 = dfa->lex.wctok; wc1 = dfa->lex.wctok;
if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES)) if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
|| c1 == '.' || c1 == '=') || c1 == '.' || c1 == '=')
{ {
enum { MAX_BRACKET_STRING_LEN = 32 }; enum { MAX_BRACKET_STRING_LEN = 32 };
char str[MAX_BRACKET_STRING_LEN + 1]; char str[MAX_BRACKET_STRING_LEN + 1];
size_t len = 0; int len = 0;
for (;;) for (;;)
{ {
c = bracket_fetch_wc (dfa); c = bracket_fetch_wc (dfa);
if (dfa->lex.left == 0 if (dfa->lex.left == 0
|| (c == c1 && dfa->lex.ptr[0] == ']')) || (c == c1 && dfa->lex.ptr[0] == ']'))
break; break;
if (len < MAX_BRACKET_STRING_LEN) if (len < MAX_BRACKET_STRING_LEN)
str[len++] = c; str[len++] = c;
else else
/* This is in any case an invalid class name. */ /* This is in any case an invalid class name. */
skipping to change at line 1193 skipping to change at line 1126
if (c2 == '[' && dfa->lex.ptr[0] == '.') if (c2 == '[' && dfa->lex.ptr[0] == '.')
{ {
known_bracket_exp = false; known_bracket_exp = false;
c2 = ']'; c2 = ']';
} }
if (c2 == ']') if (c2 == ']')
{ {
/* In the case [x-], the - is an ordinary hyphen, /* In the case [x-], the - is an ordinary hyphen,
which is left in c1, the lookahead character. */ which is left in c1, the lookahead character. */
dfa->lex.ptr -= dfa->lex.cur_mb_len; dfa->lex.ptr--;
dfa->lex.left += dfa->lex.cur_mb_len; dfa->lex.left++;
} }
else else
{ {
if (c2 == '\\' && (dfa->syntax.syntax_bits if (c2 == '\\' && (dfa->syntax.syntax_bits
& RE_BACKSLASH_ESCAPE_IN_LISTS)) & RE_BACKSLASH_ESCAPE_IN_LISTS))
{ {
c2 = bracket_fetch_wc (dfa); c2 = bracket_fetch_wc (dfa);
wc2 = dfa->lex.wctok; wc2 = dfa->lex.wctok;
} }
colon_warning_state |= 8; colon_warning_state |= 8;
c1 = bracket_fetch_wc (dfa); c1 = bracket_fetch_wc (dfa);
wc1 = dfa->lex.wctok; wc1 = dfa->lex.wctok;
/* Treat [x-y] as a range if x != y. */ /* Treat [x-y] as a range if x != y. */
if (wc != wc2 || wc == WEOF) if (wc != wc2 || wc == WEOF)
{ {
if (dfa->simple_locale if (dfa->localeinfo.simple
|| (isasciidigit (c) & isasciidigit (c2))) || (isasciidigit (c) & isasciidigit (c2)))
{ {
for (int ci = c; ci <= c2; ci++) for (int ci = c; ci <= c2; ci++)
if (dfa->syntax.case_fold && isalpha (ci)) if (dfa->syntax.case_fold && isalpha (ci))
setbit_case_fold_c (ci, &ccl); setbit_case_fold_c (ci, &ccl);
else else
setbit (ci, &ccl); setbit (ci, &ccl);
} }
else else
known_bracket_exp = false; known_bracket_exp = false;
skipping to change at line 1245 skipping to change at line 1178
else else
setbit (c, &ccl); setbit (c, &ccl);
continue; continue;
} }
if (wc == WEOF) if (wc == WEOF)
known_bracket_exp = false; known_bracket_exp = false;
else else
{ {
wchar_t folded[CASE_FOLDED_BUFSIZE + 1]; wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
unsigned int n = (dfa->syntax.case_fold int n = (dfa->syntax.case_fold
? case_folded_counterparts (wc, folded + 1) + 1 ? case_folded_counterparts (wc, folded + 1) + 1
: 1); : 1);
folded[0] = wc; folded[0] = wc;
for (unsigned int i = 0; i < n; i++) for (int i = 0; i < n; i++)
if (!setbit_wc (folded[i], &ccl)) if (!setbit_wc (folded[i], &ccl))
{ {
dfa->lex.brack.chars dfa->lex.brack.chars
= maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars, = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
&dfa->lex.brack.nchars_alloc, -1, &dfa->lex.brack.nchars_alloc, -1,
sizeof *dfa->lex.brack.chars); sizeof *dfa->lex.brack.chars);
dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i]; dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
} }
} }
} }
skipping to change at line 1288 skipping to change at line 1221
if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
clrbit ('\n', &ccl); clrbit ('\n', &ccl);
} }
return CSET + charclass_index (dfa, &ccl); return CSET + charclass_index (dfa, &ccl);
} }
struct lexptr struct lexptr
{ {
char const *ptr; char const *ptr;
size_t left; idx_t left;
}; };
static void static void
push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s) push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
{ {
ls->ptr = dfa->lex.ptr; ls->ptr = dfa->lex.ptr;
ls->left = dfa->lex.left; ls->left = dfa->lex.left;
dfa->lex.ptr = s; dfa->lex.ptr = s;
dfa->lex.left = strlen (s); dfa->lex.left = strlen (s);
} }
skipping to change at line 1536 skipping to change at line 1469
if (dfa->lex.parens == 0 if (dfa->lex.parens == 0
&& dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
goto normal_char; goto normal_char;
dfa->lex.parens--; dfa->lex.parens--;
dfa->lex.laststart = false; dfa->lex.laststart = false;
return dfa->lex.lasttok = RPAREN; return dfa->lex.lasttok = RPAREN;
case '.': case '.':
if (backslash) if (backslash)
goto normal_char; goto normal_char;
if (dfa->canychar == (size_t) -1) if (dfa->canychar < 0)
{ {
charclass ccl; charclass ccl;
fillset (&ccl); fillset (&ccl);
if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE)) if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
clrbit ('\n', &ccl); clrbit ('\n', &ccl);
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', &ccl); clrbit ('\0', &ccl);
if (dfa->localeinfo.multibyte) if (dfa->localeinfo.multibyte)
for (int c2 = 0; c2 < NOTCHAR; c2++) for (int c2 = 0; c2 < NOTCHAR; c2++)
if (dfa->localeinfo.sbctowc[c2] == WEOF) if (dfa->localeinfo.sbctowc[c2] == WEOF)
skipping to change at line 1659 skipping to change at line 1592
and some other character. */ and some other character. */
abort (); abort ();
return END; /* keeps pedantic compilers happy. */ return END; /* keeps pedantic compilers happy. */
} }
static void static void
addtok_mb (struct dfa *dfa, token t, char mbprop) addtok_mb (struct dfa *dfa, token t, char mbprop)
{ {
if (dfa->talloc == dfa->tindex) if (dfa->talloc == dfa->tindex)
{ {
dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc, dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
sizeof *dfa->tokens); sizeof *dfa->tokens);
if (dfa->localeinfo.multibyte) if (dfa->localeinfo.multibyte)
dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc, dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
sizeof *dfa->multibyte_prop); sizeof *dfa->multibyte_prop);
} }
if (dfa->localeinfo.multibyte) if (dfa->localeinfo.multibyte)
dfa->multibyte_prop[dfa->tindex] = mbprop; dfa->multibyte_prop[dfa->tindex] = mbprop;
dfa->tokens[dfa->tindex++] = t; dfa->tokens[dfa->tindex++] = t;
switch (t) switch (t)
{ {
skipping to change at line 1708 skipping to change at line 1641
updating the maximum depth if necessary. */ updating the maximum depth if necessary. */
static void static void
addtok (struct dfa *dfa, token t) addtok (struct dfa *dfa, token t)
{ {
if (dfa->localeinfo.multibyte && t == MBCSET) if (dfa->localeinfo.multibyte && t == MBCSET)
{ {
bool need_or = false; bool need_or = false;
/* Extract wide characters into alternations for better performance. /* Extract wide characters into alternations for better performance.
This does not require UTF-8. */ This does not require UTF-8. */
for (ptrdiff_t i = 0; i < dfa->lex.brack.nchars; i++) for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
{ {
addtok_wc (dfa, dfa->lex.brack.chars[i]); addtok_wc (dfa, dfa->lex.brack.chars[i]);
if (need_or) if (need_or)
addtok (dfa, OR); addtok (dfa, OR);
need_or = true; need_or = true;
} }
dfa->lex.brack.nchars = 0; dfa->lex.brack.nchars = 0;
/* Wide characters have been handled above, so it is possible /* Wide characters have been handled above, so it is possible
that the set is empty now. Do nothing in that case. */ that the set is empty now. Do nothing in that case. */
skipping to change at line 1744 skipping to change at line 1677
e.g., we construct the following tree from "<mb1><mb2>". e.g., we construct the following tree from "<mb1><mb2>".
<mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT> <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
<mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */ <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
static void static void
addtok_wc (struct dfa *dfa, wint_t wc) addtok_wc (struct dfa *dfa, wint_t wc)
{ {
unsigned char buf[MB_LEN_MAX]; unsigned char buf[MB_LEN_MAX];
mbstate_t s = { 0 }; mbstate_t s = { 0 };
size_t stored_bytes = wcrtomb ((char *) buf, wc, &s); size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
int buflen;
if (stored_bytes != (size_t) -1) if (stored_bytes != (size_t) -1)
dfa->lex.cur_mb_len = stored_bytes; buflen = stored_bytes;
else else
{ {
/* This is merely stop-gap. buf[0] is undefined, yet skipping /* This is merely stop-gap. buf[0] is undefined, yet skipping
the addtok_mb call altogether can corrupt the heap. */ the addtok_mb call altogether can corrupt the heap. */
dfa->lex.cur_mb_len = 1; buflen = 1;
buf[0] = 0; buf[0] = 0;
} }
addtok_mb (dfa, buf[0], dfa->lex.cur_mb_len == 1 ? 3 : 1); addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
for (int i = 1; i < dfa->lex.cur_mb_len; i++) for (int i = 1; i < buflen; i++)
{ {
addtok_mb (dfa, buf[i], i == dfa->lex.cur_mb_len - 1 ? 2 : 0); addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
addtok (dfa, CAT); addtok (dfa, CAT);
} }
} }
static void static void
add_utf8_anychar (struct dfa *dfa) add_utf8_anychar (struct dfa *dfa)
{ {
static charclass const utf8_classes[5] = { /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
/* 80-bf: non-leading bytes. */ UTF-8 byte sequence has been defined as follows:
CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
([\x00-\x7f]
|[\xc2-\xdf][\x80-\xbf]
|[\xe0][\xa0-\xbf][\x80-\xbf]
|[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
|[\xed][\x80-\x9f][\x80-\xbf]
|[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
|[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
|[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
H = [\x80-\x9f], I = [\xf0],
J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C". */
/* Mnemonics for classes containing two or more bytes. */
enum { A, B, C, E, F, H, J, K, M };
/* Mnemonics for single-byte tokens. */
enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
/* 00-7f: 1-byte sequence. */ static charclass const utf8_classes[] = {
/* A. 00-7f: 1-byte sequence. */
CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0), CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
/* c2-df: 2-byte sequence. */ /* B. c2-df: 1st byte of a 2-byte sequence. */
CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0), CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
/* e0-ef: 3-byte sequence. */ /* C. 80-bf: non-leading bytes. */
CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xffff), CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
/* f0-f7: 4-byte sequence. */ /* D. e0 (just a token). */
CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xff0000)
};
const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
/* Define the five character classes that are needed below. */ /* E. a0-bf: 2nd byte of a "DEC" sequence. */
if (dfa->utf8_anychar_classes[0] == 0) CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
for (unsigned int i = 0; i < n; i++)
{ /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
charclass c = utf8_classes[i]; CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
if (i == 1)
{ /* G. ed (just a token). */
if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
clrbit ('\n', &c); /* H. 80-9f: 2nd byte of a "GHC" sequence. */
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
clrbit ('\0', &c);
} /* I. f0 (just a token). */
dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, &c);
} /* J. 90-bf: 2nd byte of an "IJCC" sequence. */
CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
/* A valid UTF-8 character is /* K. f1-f3: 1st byte of a "KCCC" sequence. */
CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
([0x00-0x7f] /* L. f4 (just a token). */
|[0xc2-0xdf][0x80-0xbf]
|[0xe0-0xef[0x80-0xbf][0x80-0xbf] /* M. 80-8f: 2nd byte of a "LMCC" sequence. */
|[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf]) CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
};
which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x00-0x7f]
and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse /* Define the character classes that are needed below. */
Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */ if (dfa->utf8_anychar_classes[0] == 0)
unsigned int i; {
for (i = 1; i < n; i++) charclass c = utf8_classes[0];
addtok (dfa, dfa->utf8_anychar_classes[i]); if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
while (--i > 1) clrbit ('\n', &c);
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', &c);
dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
dfa->utf8_anychar_classes[i]
= CSET + charclass_index (dfa, &utf8_classes[i]);
}
/* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
The token buffer is in reverse Polish order, so we get
"A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
C CAT OR C CAT OR C CAT OR". */
addtok (dfa, dfa->utf8_anychar_classes[A]);
addtok (dfa, dfa->utf8_anychar_classes[B]);
addtok (dfa, D_token);
addtok (dfa, dfa->utf8_anychar_classes[E]);
addtok (dfa, CAT);
addtok (dfa, OR);
addtok (dfa, G_token);
addtok (dfa, dfa->utf8_anychar_classes[H]);
addtok (dfa, CAT);
addtok (dfa, OR);
addtok (dfa, dfa->utf8_anychar_classes[F]);
addtok (dfa, I_token);
addtok (dfa, dfa->utf8_anychar_classes[J]);
addtok (dfa, CAT);
addtok (dfa, OR);
addtok (dfa, L_token);
addtok (dfa, dfa->utf8_anychar_classes[M]);
addtok (dfa, CAT);
addtok (dfa, OR);
addtok (dfa, dfa->utf8_anychar_classes[K]);
for (int i = 0; i < 3; i++)
{ {
addtok (dfa, dfa->utf8_anychar_classes[0]); addtok (dfa, dfa->utf8_anychar_classes[C]);
addtok (dfa, CAT); addtok (dfa, CAT);
addtok (dfa, OR); addtok (dfa, OR);
} }
} }
/* The grammar understood by the parser is as follows. /* The grammar understood by the parser is as follows.
regexp: regexp:
regexp OR branch regexp OR branch
branch branch
skipping to change at line 1892 skipping to change at line 1882
{ {
if (dfa->lex.wctok == WEOF) if (dfa->lex.wctok == WEOF)
addtok (dfa, BACKREF); addtok (dfa, BACKREF);
else else
{ {
addtok_wc (dfa, dfa->lex.wctok); addtok_wc (dfa, dfa->lex.wctok);
if (dfa->syntax.case_fold) if (dfa->syntax.case_fold)
{ {
wchar_t folded[CASE_FOLDED_BUFSIZE]; wchar_t folded[CASE_FOLDED_BUFSIZE];
unsigned int n = case_folded_counterparts (dfa->lex.wctok, int n = case_folded_counterparts (dfa->lex.wctok, folded);
folded); for (int i = 0; i < n; i++)
for (unsigned int i = 0; i < n; i++)
{ {
addtok_wc (dfa, folded[i]); addtok_wc (dfa, folded[i]);
addtok (dfa, OR); addtok (dfa, OR);
} }
} }
} }
dfa->parse.tok = lex (dfa); dfa->parse.tok = lex (dfa);
} }
else if (dfa->parse.tok == LPAREN) else if (dfa->parse.tok == LPAREN)
skipping to change at line 1917 skipping to change at line 1906
regexp (dfa); regexp (dfa);
if (dfa->parse.tok != RPAREN) if (dfa->parse.tok != RPAREN)
dfaerror (_("unbalanced (")); dfaerror (_("unbalanced ("));
dfa->parse.tok = lex (dfa); dfa->parse.tok = lex (dfa);
} }
else else
addtok (dfa, EMPTY); addtok (dfa, EMPTY);
} }
/* Return the number of tokens in the given subexpression. */ /* Return the number of tokens in the given subexpression. */
static size_t _GL_ATTRIBUTE_PURE static idx_t _GL_ATTRIBUTE_PURE
nsubtoks (struct dfa const *dfa, size_t tindex) nsubtoks (struct dfa const *dfa, idx_t tindex)
{ {
switch (dfa->tokens[tindex - 1]) switch (dfa->tokens[tindex - 1])
{ {
default: default:
return 1; return 1;
case QMARK: case QMARK:
case STAR: case STAR:
case PLUS: case PLUS:
return 1 + nsubtoks (dfa, tindex - 1); return 1 + nsubtoks (dfa, tindex - 1);
case CAT: case CAT:
case OR: case OR:
{ {
size_t ntoks1 = nsubtoks (dfa, tindex - 1); idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1); return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
} }
} }
} }
/* Copy the given subexpression to the top of the tree. */ /* Copy the given subexpression to the top of the tree. */
static void static void
copytoks (struct dfa *dfa, size_t tindex, size_t ntokens) copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
{ {
if (dfa->localeinfo.multibyte) if (dfa->localeinfo.multibyte)
for (size_t i = 0; i < ntokens; ++i) for (idx_t i = 0; i < ntokens; i++)
addtok_mb (dfa, dfa->tokens[tindex + i], addtok_mb (dfa, dfa->tokens[tindex + i],
dfa->multibyte_prop[tindex + i]); dfa->multibyte_prop[tindex + i]);
else else
for (size_t i = 0; i < ntokens; ++i) for (idx_t i = 0; i < ntokens; i++)
addtok_mb (dfa, dfa->tokens[tindex + i], 3); addtok_mb (dfa, dfa->tokens[tindex + i], 3);
} }
static void static void
closure (struct dfa *dfa) closure (struct dfa *dfa)
{ {
atom (dfa); atom (dfa);
while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
|| dfa->parse.tok == PLUS || dfa->parse.tok == REPMN) || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep)) if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
{ {
size_t ntokens = nsubtoks (dfa, dfa->tindex); idx_t ntokens = nsubtoks (dfa, dfa->tindex);
size_t tindex = dfa->tindex - ntokens; idx_t tindex = dfa->tindex - ntokens;
if (dfa->lex.maxrep < 0) if (dfa->lex.maxrep < 0)
addtok (dfa, PLUS); addtok (dfa, PLUS);
if (dfa->lex.minrep == 0) if (dfa->lex.minrep == 0)
addtok (dfa, QMARK); addtok (dfa, QMARK);
int i; int i;
for (i = 1; i < dfa->lex.minrep; i++) for (i = 1; i < dfa->lex.minrep; i++)
{ {
copytoks (dfa, tindex, ntokens); copytoks (dfa, tindex, ntokens);
addtok (dfa, CAT); addtok (dfa, CAT);
} }
skipping to change at line 2015 skipping to change at line 2004
{ {
branch (dfa); branch (dfa);
while (dfa->parse.tok == OR) while (dfa->parse.tok == OR)
{ {
dfa->parse.tok = lex (dfa); dfa->parse.tok = lex (dfa);
branch (dfa); branch (dfa);
addtok (dfa, OR); addtok (dfa, OR);
} }
} }
/* Main entry point for the parser. S is a string to be parsed, len is the /* Parse a string S of length LEN into D. S can include NUL characters.
length of the string, so s can include NUL characters. D is a pointer to This is the main entry point for the parser. */
the struct dfa to parse into. */ void
static void dfaparse (char const *s, idx_t len, struct dfa *d)
dfaparse (char const *s, size_t len, struct dfa *d)
{ {
d->lex.ptr = s; d->lex.ptr = s;
d->lex.left = len; d->lex.left = len;
d->lex.lasttok = END; d->lex.lasttok = END;
d->lex.laststart = true; d->lex.laststart = true;
if (!d->syntax.syntax_bits_set) if (!d->syntax.syntax_bits_set)
dfaerror (_("no syntax specified")); dfaerror (_("no syntax specified"));
if (!d->nregexps) if (!d->nregexps)
skipping to change at line 2067 skipping to change at line 2055
free (dst->elems); free (dst->elems);
dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1, dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
sizeof *dst->elems); sizeof *dst->elems);
} }
dst->nelem = src->nelem; dst->nelem = src->nelem;
if (src->nelem != 0) if (src->nelem != 0)
memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems); memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
} }
static void static void
alloc_position_set (position_set *s, size_t size) alloc_position_set (position_set *s, idx_t size)
{ {
s->elems = xnmalloc (size, sizeof *s->elems); s->elems = xnmalloc (size, sizeof *s->elems);
s->alloc = size; s->alloc = size;
s->nelem = 0; s->nelem = 0;
} }
/* Insert position P in set S. S is maintained in sorted order on /* Insert position P in set S. S is maintained in sorted order on
decreasing index. If there is already an entry in S with P.index decreasing index. If there is already an entry in S with P.index
then merge (logically-OR) P's constraints into the one in S. then merge (logically-OR) P's constraints into the one in S.
S->elems must point to an array large enough to hold the resulting set. */ S->elems must point to an array large enough to hold the resulting set. */
static void static void
insert (position p, position_set *s) insert (position p, position_set *s)
{ {
ptrdiff_t count = s->nelem; idx_t count = s->nelem;
ptrdiff_t lo = 0, hi = count; idx_t lo = 0, hi = count;
while (lo < hi) while (lo < hi)
{ {
ptrdiff_t mid = (lo + hi) >> 1; idx_t mid = (lo + hi) >> 1;
if (s->elems[mid].index < p.index) if (s->elems[mid].index < p.index)
lo = mid + 1; lo = mid + 1;
else if (s->elems[mid].index == p.index) else if (s->elems[mid].index == p.index)
{ {
s->elems[mid].constraint |= p.constraint; s->elems[mid].constraint |= p.constraint;
return; return;
} }
else else
hi = mid; hi = mid;
} }
s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems); s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
for (ptrdiff_t i = count; i > lo; i--) for (idx_t i = count; i > lo; i--)
s->elems[i] = s->elems[i - 1]; s->elems[i] = s->elems[i - 1];
s->elems[lo] = p; s->elems[lo] = p;
++s->nelem; ++s->nelem;
} }
static void static void
append (position p, position_set *s) append (position p, position_set *s)
{ {
ptrdiff_t count = s->nelem; idx_t count = s->nelem;
s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems); s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
s->elems[s->nelem++] = p; s->elems[s->nelem++] = p;
} }
/* Merge S1 and S2 (with the additional constraint C2) into M. The /* Merge S1 and S2 (with the additional constraint C2) into M. The
result is as if the positions of S1, and of S2 with the additional result is as if the positions of S1, and of S2 with the additional
constraint C2, were inserted into an initially empty set. */ constraint C2, were inserted into an initially empty set. */
static void static void
merge_constrained (position_set const *s1, position_set const *s2, merge_constrained (position_set const *s1, position_set const *s2,
unsigned int c2, position_set *m) unsigned int c2, position_set *m)
{ {
ptrdiff_t i = 0, j = 0; idx_t i = 0, j = 0;
if (m->alloc - s1->nelem < s2->nelem) if (m->alloc - s1->nelem < s2->nelem)
{ {
free (m->elems); free (m->elems);
m->alloc = s1->nelem; m->alloc = s1->nelem;
m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems); m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
} }
m->nelem = 0; m->nelem = 0;
while (i < s1->nelem || j < s2->nelem) while (i < s1->nelem || j < s2->nelem)
if (! (j < s2->nelem) if (! (j < s2->nelem)
skipping to change at line 2163 skipping to change at line 2151
merge (position_set const *s1, position_set const *s2, position_set *m) merge (position_set const *s1, position_set const *s2, position_set *m)
{ {
merge_constrained (s1, s2, -1, m); merge_constrained (s1, s2, -1, m);
} }
static void static void
merge2 (position_set *dst, position_set const *src, position_set *m) merge2 (position_set *dst, position_set const *src, position_set *m)
{ {
if (src->nelem < 4) if (src->nelem < 4)
{ {
for (ptrdiff_t i = 0; i < src->nelem; ++i) for (idx_t i = 0; i < src->nelem; i++)
insert (src->elems[i], dst); insert (src->elems[i], dst);
} }
else else
{ {
merge (src, dst, m); merge (src, dst, m);
copy (m, dst); copy (m, dst);
} }
} }
/* Delete a position from a set. Return the nonzero constraint of the /* Delete a position from a set. Return the nonzero constraint of the
deleted position, or zero if there was no such position. */ deleted position, or zero if there was no such position. */
static unsigned int static unsigned int
delete (size_t del, position_set *s) delete (idx_t del, position_set *s)
{ {
size_t count = s->nelem; idx_t count = s->nelem;
size_t lo = 0, hi = count; idx_t lo = 0, hi = count;
while (lo < hi) while (lo < hi)
{ {
size_t mid = (lo + hi) >> 1; idx_t mid = (lo + hi) >> 1;
if (s->elems[mid].index < del) if (s->elems[mid].index < del)
lo = mid + 1; lo = mid + 1;
else if (s->elems[mid].index == del) else if (s->elems[mid].index == del)
{ {
unsigned int c = s->elems[mid].constraint; unsigned int c = s->elems[mid].constraint;
size_t i; idx_t i;
for (i = mid; i + 1 < count; i++) for (i = mid; i + 1 < count; i++)
s->elems[i] = s->elems[i + 1]; s->elems[i] = s->elems[i + 1];
s->nelem = i; s->nelem = i;
return c; return c;
} }
else else
hi = mid; hi = mid;
} }
return 0; return 0;
} }
/* Replace a position with the followed set. */ /* Replace a position with the followed set. */
static void static void
replace (position_set *dst, size_t del, position_set *add, replace (position_set *dst, idx_t del, position_set *add,
unsigned int constraint, position_set *tmp) unsigned int constraint, position_set *tmp)
{ {
unsigned int c = delete (del, dst) & constraint; unsigned int c = delete (del, dst) & constraint;
if (c) if (c)
{ {
copy (dst, tmp); copy (dst, tmp);
merge_constrained (tmp, add, c, dst); merge_constrained (tmp, add, c, dst);
} }
} }
skipping to change at line 2226 skipping to change at line 2214
state. Context tells whether we got here on a newline or letter. */ state. Context tells whether we got here on a newline or letter. */
static state_num static state_num
state_index (struct dfa *d, position_set const *s, int context) state_index (struct dfa *d, position_set const *s, int context)
{ {
size_t hash = 0; size_t hash = 0;
int constraint = 0; int constraint = 0;
state_num i; state_num i;
token first_end = 0; token first_end = 0;
for (i = 0; i < s->nelem; ++i) for (i = 0; i < s->nelem; ++i)
hash ^= s->elems[i].index + s->elems[i].constraint; {
size_t ind = s->elems[i].index;
hash ^= ind + s->elems[i].constraint;
}
/* Try to find a state that exactly matches the proposed one. */ /* Try to find a state that exactly matches the proposed one. */
for (i = 0; i < d->sindex; ++i) for (i = 0; i < d->sindex; ++i)
{ {
if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
|| context != d->states[i].context) || context != d->states[i].context)
continue; continue;
state_num j; state_num j;
for (j = 0; j < s->nelem; ++j) for (j = 0; j < s->nelem; ++j)
if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
|| s->elems[j].index != d->states[i].elems.elems[j].index) || s->elems[j].index != d->states[i].elems.elems[j].index)
break; break;
if (j == s->nelem) if (j == s->nelem)
return i; return i;
} }
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "new state %zd\n nextpos:", i); fprintf (stderr, "new state %td\n nextpos:", i);
for (state_num j = 0; j < s->nelem; j++) for (state_num j = 0; j < s->nelem; j++)
{ {
fprintf (stderr, " %zu:", s->elems[j].index); fprintf (stderr, " %td:", s->elems[j].index);
prtok (d->tokens[s->elems[j].index]); prtok (d->tokens[s->elems[j].index]);
} }
fprintf (stderr, "\n context:"); fprintf (stderr, "\n context:");
if (context ^ CTX_ANY) if (context ^ CTX_ANY)
{ {
if (context & CTX_NONE) if (context & CTX_NONE)
fprintf (stderr, " CTX_NONE"); fprintf (stderr, " CTX_NONE");
if (context & CTX_LETTER) if (context & CTX_LETTER)
fprintf (stderr, " CTX_LETTER"); fprintf (stderr, " CTX_LETTER");
if (context & CTX_NEWLINE) if (context & CTX_NEWLINE)
skipping to change at line 2308 skipping to change at line 2299
/* Find the epsilon closure of a set of positions. If any position of the set /* Find the epsilon closure of a set of positions. If any position of the set
contains a symbol that matches the empty string in some context, replace contains a symbol that matches the empty string in some context, replace
that position with the elements of its follow labeled with an appropriate that position with the elements of its follow labeled with an appropriate
constraint. Repeat exhaustively until no funny positions are left. constraint. Repeat exhaustively until no funny positions are left.
S->elems must be large enough to hold the result. */ S->elems must be large enough to hold the result. */
static void static void
epsclosure (struct dfa const *d) epsclosure (struct dfa const *d)
{ {
position_set tmp; position_set tmp;
alloc_position_set (&tmp, d->nleaves); alloc_position_set (&tmp, d->nleaves);
for (size_t i = 0; i < d->tindex; ++i) for (idx_t i = 0; i < d->tindex; i++)
if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
&& d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
&& d->tokens[i] != MBCSET && d->tokens[i] < CSET) && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
{ {
unsigned int constraint; unsigned int constraint;
switch (d->tokens[i]) switch (d->tokens[i])
{ {
case BEGLINE: case BEGLINE:
constraint = BEGLINE_CONSTRAINT; constraint = BEGLINE_CONSTRAINT;
break; break;
skipping to change at line 2341 skipping to change at line 2332
case NOTLIMWORD: case NOTLIMWORD:
constraint = NOTLIMWORD_CONSTRAINT; constraint = NOTLIMWORD_CONSTRAINT;
break; break;
default: default:
constraint = NO_CONSTRAINT; constraint = NO_CONSTRAINT;
break; break;
} }
delete (i, &d->follows[i]); delete (i, &d->follows[i]);
for (size_t j = 0; j < d->tindex; j++) for (idx_t j = 0; j < d->tindex; j++)
if (i != j && d->follows[j].nelem > 0) if (i != j && d->follows[j].nelem > 0)
replace (&d->follows[j], i, &d->follows[i], constraint, &tmp); replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
} }
free (tmp.elems); free (tmp.elems);
} }
/* Returns the set of contexts for which there is at least one /* Returns the set of contexts for which there is at least one
character included in C. */ character included in C. */
static int static int
charclass_context (struct dfa const *dfa, charclass const *c) charclass_context (struct dfa const *dfa, charclass const *c)
{ {
int context = 0; int context = 0;
for (unsigned int j = 0; j < CHARCLASS_WORDS; ++j) for (int j = 0; j < CHARCLASS_WORDS; j++)
{ {
if (c->w[j] & dfa->syntax.newline.w[j]) if (c->w[j] & dfa->syntax.newline.w[j])
context |= CTX_NEWLINE; context |= CTX_NEWLINE;
if (c->w[j] & dfa->syntax.letters.w[j]) if (c->w[j] & dfa->syntax.letters.w[j])
context |= CTX_LETTER; context |= CTX_LETTER;
if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j])) if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
context |= CTX_NONE; context |= CTX_NONE;
} }
return context; return context;
skipping to change at line 2380 skipping to change at line 2371
in the set of returned contexts (let's call it SC) may have a different in the set of returned contexts (let's call it SC) may have a different
follow set than other contexts in SC, and also different from the follow set than other contexts in SC, and also different from the
follow set of the complement set (sc ^ CTX_ANY). However, all contexts follow set of the complement set (sc ^ CTX_ANY). However, all contexts
in the complement set will have the same follow set. */ in the complement set will have the same follow set. */
static int _GL_ATTRIBUTE_PURE static int _GL_ATTRIBUTE_PURE
state_separate_contexts (struct dfa *d, position_set const *s) state_separate_contexts (struct dfa *d, position_set const *s)
{ {
int separate_contexts = 0; int separate_contexts = 0;
for (size_t j = 0; j < s->nelem; j++) for (idx_t j = 0; j < s->nelem; j++)
separate_contexts |= d->separates[s->elems[j].index]; separate_contexts |= d->separates[s->elems[j].index];
return separate_contexts; return separate_contexts;
} }
enum enum
{ {
/* Single token is repeated. It is distinguished from non-repeated. */ /* Single token is repeated. It is distinguished from non-repeated. */
OPT_REPEAT = (1 << 0), OPT_REPEAT = (1 << 0),
skipping to change at line 2407 skipping to change at line 2398
/* The node is walked. If the node is found in walking again, OPT_RPAREN /* The node is walked. If the node is found in walking again, OPT_RPAREN
flag is turned on. */ flag is turned on. */
OPT_WALKED = (1 << 3), OPT_WALKED = (1 << 3),
/* The node is queued. The node is not queued again. */ /* The node is queued. The node is not queued again. */
OPT_QUEUED = (1 << 4) OPT_QUEUED = (1 << 4)
}; };
static void static void
merge_nfa_state (struct dfa *d, size_t tindex, char *flags, merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
position_set *merged) position_set *merged)
{ {
position_set *follows = d->follows; position_set *follows = d->follows;
ptrdiff_t nelem = 0; idx_t nelem = 0;
d->constraints[tindex] = 0; d->constraints[tindex] = 0;
for (ptrdiff_t i = 0; i < follows[tindex].nelem; i++) for (idx_t i = 0; i < follows[tindex].nelem; i++)
{ {
size_t sindex = follows[tindex].elems[i].index; idx_t sindex = follows[tindex].elems[i].index;
/* Skip the node as pruned in future. */ /* Skip the node as pruned in future. */
unsigned int iconstraint = follows[tindex].elems[i].constraint; unsigned int iconstraint = follows[tindex].elems[i].constraint;
if (iconstraint == 0) if (iconstraint == 0)
continue; continue;
if (d->tokens[follows[tindex].elems[i].index] <= END) if (d->tokens[follows[tindex].elems[i].index] <= END)
{ {
d->constraints[tindex] |= follows[tindex].elems[i].constraint; d->constraints[tindex] |= follows[tindex].elems[i].constraint;
continue; continue;
} }
if (!(flags[sindex] & (OPT_LPAREN | OPT_RPAREN))) if (!(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
{ {
ptrdiff_t j; idx_t j;
for (j = 0; j < nelem; j++) for (j = 0; j < nelem; j++)
{ {
size_t dindex = follows[tindex].elems[j].index; idx_t dindex = follows[tindex].elems[j].index;
if (follows[tindex].elems[j].constraint != iconstraint) if (follows[tindex].elems[j].constraint != iconstraint)
continue; continue;
if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN)) if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
continue; continue;
if (d->tokens[sindex] != d->tokens[dindex]) if (d->tokens[sindex] != d->tokens[dindex])
continue; continue;
skipping to change at line 2472 skipping to change at line 2463
follows[tindex].elems[nelem++] = follows[tindex].elems[i]; follows[tindex].elems[nelem++] = follows[tindex].elems[i];
flags[sindex] |= OPT_QUEUED; flags[sindex] |= OPT_QUEUED;
} }
follows[tindex].nelem = nelem; follows[tindex].nelem = nelem;
} }
static int static int
compare (const void *a, const void *b) compare (const void *a, const void *b)
{ {
int aindex; position const *p = a, *q = b;
int bindex; return p->index < q->index ? -1 : p->index > q->index;
aindex = (int) ((position *) a)->index;
bindex = (int) ((position *) b)->index;
return aindex - bindex;
} }
static void static void
reorder_tokens (struct dfa *d) reorder_tokens (struct dfa *d)
{ {
ptrdiff_t nleaves; idx_t nleaves;
ptrdiff_t *map; ptrdiff_t *map;
token *tokens; token *tokens;
position_set *follows; position_set *follows;
int *constraints; int *constraints;
char *multibyte_prop; char *multibyte_prop;
nleaves = 0; nleaves = 0;
map = xnmalloc (d->tindex, sizeof *map); map = xnmalloc (d->tindex, sizeof *map);
map[0] = nleaves++; map[0] = nleaves++;
for (ptrdiff_t i = 1; i < d->tindex; i++) for (idx_t i = 1; i < d->tindex; i++)
map[i] = -1; map[i] = -1;
tokens = xnmalloc (d->nleaves, sizeof *tokens); tokens = xnmalloc (d->nleaves, sizeof *tokens);
follows = xnmalloc (d->nleaves, sizeof *follows); follows = xnmalloc (d->nleaves, sizeof *follows);
constraints = xnmalloc (d->nleaves, sizeof *constraints); constraints = xnmalloc (d->nleaves, sizeof *constraints);
if (d->localeinfo.multibyte) if (d->localeinfo.multibyte)
multibyte_prop = xnmalloc (d->nleaves, sizeof *multibyte_prop); multibyte_prop = xnmalloc (d->nleaves, sizeof *multibyte_prop);
else else
multibyte_prop = NULL; multibyte_prop = NULL;
for (ptrdiff_t i = 0; i < d->tindex; i++) for (idx_t i = 0; i < d->tindex; i++)
{ {
if (map[i] == -1) if (map[i] == -1)
{ {
free (d->follows[i].elems); free (d->follows[i].elems);
d->follows[i].elems = NULL; d->follows[i].elems = NULL;
d->follows[i].nelem = 0; d->follows[i].nelem = 0;
continue; continue;
} }
tokens[map[i]] = d->tokens[i]; tokens[map[i]] = d->tokens[i];
follows[map[i]] = d->follows[i]; follows[map[i]] = d->follows[i];
constraints[map[i]] = d->constraints[i]; constraints[map[i]] = d->constraints[i];
if (multibyte_prop != NULL) if (multibyte_prop != NULL)
multibyte_prop[map[i]] = d->multibyte_prop[i]; multibyte_prop[map[i]] = d->multibyte_prop[i];
for (ptrdiff_t j = 0; j < d->follows[i].nelem; j++) for (idx_t j = 0; j < d->follows[i].nelem; j++)
{ {
if (map[d->follows[i].elems[j].index] == -1) if (map[d->follows[i].elems[j].index] == -1)
map[d->follows[i].elems[j].index] = nleaves++; map[d->follows[i].elems[j].index] = nleaves++;
d->follows[i].elems[j].index = map[d->follows[i].elems[j].index]; d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
} }
qsort (d->follows[i].elems, d->follows[i].nelem, qsort (d->follows[i].elems, d->follows[i].nelem,
sizeof *d->follows[i].elems, compare); sizeof *d->follows[i].elems, compare);
} }
for (ptrdiff_t i = 0; i < nleaves; i++) for (idx_t i = 0; i < nleaves; i++)
{ {
d->tokens[i] = tokens[i]; d->tokens[i] = tokens[i];
d->follows[i] = follows[i]; d->follows[i] = follows[i];
d->constraints[i] = constraints[i]; d->constraints[i] = constraints[i];
if (multibyte_prop != NULL) if (multibyte_prop != NULL)
d->multibyte_prop[i] = multibyte_prop[i]; d->multibyte_prop[i] = multibyte_prop[i];
} }
d->tindex = d->nleaves = nleaves; d->tindex = d->nleaves = nleaves;
skipping to change at line 2560 skipping to change at line 2546
free (tokens); free (tokens);
free (follows); free (follows);
free (constraints); free (constraints);
free (multibyte_prop); free (multibyte_prop);
free (map); free (map);
} }
static void static void
dfaoptimize (struct dfa *d) dfaoptimize (struct dfa *d)
{ {
char *flags; char *flags = xzalloc (d->tindex);
position_set merged0;
position_set *merged;
flags = xmalloc (d->tindex * sizeof *flags); for (idx_t i = 0; i < d->tindex; i++)
memset (flags, 0, d->tindex * sizeof *flags);
for (size_t i = 0; i < d->tindex; i++)
{ {
for (ptrdiff_t j = 0; j < d->follows[i].nelem; j++) for (idx_t j = 0; j < d->follows[i].nelem; j++)
{ {
if (d->follows[i].elems[j].index == i) if (d->follows[i].elems[j].index == i)
flags[d->follows[i].elems[j].index] |= OPT_REPEAT; flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
else if (d->follows[i].elems[j].index < i) else if (d->follows[i].elems[j].index < i)
flags[d->follows[i].elems[j].index] |= OPT_LPAREN; flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED) else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
flags[d->follows[i].elems[j].index] |= OPT_RPAREN; flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
else else
flags[d->follows[i].elems[j].index] |= OPT_WALKED; flags[d->follows[i].elems[j].index] |= OPT_WALKED;
} }
} }
flags[0] |= OPT_QUEUED; flags[0] |= OPT_QUEUED;
merged = &merged0; position_set merged0;
position_set *merged = &merged0;
alloc_position_set (merged, d->nleaves); alloc_position_set (merged, d->nleaves);
d->constraints = xnmalloc (d->tindex, sizeof *d->constraints); d->constraints = xnmalloc (d->tindex, sizeof *d->constraints);
for (ptrdiff_t i = 0; i < d->tindex; i++) for (idx_t i = 0; i < d->tindex; i++)
if (flags[i] & OPT_QUEUED) if (flags[i] & OPT_QUEUED)
merge_nfa_state (d, i, flags, merged); merge_nfa_state (d, i, flags, merged);
reorder_tokens (d); reorder_tokens (d);
free (merged->elems); free (merged->elems);
free (flags); free (flags);
} }
/* Perform bottom-up analysis on the parse tree, computing various functions. /* Perform bottom-up analysis on the parse tree, computing various functions.
skipping to change at line 2669 skipping to change at line 2651
position pos; position pos;
position_set tmp; position_set tmp;
/* Stack for element counts and nullable flags. */ /* Stack for element counts and nullable flags. */
struct struct
{ {
/* Whether the entry is nullable. */ /* Whether the entry is nullable. */
bool nullable; bool nullable;
/* Counts of firstpos and lastpos sets. */ /* Counts of firstpos and lastpos sets. */
size_t nfirstpos; idx_t nfirstpos;
size_t nlastpos; idx_t nlastpos;
} *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc; } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
position_set merged; /* Result of merging sets. */ position_set merged; /* Result of merging sets. */
addtok (d, CAT); addtok (d, CAT);
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "dfaanalyze:\n"); fprintf (stderr, "dfaanalyze:\n");
for (size_t i = 0; i < d->tindex; ++i) for (idx_t i = 0; i < d->tindex; i++)
{ {
fprintf (stderr, " %zu:", i); fprintf (stderr, " %td:", i);
prtok (d->tokens[i]); prtok (d->tokens[i]);
} }
putc ('\n', stderr); putc ('\n', stderr);
#endif #endif
d->searchflag = searchflag; d->searchflag = searchflag;
alloc_position_set (&merged, d->nleaves); alloc_position_set (&merged, d->nleaves);
d->follows = xcalloc (d->tindex, sizeof *d->follows); d->follows = xcalloc (d->tindex, sizeof *d->follows);
for (size_t i = 0; i < d->tindex; ++i) for (idx_t i = 0; i < d->tindex; i++)
{ {
switch (d->tokens[i]) switch (d->tokens[i])
{ {
case EMPTY: case EMPTY:
/* The empty set is nullable. */ /* The empty set is nullable. */
stk->nullable = true; stk->nullable = true;
/* The firstpos and lastpos of the empty leaf are both empty. */ /* The firstpos and lastpos of the empty leaf are both empty. */
stk->nfirstpos = stk->nlastpos = 0; stk->nfirstpos = stk->nlastpos = 0;
stk++; stk++;
break; break;
case STAR: case STAR:
case PLUS: case PLUS:
/* Every element in the firstpos of the argument is in the follow /* Every element in the firstpos of the argument is in the follow
of every element in the lastpos. */ of every element in the lastpos. */
{ {
tmp.elems = firstpos - stk[-1].nfirstpos; tmp.elems = firstpos - stk[-1].nfirstpos;
tmp.nelem = stk[-1].nfirstpos; tmp.nelem = stk[-1].nfirstpos;
position *p = lastpos - stk[-1].nlastpos; position *p = lastpos - stk[-1].nlastpos;
for (size_t j = 0; j < stk[-1].nlastpos; j++) for (idx_t j = 0; j < stk[-1].nlastpos; j++)
{ {
merge (&tmp, &d->follows[p[j].index], &merged); merge (&tmp, &d->follows[p[j].index], &merged);
copy (&merged, &d->follows[p[j].index]); copy (&merged, &d->follows[p[j].index]);
} }
} }
FALLTHROUGH; FALLTHROUGH;
case QMARK: case QMARK:
/* A QMARK or STAR node is automatically nullable. */ /* A QMARK or STAR node is automatically nullable. */
if (d->tokens[i] != PLUS) if (d->tokens[i] != PLUS)
stk[-1].nullable = true; stk[-1].nullable = true;
break; break;
case CAT: case CAT:
/* Every element in the firstpos of the second argument is in the /* Every element in the firstpos of the second argument is in the
follow of every element in the lastpos of the first argument. */ follow of every element in the lastpos of the first argument. */
{ {
tmp.nelem = stk[-1].nfirstpos; tmp.nelem = stk[-1].nfirstpos;
tmp.elems = firstpos - stk[-1].nfirstpos; tmp.elems = firstpos - stk[-1].nfirstpos;
position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos; position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
for (size_t j = 0; j < stk[-2].nlastpos; j++) for (idx_t j = 0; j < stk[-2].nlastpos; j++)
{ {
merge (&tmp, &d->follows[p[j].index], &merged); merge (&tmp, &d->follows[p[j].index], &merged);
copy (&merged, &d->follows[p[j].index]); copy (&merged, &d->follows[p[j].index]);
} }
} }
/* The firstpos of a CAT node is the firstpos of the first argument, /* The firstpos of a CAT node is the firstpos of the first argument,
union that of the second argument if the first is nullable. */ union that of the second argument if the first is nullable. */
if (stk[-2].nullable) if (stk[-2].nullable)
stk[-2].nfirstpos += stk[-1].nfirstpos; stk[-2].nfirstpos += stk[-1].nfirstpos;
else else
firstpos -= stk[-1].nfirstpos; firstpos -= stk[-1].nfirstpos;
/* The lastpos of a CAT node is the lastpos of the second argument, /* The lastpos of a CAT node is the lastpos of the second argument,
union that of the first argument if the second is nullable. */ union that of the first argument if the second is nullable. */
if (stk[-1].nullable) if (stk[-1].nullable)
stk[-2].nlastpos += stk[-1].nlastpos; stk[-2].nlastpos += stk[-1].nlastpos;
else else
{ {
position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos; position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
for (size_t j = 0; j < stk[-1].nlastpos; j++) for (idx_t j = 0; j < stk[-1].nlastpos; j++)
p[j] = p[j + stk[-2].nlastpos]; p[j] = p[j + stk[-2].nlastpos];
lastpos -= stk[-2].nlastpos; lastpos -= stk[-2].nlastpos;
stk[-2].nlastpos = stk[-1].nlastpos; stk[-2].nlastpos = stk[-1].nlastpos;
} }
/* A CAT node is nullable if both arguments are nullable. */ /* A CAT node is nullable if both arguments are nullable. */
stk[-2].nullable &= stk[-1].nullable; stk[-2].nullable &= stk[-1].nullable;
stk--; stk--;
break; break;
skipping to change at line 2796 skipping to change at line 2778
stk++; stk++;
firstpos->index = lastpos->index = i; firstpos->index = lastpos->index = i;
firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
firstpos++, lastpos++; firstpos++, lastpos++;
break; break;
} }
#ifdef DEBUG #ifdef DEBUG
/* ... balance the above nonsyntactic #ifdef goo... */ /* ... balance the above nonsyntactic #ifdef goo... */
fprintf (stderr, "node %zu:", i); fprintf (stderr, "node %td:", i);
prtok (d->tokens[i]); prtok (d->tokens[i]);
putc ('\n', stderr); putc ('\n', stderr);
fprintf (stderr, fprintf (stderr,
stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n"); stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
fprintf (stderr, " firstpos:"); fprintf (stderr, " firstpos:");
for (size_t j = 0; j < stk[-1].nfirstpos; j++) for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
{ {
fprintf (stderr, " %zu:", firstpos[j - stk[-1].nfirstpos].index); fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]); prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
} }
fprintf (stderr, "\n lastpos:"); fprintf (stderr, "\n lastpos:");
for (size_t j = 0; j < stk[-1].nlastpos; j++) for (idx_t j = 0; j < stk[-1].nlastpos; j++)
{ {
fprintf (stderr, " %zu:", lastpos[j - stk[-1].nlastpos].index); fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]); prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
} }
putc ('\n', stderr); putc ('\n', stderr);
#endif #endif
} }
/* For each follow set that is the follow set of a real position, replace /* For each follow set that is the follow set of a real position, replace
it with its epsilon closure. */ it with its epsilon closure. */
epsclosure (d); epsclosure (d);
dfaoptimize (d); dfaoptimize (d);
#ifdef DEBUG #ifdef DEBUG
for (size_t i = 0; i < d->tindex; ++i) for (idx_t i = 0; i < d->tindex; i++)
if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
|| d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
|| d->tokens[i] == MBCSET || d->tokens[i] >= CSET) || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
{ {
fprintf (stderr, "follows(%zu:", i); fprintf (stderr, "follows(%td:", i);
prtok (d->tokens[i]); prtok (d->tokens[i]);
fprintf (stderr, "):"); fprintf (stderr, "):");
for (size_t j = 0; j < d->follows[i].nelem; j++) for (idx_t j = 0; j < d->follows[i].nelem; j++)
{ {
fprintf (stderr, " %zu:", d->follows[i].elems[j].index); fprintf (stderr, " %td:", d->follows[i].elems[j].index);
prtok (d->tokens[d->follows[i].elems[j].index]); prtok (d->tokens[d->follows[i].elems[j].index]);
} }
putc ('\n', stderr); putc ('\n', stderr);
} }
#endif #endif
pos.index = 0; pos.index = 0;
pos.constraint = NO_CONSTRAINT; pos.constraint = NO_CONSTRAINT;
alloc_position_set (&tmp, 1); alloc_position_set (&tmp, 1);
append (pos, &tmp); append (pos, &tmp);
d->separates = xnmalloc (d->tindex, sizeof *d->separates); d->separates = xnmalloc (d->tindex, sizeof *d->separates);
for (ptrdiff_t i = 0; i < d->tindex; i++) for (idx_t i = 0; i < d->tindex; i++)
{ {
d->separates[i] = 0; d->separates[i] = 0;
if (prev_newline_dependent (d->constraints[i])) if (prev_newline_dependent (d->constraints[i]))
d->separates[i] |= CTX_NEWLINE; d->separates[i] |= CTX_NEWLINE;
if (prev_letter_dependent (d->constraints[i])) if (prev_letter_dependent (d->constraints[i]))
d->separates[i] |= CTX_LETTER; d->separates[i] |= CTX_LETTER;
for (ptrdiff_t j = 0; j < d->follows[i].nelem; j++) for (idx_t j = 0; j < d->follows[i].nelem; j++)
{ {
if (prev_newline_dependent (d->follows[i].elems[j].constraint)) if (prev_newline_dependent (d->follows[i].elems[j].constraint))
d->separates[i] |= CTX_NEWLINE; d->separates[i] |= CTX_NEWLINE;
if (prev_letter_dependent (d->follows[i].elems[j].constraint)) if (prev_letter_dependent (d->follows[i].elems[j].constraint))
d->separates[i] |= CTX_LETTER; d->separates[i] |= CTX_LETTER;
} }
} }
/* Context wanted by some position. */ /* Context wanted by some position. */
int separate_contexts = state_separate_contexts (d, &tmp); int separate_contexts = state_separate_contexts (d, &tmp);
skipping to change at line 2895 skipping to change at line 2877
} }
/* Make sure D's state arrays are large enough to hold NEW_STATE. */ /* Make sure D's state arrays are large enough to hold NEW_STATE. */
static void static void
realloc_trans_if_necessary (struct dfa *d) realloc_trans_if_necessary (struct dfa *d)
{ {
state_num oldalloc = d->tralloc; state_num oldalloc = d->tralloc;
if (oldalloc < d->sindex) if (oldalloc < d->sindex)
{ {
state_num **realtrans = d->trans ? d->trans - 2 : NULL; state_num **realtrans = d->trans ? d->trans - 2 : NULL;
ptrdiff_t newalloc1 = realtrans ? d->tralloc + 2 : 0; idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc, realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
-1, sizeof *realtrans); -1, sizeof *realtrans);
realtrans[0] = realtrans[1] = NULL; realtrans[0] = realtrans[1] = NULL;
d->trans = realtrans + 2; d->trans = realtrans + 2;
ptrdiff_t newalloc = d->tralloc = newalloc1 - 2; idx_t newalloc = d->tralloc = newalloc1 - 2;
d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails); d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
d->success = xnrealloc (d->success, newalloc, sizeof *d->success); d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines); d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
if (d->localeinfo.multibyte) if (d->localeinfo.multibyte)
{ {
realtrans = d->mb_trans ? d->mb_trans - 2 : NULL; realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans); realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
if (oldalloc == 0) if (oldalloc == 0)
realtrans[0] = realtrans[1] = NULL; realtrans[0] = realtrans[1] = NULL;
d->mb_trans = realtrans + 2; d->mb_trans = realtrans + 2;
skipping to change at line 3013 skipping to change at line 2995
d->success[s] |= CTX_NEWLINE; d->success[s] |= CTX_NEWLINE;
if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d)) if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
d->success[s] |= CTX_LETTER; d->success[s] |= CTX_LETTER;
if (accepts_in_context (d->states[s].context, CTX_NONE, s, d)) if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
d->success[s] |= CTX_NONE; d->success[s] |= CTX_NONE;
alloc_position_set (&follows, d->nleaves); alloc_position_set (&follows, d->nleaves);
/* Find the union of the follows of the positions of the group. /* Find the union of the follows of the positions of the group.
This is a hideously inefficient loop. Fix it someday. */ This is a hideously inefficient loop. Fix it someday. */
for (size_t j = 0; j < d->states[s].elems.nelem; ++j) for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
for (size_t k = 0; for (idx_t k = 0;
k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k) k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
insert (d->follows[d->states[s].elems.elems[j].index].elems[k], insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
&follows); &follows);
/* Positions that match the input char. */ /* Positions that match the input char. */
alloc_position_set (&group, d->nleaves); alloc_position_set (&group, d->nleaves);
/* The group's label. */ /* The group's label. */
charclass label; charclass label;
fillset (&label); fillset (&label);
for (size_t i = 0; i < follows.nelem; ++i) for (idx_t i = 0; i < follows.nelem; i++)
{ {
charclass matches; /* Set of matching characters. */ charclass matches; /* Set of matching characters. */
position pos = follows.elems[i]; position pos = follows.elems[i];
bool matched = false; bool matched = false;
if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
{ {
zeroset (&matches); zeroset (&matches);
setbit (d->tokens[pos.index], &matches); setbit (d->tokens[pos.index], &matches);
if (d->tokens[pos.index] == uc) if (d->tokens[pos.index] == uc)
matched = true; matched = true;
skipping to change at line 3073 skipping to change at line 3055
} }
else else
continue; continue;
/* Some characters may need to be eliminated from matches because /* Some characters may need to be eliminated from matches because
they fail in the current context. */ they fail in the current context. */
if (pos.constraint != NO_CONSTRAINT) if (pos.constraint != NO_CONSTRAINT)
{ {
if (!succeeds_in_context (pos.constraint, if (!succeeds_in_context (pos.constraint,
d->states[s].context, CTX_NEWLINE)) d->states[s].context, CTX_NEWLINE))
for (size_t j = 0; j < CHARCLASS_WORDS; ++j) for (int j = 0; j < CHARCLASS_WORDS; j++)
matches.w[j] &= ~d->syntax.newline.w[j]; matches.w[j] &= ~d->syntax.newline.w[j];
if (!succeeds_in_context (pos.constraint, if (!succeeds_in_context (pos.constraint,
d->states[s].context, CTX_LETTER)) d->states[s].context, CTX_LETTER))
for (size_t j = 0; j < CHARCLASS_WORDS; ++j) for (int j = 0; j < CHARCLASS_WORDS; ++j)
matches.w[j] &= ~d->syntax.letters.w[j]; matches.w[j] &= ~d->syntax.letters.w[j];
if (!succeeds_in_context (pos.constraint, if (!succeeds_in_context (pos.constraint,
d->states[s].context, CTX_NONE)) d->states[s].context, CTX_NONE))
for (size_t j = 0; j < CHARCLASS_WORDS; ++j) for (int j = 0; j < CHARCLASS_WORDS; ++j)
matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j]; matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
/* If there are no characters left, there's no point in going on. */ /* If there are no characters left, there's no point in going on. */
if (emptyset (&matches)) if (emptyset (&matches))
continue; continue;
/* If we have reset the bit that made us declare "matched", reset /* If we have reset the bit that made us declare "matched", reset
that indicator, too. This is required to avoid an infinite loop that indicator, too. This is required to avoid an infinite loop
with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */ with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
if (!tstbit (uc, &matches)) if (!tstbit (uc, &matches))
matched = false; matched = false;
} }
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, " nextpos %zu:", pos.index); fprintf (stderr, " nextpos %td:", pos.index);
prtok (d->tokens[pos.index]); prtok (d->tokens[pos.index]);
fprintf (stderr, " of"); fprintf (stderr, " of");
for (size_t j = 0; j < NOTCHAR; j++) for (unsigned j = 0; j < NOTCHAR; j++)
if (tstbit (j, &matches)) if (tstbit (j, &matches))
fprintf (stderr, " 0x%02zx", j); fprintf (stderr, " 0x%02x", j);
fprintf (stderr, "\n"); fprintf (stderr, "\n");
#endif #endif
if (matched) if (matched)
{ {
for (size_t k = 0; k < CHARCLASS_WORDS; ++k) for (int k = 0; k < CHARCLASS_WORDS; ++k)
label.w[k] &= matches.w[k]; label.w[k] &= matches.w[k];
append (pos, &group); append (pos, &group);
} }
else else
{ {
for (size_t k = 0; k < CHARCLASS_WORDS; ++k) for (int k = 0; k < CHARCLASS_WORDS; ++k)
label.w[k] &= ~matches.w[k]; label.w[k] &= ~matches.w[k];
} }
} }
alloc_position_set (&tmp, d->nleaves); alloc_position_set (&tmp, d->nleaves);
if (group.nelem > 0) if (group.nelem > 0)
{ {
/* If we are building a searching matcher, throw in the positions /* If we are building a searching matcher, throw in the positions
of state 0 as well, if possible. */ of state 0 as well, if possible. */
skipping to change at line 3147 skipping to change at line 3129
state[0] accepts <sb a>, state[i] transits to state[i+1] by state[0] accepts <sb a>, state[i] transits to state[i+1] by
accepting the 1st byte of <mb A>, and state[i+1] accepts the accepting the 1st byte of <mb A>, and state[i+1] accepts the
2nd byte of <mb A>, if state[i+1] encounters the codepoint of 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
<sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
not add state[0]. */ not add state[0]. */
bool mergeit = !d->localeinfo.multibyte; bool mergeit = !d->localeinfo.multibyte;
if (!mergeit) if (!mergeit)
{ {
mergeit = true; mergeit = true;
for (size_t j = 0; mergeit && j < group.nelem; j++) for (idx_t j = 0; mergeit && j < group.nelem; j++)
mergeit &= d->multibyte_prop[group.elems[j].index]; mergeit &= d->multibyte_prop[group.elems[j].index];
} }
if (mergeit) if (mergeit)
{ {
merge (&d->states[0].elems, &group, &tmp); merge (&d->states[0].elems, &group, &tmp);
copy (&tmp, &group); copy (&tmp, &group);
} }
} }
/* Find out if the new state will want any context information, /* Find out if the new state will want any context information,
skipping to change at line 3198 skipping to change at line 3180
state = d->initstate_notbol; state = d->initstate_notbol;
} }
else else
{ {
state_newline = -1; state_newline = -1;
state_letter = -1; state_letter = -1;
state = -1; state = -1;
} }
/* Set the transitions for each character in the label. */ /* Set the transitions for each character in the label. */
for (size_t i = 0; i < NOTCHAR; i++) for (int i = 0; i < NOTCHAR; i++)
if (tstbit (i, &label)) if (tstbit (i, &label))
switch (d->syntax.sbit[i]) switch (d->syntax.sbit[i])
{ {
case CTX_NEWLINE: case CTX_NEWLINE:
trans[i] = state_newline; trans[i] = state_newline;
break; break;
case CTX_LETTER: case CTX_LETTER:
trans[i] = state_letter; trans[i] = state_letter;
break; break;
default: default:
trans[i] = state; trans[i] = state;
break; break;
} }
#ifdef DEBUG #ifdef DEBUG
fprintf (stderr, "trans table %td", s); fprintf (stderr, "trans table %td", s);
for (size_t i = 0; i < NOTCHAR; ++i) for (int i = 0; i < NOTCHAR; ++i)
{ {
if (!(i & 0xf)) if (!(i & 0xf))
fprintf (stderr, "\n"); fprintf (stderr, "\n");
fprintf (stderr, " %2td", trans[i]); fprintf (stderr, " %2td", trans[i]);
} }
fprintf (stderr, "\n"); fprintf (stderr, "\n");
#endif #endif
free (group.elems); free (group.elems);
free (follows.elems); free (follows.elems);
skipping to change at line 3394 skipping to change at line 3376
If COUNT is non-NULL, increment *COUNT once for each newline processed. If COUNT is non-NULL, increment *COUNT once for each newline processed.
If MULTIBYTE, the input consists of multibyte characters and/or If MULTIBYTE, the input consists of multibyte characters and/or
encoding-error bytes. Otherwise, it consists of single-byte characters. encoding-error bytes. Otherwise, it consists of single-byte characters.
Here is the list of features that make this DFA matcher punt: Here is the list of features that make this DFA matcher punt:
- [M-N] range in non-simple locale: regex is up to 25% faster on [a-z] - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
- [^...] in non-simple locale - [^...] in non-simple locale
- [[=foo=]] or [[.foo.]] - [[=foo=]] or [[.foo.]]
- [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK) - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
- back-reference: (.)\1 - back-reference: (.)\1
- word-delimiter in multibyte locale: \<, \>, \b, \B - word-delimiter in multibyte locale: \<, \>, \b, \B
See using_simple_locale for the definition of "simple locale". */ See struct localeinfo.simple for the definition of "simple locale". */
static inline char * static inline char *
dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
size_t *count, bool multibyte) ptrdiff_t *count, bool multibyte)
{ {
if (MAX_TRCOUNT <= d->sindex) if (MAX_TRCOUNT <= d->sindex)
{ {
for (state_num s = d->min_trcount; s < d->sindex; s++) for (state_num s = d->min_trcount; s < d->sindex; s++)
{ {
free (d->states[s].elems.elems); free (d->states[s].elems.elems);
free (d->states[s].mbps.elems); free (d->states[s].mbps.elems);
} }
d->sindex = d->min_trcount; d->sindex = d->min_trcount;
skipping to change at line 3456 skipping to change at line 3438
unsigned char saved_end = *(unsigned char *) end; unsigned char saved_end = *(unsigned char *) end;
*end = eol; *end = eol;
if (multibyte) if (multibyte)
{ {
memset (&d->mbs, 0, sizeof d->mbs); memset (&d->mbs, 0, sizeof d->mbs);
if (d->mb_follows.alloc == 0) if (d->mb_follows.alloc == 0)
alloc_position_set (&d->mb_follows, d->nleaves); alloc_position_set (&d->mb_follows, d->nleaves);
} }
size_t nlcount = 0; idx_t nlcount = 0;
for (;;) for (;;)
{ {
state_num *t; state_num *t;
while ((t = trans[s]) != NULL) while ((t = trans[s]) != NULL)
{ {
if (s < d->min_trcount) if (s < d->min_trcount)
{ {
if (!multibyte || d->states[s].mbps.nelem == 0) if (!multibyte || d->states[s].mbps.nelem == 0)
{ {
while (t[*p] == s) while (t[*p] == s)
skipping to change at line 3581 skipping to change at line 3563
*count += nlcount; *count += nlcount;
*end = saved_end; *end = saved_end;
return (char *) p; return (char *) p;
} }
/* Specialized versions of dfaexec for multibyte and single-byte cases. /* Specialized versions of dfaexec for multibyte and single-byte cases.
This is for performance, as dfaexec_main is an inline function. */ This is for performance, as dfaexec_main is an inline function. */
static char * static char *
dfaexec_mb (struct dfa *d, char const *begin, char *end, dfaexec_mb (struct dfa *d, char const *begin, char *end,
bool allow_nl, size_t *count, bool *backref) bool allow_nl, ptrdiff_t *count, bool *backref)
{ {
return dfaexec_main (d, begin, end, allow_nl, count, true); return dfaexec_main (d, begin, end, allow_nl, count, true);
} }
static char * static char *
dfaexec_sb (struct dfa *d, char const *begin, char *end, dfaexec_sb (struct dfa *d, char const *begin, char *end,
bool allow_nl, size_t *count, bool *backref) bool allow_nl, ptrdiff_t *count, bool *backref)
{ {
return dfaexec_main (d, begin, end, allow_nl, count, false); return dfaexec_main (d, begin, end, allow_nl, count, false);
} }
/* Always set *BACKREF and return BEGIN. Use this wrapper for /* Always set *BACKREF and return BEGIN. Use this wrapper for
any regexp that uses a construct not supported by this code. */ any regexp that uses a construct not supported by this code. */
static char * static char *
dfaexec_noop (struct dfa *d, char const *begin, char *end, dfaexec_noop (struct dfa *d, char const *begin, char *end,
bool allow_nl, size_t *count, bool *backref) bool allow_nl, ptrdiff_t *count, bool *backref)
{ {
*backref = true; *backref = true;
return (char *) begin; return (char *) begin;
} }
/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte), /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
but faster and set *BACKREF if the DFA code does not support this but faster and set *BACKREF if the DFA code does not support this
regexp usage. */ regexp usage. */
char * char *
dfaexec (struct dfa *d, char const *begin, char *end, dfaexec (struct dfa *d, char const *begin, char *end,
bool allow_nl, size_t *count, bool *backref) bool allow_nl, ptrdiff_t *count, bool *backref)
{ {
return d->dfaexec (d, begin, end, allow_nl, count, backref); return d->dfaexec (d, begin, end, allow_nl, count, backref);
} }
struct dfa * struct dfa *
dfasuperset (struct dfa const *d) dfasuperset (struct dfa const *d)
{ {
return d->superset; return d->superset;
} }
skipping to change at line 3646 skipping to change at line 3628
for (s = -1; s < d->tralloc; s++) for (s = -1; s < d->tralloc; s++)
free (d->mb_trans[s]); free (d->mb_trans[s]);
free (d->mb_trans - 2); free (d->mb_trans - 2);
} }
} }
/* Return true if every construct in D is supported by this DFA matcher. */ /* Return true if every construct in D is supported by this DFA matcher. */
static bool _GL_ATTRIBUTE_PURE static bool _GL_ATTRIBUTE_PURE
dfa_supported (struct dfa const *d) dfa_supported (struct dfa const *d)
{ {
for (size_t i = 0; i < d->tindex; i++) for (idx_t i = 0; i < d->tindex; i++)
{ {
switch (d->tokens[i]) switch (d->tokens[i])
{ {
case BEGWORD: case BEGWORD:
case ENDWORD: case ENDWORD:
case LIMWORD: case LIMWORD:
case NOTLIMWORD: case NOTLIMWORD:
if (!d->localeinfo.multibyte) if (!d->localeinfo.multibyte)
continue; continue;
FALLTHROUGH; FALLTHROUGH;
skipping to change at line 3674 skipping to change at line 3656
/* Disable use of the superset DFA if it is not likely to help /* Disable use of the superset DFA if it is not likely to help
performance. */ performance. */
static void static void
maybe_disable_superset_dfa (struct dfa *d) maybe_disable_superset_dfa (struct dfa *d)
{ {
if (!d->localeinfo.using_utf8) if (!d->localeinfo.using_utf8)
return; return;
bool have_backref = false; bool have_backref = false;
for (size_t i = 0; i < d->tindex; ++i) for (idx_t i = 0; i < d->tindex; i++)
{ {
switch (d->tokens[i]) switch (d->tokens[i])
{ {
case ANYCHAR: case ANYCHAR:
/* Lowered. */ /* Lowered. */
abort (); abort ();
case BACKREF: case BACKREF:
have_backref = true; have_backref = true;
break; break;
case MBCSET: case MBCSET:
skipping to change at line 3739 skipping to change at line 3721
{ {
memcpy (sup->charclasses, d->charclasses, memcpy (sup->charclasses, d->charclasses,
d->cindex * sizeof *sup->charclasses); d->cindex * sizeof *sup->charclasses);
} }
sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens); sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
sup->talloc = d->tindex * 2; sup->talloc = d->tindex * 2;
bool have_achar = false; bool have_achar = false;
bool have_nchar = false; bool have_nchar = false;
size_t j; idx_t j;
for (size_t i = j = 0; i < d->tindex; i++) for (idx_t i = j = 0; i < d->tindex; i++)
{ {
switch (d->tokens[i]) switch (d->tokens[i])
{ {
case ANYCHAR: case ANYCHAR:
case MBCSET: case MBCSET:
case BACKREF: case BACKREF:
{ {
charclass ccl; charclass ccl;
fillset (&ccl); fillset (&ccl);
sup->tokens[j++] = CSET + charclass_index (sup, &ccl); sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
skipping to change at line 3789 skipping to change at line 3771
if (have_nchar && (have_achar || d->localeinfo.multibyte)) if (have_nchar && (have_achar || d->localeinfo.multibyte))
d->superset = sup; d->superset = sup;
else else
{ {
dfafree (sup); dfafree (sup);
free (sup); free (sup);
} }
} }
/* Parse and analyze a single string of the given length. */ /* Parse a string S of length LEN into D (but skip this step if S is null).
Then analyze D and build a matcher for it.
SEARCHFLAG says whether to build a searching or an exact matcher. */
void void
dfacomp (char const *s, size_t len, struct dfa *d, bool searchflag) dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
{ {
dfaparse (s, len, d); if (s != NULL)
dfaparse (s, len, d);
dfassbuild (d); dfassbuild (d);
if (dfa_supported (d)) if (dfa_supported (d))
{ {
maybe_disable_superset_dfa (d); maybe_disable_superset_dfa (d);
dfaanalyze (d, searchflag); dfaanalyze (d, searchflag);
} }
else else
{ {
d->dfaexec = dfaexec_noop; d->dfaexec = dfaexec_noop;
skipping to change at line 3826 skipping to change at line 3812
{ {
free (d->charclasses); free (d->charclasses);
free (d->tokens); free (d->tokens);
if (d->localeinfo.multibyte) if (d->localeinfo.multibyte)
free_mbdata (d); free_mbdata (d);
free (d->constraints); free (d->constraints);
free (d->separates); free (d->separates);
for (size_t i = 0; i < d->sindex; ++i) for (idx_t i = 0; i < d->sindex; i++)
{ {
free (d->states[i].elems.elems); free (d->states[i].elems.elems);
free (d->states[i].mbps.elems); free (d->states[i].mbps.elems);
} }
free (d->states); free (d->states);
if (d->follows) if (d->follows)
{ {
for (size_t i = 0; i < d->tindex; ++i) for (idx_t i = 0; i < d->tindex; i++)
free (d->follows[i].elems); free (d->follows[i].elems);
free (d->follows); free (d->follows);
} }
if (d->trans) if (d->trans)
{ {
for (size_t i = 0; i < d->tralloc; ++i) for (idx_t i = 0; i < d->tralloc; i++)
{ {
free (d->trans[i]); free (d->trans[i]);
free (d->fails[i]); free (d->fails[i]);
} }
free (d->trans - 2); free (d->trans - 2);
free (d->fails); free (d->fails);
free (d->newlines); free (d->newlines);
free (d->success); free (d->success);
} }
skipping to change at line 3946 skipping to change at line 3932
or is the automaton you get from "psi|epsilon" (for example) or is the automaton you get from "psi|epsilon" (for example)
the same as the one you get from "psi" (for example)? the same as the one you get from "psi" (for example)?
Are optimizable r.e.'s likely to be used in real-life situations Are optimizable r.e.'s likely to be used in real-life situations
(something like 'ab*' is probably unlikely; something like is (something like 'ab*' is probably unlikely; something like is
'psi|epsilon' is likelier)? */ 'psi|epsilon' is likelier)? */
static char * static char *
icatalloc (char *old, char const *new) icatalloc (char *old, char const *new)
{ {
size_t newsize = strlen (new); idx_t newsize = strlen (new);
if (newsize == 0) if (newsize == 0)
return old; return old;
size_t oldsize = strlen (old); idx_t oldsize = strlen (old);
char *result = xrealloc (old, oldsize + newsize + 1); char *result = xrealloc (old, oldsize + newsize + 1);
memcpy (result + oldsize, new, newsize + 1); memcpy (result + oldsize, new, newsize + 1);
return result; return result;
} }
static void static void
freelist (char **cpp) freelist (char **cpp)
{ {
while (*cpp) while (*cpp)
free (*cpp++); free (*cpp++);
} }
static char ** static char **
enlist (char **cpp, char *new, size_t len) enlist (char **cpp, char *new, idx_t len)
{ {
new = memcpy (xmalloc (len + 1), new, len); new = memcpy (xmalloc (len + 1), new, len);
new[len] = '\0'; new[len] = '\0';
/* Is there already something in the list that's new (or longer)? */ /* Is there already something in the list that's new (or longer)? */
size_t i; idx_t i;
for (i = 0; cpp[i] != NULL; ++i) for (i = 0; cpp[i] != NULL; i++)
if (strstr (cpp[i], new) != NULL) if (strstr (cpp[i], new) != NULL)
{ {
free (new); free (new);
return cpp; return cpp;
} }
/* Eliminate any obsoleted strings. */ /* Eliminate any obsoleted strings. */
for (size_t j = 0; cpp[j] != NULL; ) for (idx_t j = 0; cpp[j] != NULL; )
if (strstr (new, cpp[j]) == NULL) if (strstr (new, cpp[j]) == NULL)
++j; ++j;
else else
{ {
free (cpp[j]); free (cpp[j]);
if (--i == j) if (--i == j)
break; break;
cpp[j] = cpp[i]; cpp[j] = cpp[i];
cpp[i] = NULL; cpp[i] = NULL;
} }
skipping to change at line 4003 skipping to change at line 3989
/* Given pointers to two strings, return a pointer to an allocated /* Given pointers to two strings, return a pointer to an allocated
list of their distinct common substrings. */ list of their distinct common substrings. */
static char ** static char **
comsubs (char *left, char const *right) comsubs (char *left, char const *right)
{ {
char **cpp = xzalloc (sizeof *cpp); char **cpp = xzalloc (sizeof *cpp);
for (char *lcp = left; *lcp != '\0'; lcp++) for (char *lcp = left; *lcp != '\0'; lcp++)
{ {
size_t len = 0; idx_t len = 0;
char *rcp = strchr (right, *lcp); char *rcp = strchr (right, *lcp);
while (rcp != NULL) while (rcp != NULL)
{ {
size_t i; idx_t i;
for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
continue; continue;
if (i > len) if (i > len)
len = i; len = i;
rcp = strchr (rcp + 1, *lcp); rcp = strchr (rcp + 1, *lcp);
} }
if (len != 0) if (len != 0)
cpp = enlist (cpp, lcp, len); cpp = enlist (cpp, lcp, len);
} }
return cpp; return cpp;
skipping to change at line 4035 skipping to change at line 4021
return old; return old;
} }
/* Given two lists of substrings, return a new list giving substrings /* Given two lists of substrings, return a new list giving substrings
common to both. */ common to both. */
static char ** static char **
inboth (char **left, char **right) inboth (char **left, char **right)
{ {
char **both = xzalloc (sizeof *both); char **both = xzalloc (sizeof *both);
for (size_t lnum = 0; left[lnum] != NULL; ++lnum) for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
{ {
for (size_t rnum = 0; right[rnum] != NULL; ++rnum) for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
{ {
char **temp = comsubs (left[lnum], right[rnum]); char **temp = comsubs (left[lnum], right[rnum]);
both = addlists (both, temp); both = addlists (both, temp);
freelist (temp); freelist (temp);
free (temp); free (temp);
} }
} }
return both; return both;
} }
skipping to change at line 4062 skipping to change at line 4048
char **in; char **in;
char *left; char *left;
char *right; char *right;
char *is; char *is;
bool begline; bool begline;
bool endline; bool endline;
must *prev; must *prev;
}; };
static must * static must *
allocmust (must *mp, size_t size) allocmust (must *mp, idx_t size)
{ {
must *new_mp = xmalloc (sizeof *new_mp); must *new_mp = xmalloc (sizeof *new_mp);
new_mp->in = xzalloc (sizeof *new_mp->in); new_mp->in = xzalloc (sizeof *new_mp->in);
new_mp->left = xzalloc (size); new_mp->left = xzalloc (size);
new_mp->right = xzalloc (size); new_mp->right = xzalloc (size);
new_mp->is = xzalloc (size); new_mp->is = xzalloc (size);
new_mp->begline = false; new_mp->begline = false;
new_mp->endline = false; new_mp->endline = false;
new_mp->prev = mp; new_mp->prev = mp;
return new_mp; return new_mp;
skipping to change at line 4106 skipping to change at line 4092
struct dfamust * struct dfamust *
dfamust (struct dfa const *d) dfamust (struct dfa const *d)
{ {
must *mp = NULL; must *mp = NULL;
char const *result = ""; char const *result = "";
bool exact = false; bool exact = false;
bool begline = false; bool begline = false;
bool endline = false; bool endline = false;
bool need_begline = false; bool need_begline = false;
bool need_endline = false; bool need_endline = false;
bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1; bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
for (size_t ri = 1; ri + 1 < d->tindex; ri++) for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
{ {
token t = d->tokens[ri]; token t = d->tokens[ri];
switch (t) switch (t)
{ {
case BEGLINE: case BEGLINE:
mp = allocmust (mp, 2); mp = allocmust (mp, 2);
mp->begline = true; mp->begline = true;
need_begline = true; need_begline = true;
break; break;
case ENDLINE: case ENDLINE:
skipping to change at line 4148 skipping to change at line 4134
case STAR: case STAR:
case QMARK: case QMARK:
resetmust (mp); resetmust (mp);
break; break;
case OR: case OR:
{ {
char **new; char **new;
must *rmp = mp; must *rmp = mp;
must *lmp = mp = mp->prev; must *lmp = mp = mp->prev;
size_t j, ln, rn, n; idx_t j, ln, rn, n;
/* Guaranteed to be. Unlikely, but ... */ /* Guaranteed to be. Unlikely, but ... */
if (streq (lmp->is, rmp->is)) if (streq (lmp->is, rmp->is))
{ {
lmp->begline &= rmp->begline; lmp->begline &= rmp->begline;
lmp->endline &= rmp->endline; lmp->endline &= rmp->endline;
} }
else else
{ {
lmp->is[0] = '\0'; lmp->is[0] = '\0';
lmp->begline = false; lmp->begline = false;
lmp->endline = false; lmp->endline = false;
} }
/* Left side--easy */ /* Left side--easy */
size_t i = 0; idx_t i = 0;
while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
++i; ++i;
lmp->left[i] = '\0'; lmp->left[i] = '\0';
/* Right side */ /* Right side */
ln = strlen (lmp->right); ln = strlen (lmp->right);
rn = strlen (rmp->right); rn = strlen (rmp->right);
n = ln; n = ln;
if (n > rn) if (n > rn)
n = rn; n = rn;
for (i = 0; i < n; ++i) for (i = 0; i < n; ++i)
skipping to change at line 4193 skipping to change at line 4179
freemust (rmp); freemust (rmp);
} }
break; break;
case PLUS: case PLUS:
mp->is[0] = '\0'; mp->is[0] = '\0';
break; break;
case END: case END:
assert (!mp->prev); assert (!mp->prev);
for (size_t i = 0; mp->in[i] != NULL; ++i) for (idx_t i = 0; mp->in[i] != NULL; i++)
if (strlen (mp->in[i]) > strlen (result)) if (strlen (mp->in[i]) > strlen (result))
result = mp->in[i]; result = mp->in[i];
if (streq (result, mp->is)) if (streq (result, mp->is))
{ {
if ((!need_begline || mp->begline) && (!need_endline if ((!need_begline || mp->begline) && (!need_endline
|| mp->endline)) || mp->endline))
exact = true; exact = true;
begline = mp->begline; begline = mp->begline;
endline = mp->endline; endline = mp->endline;
} }
skipping to change at line 4217 skipping to change at line 4203
{ {
must *rmp = mp; must *rmp = mp;
must *lmp = mp = mp->prev; must *lmp = mp = mp->prev;
/* In. Everything in left, plus everything in /* In. Everything in left, plus everything in
right, plus concatenation of right, plus concatenation of
left's right and right's left. */ left's right and right's left. */
lmp->in = addlists (lmp->in, rmp->in); lmp->in = addlists (lmp->in, rmp->in);
if (lmp->right[0] != '\0' && rmp->left[0] != '\0') if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
{ {
size_t lrlen = strlen (lmp->right); idx_t lrlen = strlen (lmp->right);
size_t rllen = strlen (rmp->left); idx_t rllen = strlen (rmp->left);
char *tp = xmalloc (lrlen + rllen); char *tp = xmalloc (lrlen + rllen);
memcpy (tp, lmp->right, lrlen); memcpy (tp, lmp->right, lrlen);
memcpy (tp + lrlen, rmp->left, rllen); memcpy (tp + lrlen, rmp->left, rllen);
lmp->in = enlist (lmp->in, tp, lrlen + rllen); lmp->in = enlist (lmp->in, tp, lrlen + rllen);
free (tp); free (tp);
} }
/* Left-hand */ /* Left-hand */
if (lmp->is[0] != '\0') if (lmp->is[0] != '\0')
lmp->left = icatalloc (lmp->left, rmp->left); lmp->left = icatalloc (lmp->left, rmp->left);
/* Right-hand */ /* Right-hand */
skipping to change at line 4283 skipping to change at line 4269
&& ! (case_fold_unibyte && ! (case_fold_unibyte
&& toupper (j) == toupper (t))) && toupper (j) == toupper (t)))
break; break;
if (j < NOTCHAR) if (j < NOTCHAR)
{ {
mp = allocmust (mp, 2); mp = allocmust (mp, 2);
break; break;
} }
} }
size_t rj = ri + 2; idx_t rj = ri + 2;
if (d->tokens[ri + 1] == CAT) if (d->tokens[ri + 1] == CAT)
{ {
for (; rj < d->tindex - 1; rj += 2) for (; rj < d->tindex - 1; rj += 2)
{ {
if ((rj != ri && (d->tokens[rj] <= 0 if ((rj != ri && (d->tokens[rj] <= 0
|| NOTCHAR <= d->tokens[rj])) || NOTCHAR <= d->tokens[rj]))
|| d->tokens[rj + 1] != CAT) || d->tokens[rj + 1] != CAT)
break; break;
} }
} }
mp = allocmust (mp, ((rj - ri) >> 1) + 1); mp = allocmust (mp, ((rj - ri) >> 1) + 1);
mp->is[0] = mp->left[0] = mp->right[0] mp->is[0] = mp->left[0] = mp->right[0]
= case_fold_unibyte ? toupper (t) : t; = case_fold_unibyte ? toupper (t) : t;
size_t i; idx_t i;
for (i = 1; ri + 2 < rj; i++) for (i = 1; ri + 2 < rj; i++)
{ {
ri += 2; ri += 2;
t = d->tokens[ri]; t = d->tokens[ri];
mp->is[i] = mp->left[i] = mp->right[i] mp->is[i] = mp->left[i] = mp->right[i]
= case_fold_unibyte ? toupper (t) : t; = case_fold_unibyte ? toupper (t) : t;
} }
mp->is[i] = mp->left[i] = mp->right[i] = '\0'; mp->is[i] = mp->left[i] = mp->right[i] = '\0';
mp->in = enlist (mp->in, mp->is, i); mp->in = enlist (mp->in, mp->is, i);
break; break;
} }
} }
done:; done:;
struct dfamust *dm = NULL; struct dfamust *dm = NULL;
if (*result) if (*result)
{ {
dm = xmalloc (sizeof *dm); dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
dm->exact = exact; dm->exact = exact;
dm->begline = begline; dm->begline = begline;
dm->endline = endline; dm->endline = endline;
dm->must = xstrdup (result); strcpy (dm->must, result);
} }
while (mp) while (mp)
{ {
must *prev = mp->prev; must *prev = mp->prev;
freemust (mp); freemust (mp);
mp = prev; mp = prev;
} }
return dm; return dm;
} }
void void
dfamustfree (struct dfamust *dm) dfamustfree (struct dfamust *dm)
{ {
free (dm->must);
free (dm); free (dm);
} }
struct dfa * struct dfa *
dfaalloc (void) dfaalloc (void)
{ {
return xzalloc (sizeof (struct dfa)); return xmalloc (sizeof (struct dfa));
} }
/* Initialize DFA. */ /* Initialize DFA. */
void void
dfasyntax (struct dfa *dfa, struct localeinfo const *linfo, dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
reg_syntax_t bits, int dfaopts) reg_syntax_t bits, int dfaopts)
{ {
memset (dfa, 0, offsetof (struct dfa, dfaexec)); memset (dfa, 0, offsetof (struct dfa, dfaexec));
dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb; dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
dfa->simple_locale = using_simple_locale (linfo->multibyte);
dfa->localeinfo = *linfo; dfa->localeinfo = *linfo;
dfa->fast = !dfa->localeinfo.multibyte; dfa->fast = !dfa->localeinfo.multibyte;
dfa->canychar = -1; dfa->canychar = -1;
dfa->lex.cur_mb_len = 1;
dfa->syntax.syntax_bits_set = true; dfa->syntax.syntax_bits_set = true;
dfa->syntax.case_fold = (bits & RE_ICASE) != 0; dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0; dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n'; dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
dfa->syntax.syntax_bits = bits; dfa->syntax.syntax_bits = bits;
for (int i = CHAR_MIN; i <= CHAR_MAX; ++i) for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
{ {
unsigned char uc = i; unsigned char uc = i;
skipping to change at line 4389 skipping to change at line 4372
} }
/* POSIX requires that the five bytes in "\n\r./" (including the /* POSIX requires that the five bytes in "\n\r./" (including the
terminating NUL) cannot occur inside a multibyte character. */ terminating NUL) cannot occur inside a multibyte character. */
dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
? (uc & 0xc0) != 0x80 ? (uc & 0xc0) != 0x80
: strchr ("\n\r./", uc) != NULL); : strchr ("\n\r./", uc) != NULL);
} }
} }
/* Initialize TO by copying FROM's syntax settings. */
void
dfacopysyntax (struct dfa *to, struct dfa const *from)
{
memset (to, 0, offsetof (struct dfa, syntax));
to->canychar = -1;
to->fast = from->fast;
to->syntax = from->syntax;
to->dfaexec = from->dfaexec;
to->localeinfo = from->localeinfo;
}
/* vim:set shiftwidth=2: */ /* vim:set shiftwidth=2: */
 End of changes. 187 change blocks. 
342 lines changed or deleted 338 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)