"Fossies" - the Fresh Open Source Software Archive

Member "ripgrep-11.0.1/grep-regex/src/config.rs" (16 Apr 2019, 11712 Bytes) of package /linux/privat/ripgrep-11.0.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) Rust source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. See also the latest Fossies "Diffs" side-by-side code changes report for "config.rs": 0.10.0_vs_11.0.0.

    1 use grep_matcher::{ByteSet, LineTerminator};
    2 use regex::bytes::{Regex, RegexBuilder};
    3 use regex_syntax::ast::{self, Ast};
    4 use regex_syntax::hir::{self, Hir};
    5 
    6 use ast::AstAnalysis;
    7 use crlf::crlfify;
    8 use error::Error;
    9 use literal::LiteralSets;
   10 use multi::alternation_literals;
   11 use non_matching::non_matching_bytes;
   12 use strip::strip_from_match;
   13 
   14 /// Config represents the configuration of a regex matcher in this crate.
   15 /// The configuration is itself a rough combination of the knobs found in
   16 /// the `regex` crate itself, along with additional `grep-matcher` specific
   17 /// options.
   18 ///
   19 /// The configuration can be used to build a "configured" HIR expression. A
   20 /// configured HIR expression is an HIR expression that is aware of the
   21 /// configuration which generated it, and provides transformation on that HIR
   22 /// such that the configuration is preserved.
   23 #[derive(Clone, Debug)]
   24 pub struct Config {
   25     pub case_insensitive: bool,
   26     pub case_smart: bool,
   27     pub multi_line: bool,
   28     pub dot_matches_new_line: bool,
   29     pub swap_greed: bool,
   30     pub ignore_whitespace: bool,
   31     pub unicode: bool,
   32     pub octal: bool,
   33     pub size_limit: usize,
   34     pub dfa_size_limit: usize,
   35     pub nest_limit: u32,
   36     pub line_terminator: Option<LineTerminator>,
   37     pub crlf: bool,
   38     pub word: bool,
   39 }
   40 
   41 impl Default for Config {
   42     fn default() -> Config {
   43         Config {
   44             case_insensitive: false,
   45             case_smart: false,
   46             multi_line: false,
   47             dot_matches_new_line: false,
   48             swap_greed: false,
   49             ignore_whitespace: false,
   50             unicode: true,
   51             octal: false,
   52             // These size limits are much bigger than what's in the regex
   53             // crate.
   54             size_limit: 100 * (1<<20),
   55             dfa_size_limit: 1000 * (1<<20),
   56             nest_limit: 250,
   57             line_terminator: None,
   58             crlf: false,
   59             word: false,
   60         }
   61     }
   62 }
   63 
   64 impl Config {
   65     /// Parse the given pattern and returned its HIR expression along with
   66     /// the current configuration.
   67     ///
   68     /// If there was a problem parsing the given expression then an error
   69     /// is returned.
   70     pub fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
   71         let ast = self.ast(pattern)?;
   72         let analysis = self.analysis(&ast)?;
   73         let expr = hir::translate::TranslatorBuilder::new()
   74             .allow_invalid_utf8(true)
   75             .case_insensitive(self.is_case_insensitive(&analysis))
   76             .multi_line(self.multi_line)
   77             .dot_matches_new_line(self.dot_matches_new_line)
   78             .swap_greed(self.swap_greed)
   79             .unicode(self.unicode)
   80             .build()
   81             .translate(pattern, &ast)
   82             .map_err(Error::regex)?;
   83         let expr = match self.line_terminator {
   84             None => expr,
   85             Some(line_term) => strip_from_match(expr, line_term)?,
   86         };
   87         Ok(ConfiguredHIR {
   88             original: pattern.to_string(),
   89             config: self.clone(),
   90             analysis: analysis,
   91             // If CRLF mode is enabled, replace `$` with `(?:\r?$)`.
   92             expr: if self.crlf { crlfify(expr) } else { expr },
   93         })
   94     }
   95 
   96     /// Accounting for the `smart_case` config knob, return true if and only if
   97     /// this pattern should be matched case insensitively.
   98     fn is_case_insensitive(
   99         &self,
  100         analysis: &AstAnalysis,
  101     ) -> bool {
  102         if self.case_insensitive {
  103             return true;
  104         }
  105         if !self.case_smart {
  106             return false;
  107         }
  108         analysis.any_literal() && !analysis.any_uppercase()
  109     }
  110 
  111     /// Returns true if and only if this config is simple enough such that
  112     /// if the pattern is a simple alternation of literals, then it can be
  113     /// constructed via a plain Aho-Corasick automaton.
  114     ///
  115     /// Note that it is OK to return true even when settings like `multi_line`
  116     /// are enabled, since if multi-line can impact the match semantics of a
  117     /// regex, then it is by definition not a simple alternation of literals.
  118     pub fn can_plain_aho_corasick(&self) -> bool {
  119         !self.word
  120         && !self.case_insensitive
  121         && !self.case_smart
  122     }
  123 
  124     /// Perform analysis on the AST of this pattern.
  125     ///
  126     /// This returns an error if the given pattern failed to parse.
  127     fn analysis(&self, ast: &Ast) -> Result<AstAnalysis, Error> {
  128         Ok(AstAnalysis::from_ast(ast))
  129     }
  130 
  131     /// Parse the given pattern into its abstract syntax.
  132     ///
  133     /// This returns an error if the given pattern failed to parse.
  134     fn ast(&self, pattern: &str) -> Result<Ast, Error> {
  135         ast::parse::ParserBuilder::new()
  136             .nest_limit(self.nest_limit)
  137             .octal(self.octal)
  138             .ignore_whitespace(self.ignore_whitespace)
  139             .build()
  140             .parse(pattern)
  141             .map_err(Error::regex)
  142     }
  143 }
  144 
  145 /// A "configured" HIR expression, which is aware of the configuration which
  146 /// produced this HIR.
  147 ///
  148 /// Since the configuration is tracked, values with this type can be
  149 /// transformed into other HIR expressions (or regular expressions) in a way
  150 /// that preserves the configuration. For example, the `fast_line_regex`
  151 /// method will apply literal extraction to the inner HIR and use that to build
  152 /// a new regex that matches the extracted literals in a way that is
  153 /// consistent with the configuration that produced this HIR. For example, the
  154 /// size limits set on the configured HIR will be propagated out to any
  155 /// subsequently constructed HIR or regular expression.
  156 #[derive(Clone, Debug)]
  157 pub struct ConfiguredHIR {
  158     original: String,
  159     config: Config,
  160     analysis: AstAnalysis,
  161     expr: Hir,
  162 }
  163 
  164 impl ConfiguredHIR {
  165     /// Return the configuration for this HIR expression.
  166     pub fn config(&self) -> &Config {
  167         &self.config
  168     }
  169 
  170     /// Compute the set of non-matching bytes for this HIR expression.
  171     pub fn non_matching_bytes(&self) -> ByteSet {
  172         non_matching_bytes(&self.expr)
  173     }
  174 
  175     /// Returns true if and only if this regex needs to have its match offsets
  176     /// tweaked because of CRLF support. Specifically, this occurs when the
  177     /// CRLF hack is enabled and the regex is line anchored at the end. In
  178     /// this case, matches that end with a `\r` have the `\r` stripped.
  179     pub fn needs_crlf_stripped(&self) -> bool {
  180         self.config.crlf && self.expr.is_line_anchored_end()
  181     }
  182 
  183     /// Builds a regular expression from this HIR expression.
  184     pub fn regex(&self) -> Result<Regex, Error> {
  185         self.pattern_to_regex(&self.expr.to_string())
  186     }
  187 
  188     /// If this HIR corresponds to an alternation of literals with no
  189     /// capturing groups, then this returns those literals.
  190     pub fn alternation_literals(&self) -> Option<Vec<Vec<u8>>> {
  191         if !self.config.can_plain_aho_corasick() {
  192             return None;
  193         }
  194         alternation_literals(&self.expr)
  195     }
  196 
  197     /// Applies the given function to the concrete syntax of this HIR and then
  198     /// generates a new HIR based on the result of the function in a way that
  199     /// preserves the configuration.
  200     ///
  201     /// For example, this can be used to wrap a user provided regular
  202     /// expression with additional semantics. e.g., See the `WordMatcher`.
  203     pub fn with_pattern<F: FnMut(&str) -> String>(
  204         &self,
  205         mut f: F,
  206     ) -> Result<ConfiguredHIR, Error>
  207     {
  208         self.pattern_to_hir(&f(&self.expr.to_string()))
  209     }
  210 
  211     /// If the current configuration has a line terminator set and if useful
  212     /// literals could be extracted, then a regular expression matching those
  213     /// literals is returned. If no line terminator is set, then `None` is
  214     /// returned.
  215     ///
  216     /// If compiling the resulting regular expression failed, then an error
  217     /// is returned.
  218     ///
  219     /// This method only returns something when a line terminator is set
  220     /// because matches from this regex are generally candidates that must be
  221     /// confirmed before reporting a match. When performing a line oriented
  222     /// search, confirmation is easy: just extend the candidate match to its
  223     /// respective line boundaries and then re-search that line for a full
  224     /// match. This only works when the line terminator is set because the line
  225     /// terminator setting guarantees that the regex itself can never match
  226     /// through the line terminator byte.
  227     pub fn fast_line_regex(&self) -> Result<Option<Regex>, Error> {
  228         if self.config.line_terminator.is_none() {
  229             return Ok(None);
  230         }
  231         match LiteralSets::new(&self.expr).one_regex(self.config.word) {
  232             None => Ok(None),
  233             Some(pattern) => self.pattern_to_regex(&pattern).map(Some),
  234         }
  235     }
  236 
  237     /// Create a regex from the given pattern using this HIR's configuration.
  238     fn pattern_to_regex(&self, pattern: &str) -> Result<Regex, Error> {
  239         // The settings we explicitly set here are intentionally a subset
  240         // of the settings we have. The key point here is that our HIR
  241         // expression is computed with the settings in mind, such that setting
  242         // them here could actually lead to unintended behavior. For example,
  243         // consider the pattern `(?U)a+`. This will get folded into the HIR
  244         // as a non-greedy repetition operator which will in turn get printed
  245         // to the concrete syntax as `a+?`, which is correct. But if we
  246         // set the `swap_greed` option again, then we'll wind up with `(?U)a+?`
  247         // which is equal to `a+` which is not the same as what we were given.
  248         //
  249         // We also don't need to apply `case_insensitive` since this gets
  250         // folded into the HIR and would just cause us to do redundant work.
  251         //
  252         // Finally, we don't need to set `ignore_whitespace` since the concrete
  253         // syntax emitted by the HIR printer never needs it.
  254         //
  255         // We set the rest of the options. Some of them are important, such as
  256         // the size limit, and some of them are necessary to preserve the
  257         // intention of the original pattern. For example, the Unicode flag
  258         // will impact how the WordMatcher functions, namely, whether its
  259         // word boundaries are Unicode aware or not.
  260         RegexBuilder::new(&pattern)
  261             .nest_limit(self.config.nest_limit)
  262             .octal(self.config.octal)
  263             .multi_line(self.config.multi_line)
  264             .dot_matches_new_line(self.config.dot_matches_new_line)
  265             .unicode(self.config.unicode)
  266             .size_limit(self.config.size_limit)
  267             .dfa_size_limit(self.config.dfa_size_limit)
  268             .build()
  269             .map_err(Error::regex)
  270     }
  271 
  272     /// Create an HIR expression from the given pattern using this HIR's
  273     /// configuration.
  274     fn pattern_to_hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
  275         // See `pattern_to_regex` comment for explanation of why we only set
  276         // a subset of knobs here. e.g., `swap_greed` is explicitly left out.
  277         let expr = ::regex_syntax::ParserBuilder::new()
  278             .nest_limit(self.config.nest_limit)
  279             .octal(self.config.octal)
  280             .allow_invalid_utf8(true)
  281             .multi_line(self.config.multi_line)
  282             .dot_matches_new_line(self.config.dot_matches_new_line)
  283             .unicode(self.config.unicode)
  284             .build()
  285             .parse(pattern)
  286             .map_err(Error::regex)?;
  287         Ok(ConfiguredHIR {
  288             original: self.original.clone(),
  289             config: self.config.clone(),
  290             analysis: self.analysis.clone(),
  291             expr: expr,
  292         })
  293     }
  294 }