"Fossies" - the Fresh Open Source Software Archive

Member "ripgrep-11.0.1/grep-regex/src/non_matching.rs" (16 Apr 2019, 3825 Bytes) of package /linux/privat/ripgrep-11.0.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) Rust source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 use grep_matcher::ByteSet;
    2 use regex_syntax::hir::{self, Hir, HirKind};
    3 use utf8_ranges::Utf8Sequences;
    4 
    5 /// Return a confirmed set of non-matching bytes from the given expression.
    6 pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
    7     let mut set = ByteSet::full();
    8     remove_matching_bytes(expr, &mut set);
    9     set
   10 }
   11 
   12 /// Remove any bytes from the given set that can occur in a matched produced by
   13 /// the given expression.
   14 fn remove_matching_bytes(
   15     expr: &Hir,
   16     set: &mut ByteSet,
   17 ) {
   18     match *expr.kind() {
   19         HirKind::Empty
   20         | HirKind::Anchor(_)
   21         | HirKind::WordBoundary(_) => {}
   22         HirKind::Literal(hir::Literal::Unicode(c)) => {
   23             for &b in c.encode_utf8(&mut [0; 4]).as_bytes() {
   24                 set.remove(b);
   25             }
   26         }
   27         HirKind::Literal(hir::Literal::Byte(b)) => {
   28             set.remove(b);
   29         }
   30         HirKind::Class(hir::Class::Unicode(ref cls)) => {
   31             for range in cls.iter() {
   32                 // This is presumably faster than encoding every codepoint
   33                 // to UTF-8 and then removing those bytes from the set.
   34                 for seq in Utf8Sequences::new(range.start(), range.end()) {
   35                     for byte_range in seq.as_slice() {
   36                         set.remove_all(byte_range.start, byte_range.end);
   37                     }
   38                 }
   39             }
   40         }
   41         HirKind::Class(hir::Class::Bytes(ref cls)) => {
   42             for range in cls.iter() {
   43                 set.remove_all(range.start(), range.end());
   44             }
   45         }
   46         HirKind::Repetition(ref x) => {
   47             remove_matching_bytes(&x.hir, set);
   48         }
   49         HirKind::Group(ref x) => {
   50             remove_matching_bytes(&x.hir, set);
   51         }
   52         HirKind::Concat(ref xs) => {
   53             for x in xs {
   54                 remove_matching_bytes(x, set);
   55             }
   56         }
   57         HirKind::Alternation(ref xs) => {
   58             for x in xs {
   59                 remove_matching_bytes(x, set);
   60             }
   61         }
   62     }
   63 }
   64 
   65 #[cfg(test)]
   66 mod tests {
   67     use grep_matcher::ByteSet;
   68     use regex_syntax::ParserBuilder;
   69 
   70     use super::non_matching_bytes;
   71 
   72     fn extract(pattern: &str) -> ByteSet {
   73         let expr = ParserBuilder::new()
   74             .allow_invalid_utf8(true)
   75             .build()
   76             .parse(pattern)
   77             .unwrap();
   78         non_matching_bytes(&expr)
   79     }
   80 
   81     fn sparse(set: &ByteSet) -> Vec<u8> {
   82         let mut sparse_set = vec![];
   83         for b in (0..256).map(|b| b as u8) {
   84             if set.contains(b) {
   85                 sparse_set.push(b);
   86             }
   87         }
   88         sparse_set
   89     }
   90 
   91     fn sparse_except(except: &[u8]) -> Vec<u8> {
   92         let mut except_set = vec![false; 256];
   93         for &b in except {
   94             except_set[b as usize] = true;
   95         }
   96 
   97         let mut set = vec![];
   98         for b in (0..256).map(|b| b as u8) {
   99             if !except_set[b as usize] {
  100                 set.push(b);
  101             }
  102         }
  103         set
  104     }
  105 
  106     #[test]
  107     fn dot() {
  108         assert_eq!(sparse(&extract(".")), vec![
  109             b'\n',
  110             192, 193, 245, 246, 247, 248, 249,
  111             250, 251, 252, 253, 254, 255,
  112         ]);
  113         assert_eq!(sparse(&extract("(?s).")), vec![
  114             192, 193, 245, 246, 247, 248, 249,
  115             250, 251, 252, 253, 254, 255,
  116         ]);
  117         assert_eq!(sparse(&extract("(?-u).")), vec![b'\n']);
  118         assert_eq!(sparse(&extract("(?s-u).")), vec![]);
  119     }
  120 
  121     #[test]
  122     fn literal() {
  123         assert_eq!(sparse(&extract("a")), sparse_except(&[b'a']));
  124         assert_eq!(sparse(&extract("☃")), sparse_except(&[0xE2, 0x98, 0x83]));
  125         assert_eq!(sparse(&extract(r"\xFF")), sparse_except(&[0xC3, 0xBF]));
  126         assert_eq!(sparse(&extract(r"(?-u)\xFF")), sparse_except(&[0xFF]));
  127     }
  128 }