"Fossies" - the Fresh Open Source Software Archive

Member "ripgrep-11.0.1/grep-searcher/src/searcher/mod.rs" (16 Apr 2019, 38687 Bytes) of package /linux/privat/ripgrep-11.0.1.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Rust source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively, you can view or download the uninterpreted source code file here. See also the last Fossies "Diffs" side-by-side code changes report for "mod.rs": 0.10.0_vs_11.0.0.

    1 use std::cell::RefCell;
    2 use std::cmp;
    3 use std::fmt;
    4 use std::fs::File;
    5 use std::io::{self, Read};
    6 use std::path::Path;
    7 
    8 use encoding_rs;
    9 use encoding_rs_io::DecodeReaderBytesBuilder;
   10 use grep_matcher::{LineTerminator, Match, Matcher};
   11 use line_buffer::{
   12     self, BufferAllocation, LineBuffer, LineBufferBuilder, LineBufferReader,
   13     DEFAULT_BUFFER_CAPACITY, alloc_error,
   14 };
   15 use searcher::glue::{ReadByLine, SliceByLine, MultiLine};
   16 use sink::{Sink, SinkError};
   17 
   18 pub use self::mmap::MmapChoice;
   19 
   20 mod core;
   21 mod glue;
   22 mod mmap;
   23 
/// An internal alias for the matcher crate's `Match` type.
///
/// We use this type alias since we want the ergonomics of a matcher's `Match`
/// type, but in practice, we use it for arbitrary ranges, so give it a more
/// accurate name. This is only used in the searcher's internals.
type Range = Match;
   28 
/// The behavior of binary detection while searching.
///
/// Binary detection is the process of _heuristically_ identifying whether a
/// given chunk of data is binary or not, and then taking an action based on
/// the result of that heuristic. The motivation behind detecting binary data
/// is that binary data often indicates data that is undesirable to search
/// using textual patterns. Of course, there are many cases in which this isn't
/// true, which is why binary detection is disabled by default.
///
/// Unfortunately, binary detection works differently depending on the type of
/// search being executed:
///
/// 1. When performing a search using a fixed size buffer, binary detection is
///    applied to the buffer's contents as it is filled. Binary detection must
///    be applied to the buffer directly because binary files may not contain
///    line terminators, which could result in exorbitant memory usage.
/// 2. When performing a search using memory maps or by reading data off the
///    heap, then binary detection is only guaranteed to be applied to the
///    parts corresponding to a match. When `Quit` is enabled, then the first
///    few KB of the data are searched for binary data.
///
/// This is a newtype around the line buffer's own `BinaryDetection` type.
/// Values are created with the `none`, `quit` and `convert` constructors.
#[derive(Clone, Debug, Default)]
pub struct BinaryDetection(line_buffer::BinaryDetection);
   51 
   52 impl BinaryDetection {
   53     /// No binary detection is performed. Data reported by the searcher may
   54     /// contain arbitrary bytes.
   55     ///
   56     /// This is the default.
   57     pub fn none() -> BinaryDetection {
   58         BinaryDetection(line_buffer::BinaryDetection::None)
   59     }
   60 
   61     /// Binary detection is performed by looking for the given byte.
   62     ///
   63     /// When searching is performed using a fixed size buffer, then the
   64     /// contents of that buffer are always searched for the presence of this
   65     /// byte. If it is found, then the underlying data is considered binary
   66     /// and the search stops as if it reached EOF.
   67     ///
   68     /// When searching is performed with the entire contents mapped into
   69     /// memory, then binary detection is more conservative. Namely, only a
   70     /// fixed sized region at the beginning of the contents are detected for
   71     /// binary data. As a compromise, any subsequent matching (or context)
   72     /// lines are also searched for binary data. If binary data is detected at
   73     /// any point, then the search stops as if it reached EOF.
   74     pub fn quit(binary_byte: u8) -> BinaryDetection {
   75         BinaryDetection(line_buffer::BinaryDetection::Quit(binary_byte))
   76     }
   77 
   78     /// Binary detection is performed by looking for the given byte, and
   79     /// replacing it with the line terminator configured on the searcher.
   80     /// (If the searcher is configured to use `CRLF` as the line terminator,
   81     /// then this byte is replaced by just `LF`.)
   82     ///
   83     /// When searching is performed using a fixed size buffer, then the
   84     /// contents of that buffer are always searched for the presence of this
   85     /// byte and replaced with the line terminator. In effect, the caller is
   86     /// guaranteed to never observe this byte while searching.
   87     ///
   88     /// When searching is performed with the entire contents mapped into
   89     /// memory, then this setting has no effect and is ignored.
   90     pub fn convert(binary_byte: u8) -> BinaryDetection {
   91         BinaryDetection(line_buffer::BinaryDetection::Convert(binary_byte))
   92     }
   93 
   94     /// If this binary detection uses the "quit" strategy, then this returns
   95     /// the byte that will cause a search to quit. In any other case, this
   96     /// returns `None`.
   97     pub fn quit_byte(&self) -> Option<u8> {
   98         match self.0 {
   99             line_buffer::BinaryDetection::Quit(b) => Some(b),
  100             _ => None,
  101         }
  102     }
  103 
  104     /// If this binary detection uses the "convert" strategy, then this returns
  105     /// the byte that will be replaced by the line terminator. In any other
  106     /// case, this returns `None`.
  107     pub fn convert_byte(&self) -> Option<u8> {
  108         match self.0 {
  109             line_buffer::BinaryDetection::Convert(b) => Some(b),
  110             _ => None,
  111         }
  112     }
  113 }
  114 
/// An encoding to use when searching.
///
/// An encoding can be used to configure a
/// [`SearcherBuilder`](struct.SearcherBuilder.html)
/// to transcode source data from an encoding to UTF-8 before searching.
///
/// An `Encoding` will always be cheap to clone, since it only wraps a
/// `&'static` reference to an `encoding_rs::Encoding`.
#[derive(Clone, Debug)]
pub struct Encoding(&'static encoding_rs::Encoding);
  124 
  125 impl Encoding {
  126     /// Create a new encoding for the specified label.
  127     ///
  128     /// The encoding label provided is mapped to an encoding via the set of
  129     /// available choices specified in the
  130     /// [Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get).
  131     /// If the given label does not correspond to a valid encoding, then this
  132     /// returns an error.
  133     pub fn new(label: &str) -> Result<Encoding, ConfigError> {
  134         let label = label.as_bytes();
  135         match encoding_rs::Encoding::for_label_no_replacement(label) {
  136             Some(encoding) => Ok(Encoding(encoding)),
  137             None => {
  138                 Err(ConfigError::UnknownEncoding { label: label.to_vec() })
  139             }
  140         }
  141     }
  142 }
  143 
/// The internal configuration of a searcher. This is shared among several
/// search related types, but is only ever written to by the SearcherBuilder.
///
/// All fields are private; the defaults are defined by the `Default`
/// implementation for this type.
#[derive(Clone, Debug)]
pub struct Config {
    /// The line terminator to use.
    line_term: LineTerminator,
    /// Whether to invert matching.
    invert_match: bool,
    /// The number of lines after a match to include.
    after_context: usize,
    /// The number of lines before a match to include.
    before_context: usize,
    /// Whether to enable unbounded context or not.
    ///
    /// When this is set, `SearcherBuilder::build` resets both
    /// `before_context` and `after_context` to `0` in the built searcher.
    passthru: bool,
    /// Whether to count line numbers.
    line_number: bool,
    /// The maximum amount of heap memory to use.
    ///
    /// When not given, no explicit limit is enforced. When set to `0`, then
    /// only the memory map search strategy is available.
    heap_limit: Option<usize>,
    /// The memory map strategy.
    mmap: MmapChoice,
    /// The binary data detection strategy.
    binary: BinaryDetection,
    /// Whether to enable matching across multiple lines.
    multi_line: bool,
    /// An encoding that, when present, causes the searcher to transcode all
    /// input from the encoding to UTF-8.
    encoding: Option<Encoding>,
    /// Whether to do automatic transcoding based on a BOM or not.
    bom_sniffing: bool,
}
  177 
  178 impl Default for Config {
  179     fn default() -> Config {
  180         Config {
  181             line_term: LineTerminator::default(),
  182             invert_match: false,
  183             after_context: 0,
  184             before_context: 0,
  185             passthru: false,
  186             line_number: true,
  187             heap_limit: None,
  188             mmap: MmapChoice::default(),
  189             binary: BinaryDetection::default(),
  190             multi_line: false,
  191             encoding: None,
  192             bom_sniffing: true,
  193         }
  194     }
  195 }
  196 
  197 impl Config {
  198     /// Return the maximal amount of lines needed to fulfill this
  199     /// configuration's context.
  200     ///
  201     /// If this returns `0`, then no context is ever needed.
  202     fn max_context(&self) -> usize {
  203         cmp::max(self.before_context, self.after_context)
  204     }
  205 
  206     /// Build a line buffer from this configuration.
  207     fn line_buffer(&self) -> LineBuffer {
  208         let mut builder = LineBufferBuilder::new();
  209         builder
  210             .line_terminator(self.line_term.as_byte())
  211             .binary_detection(self.binary.0);
  212 
  213         if let Some(limit) = self.heap_limit {
  214             let (capacity, additional) =
  215                 if limit <= DEFAULT_BUFFER_CAPACITY {
  216                     (limit, 0)
  217                 } else {
  218                     (DEFAULT_BUFFER_CAPACITY, limit - DEFAULT_BUFFER_CAPACITY)
  219                 };
  220             builder
  221                 .capacity(capacity)
  222                 .buffer_alloc(BufferAllocation::Error(additional));
  223         }
  224         builder.build()
  225     }
  226 }
  227 
/// An error that can occur when building a searcher.
///
/// This error occurs when a nonsensical configuration is present when trying
/// to construct a `Searcher` from a `SearcherBuilder`. The `UnknownEncoding`
/// variant is also returned directly by `Encoding::new`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ConfigError {
    /// Indicates that the heap limit configuration prevents all possible
    /// search strategies from being used. For example, if the heap limit is
    /// set to 0 and memory map searching is disabled or unavailable.
    SearchUnavailable,
    /// Occurs when a matcher reports a line terminator that is different than
    /// the one configured in the searcher.
    MismatchedLineTerminators {
        /// The matcher's line terminator.
        matcher: LineTerminator,
        /// The searcher's line terminator.
        searcher: LineTerminator,
    },
    /// Occurs when no encoding could be found for a particular label.
    UnknownEncoding {
        /// The provided encoding label that could not be found, stored as
        /// the raw bytes of the label.
        label: Vec<u8>,
    },
    /// Hints that destructuring should not be exhaustive.
    ///
    /// This enum may grow additional variants, so this makes sure clients
    /// don't count on exhaustive matching. (Otherwise, adding a new variant
    /// could break existing code.)
    ///
    /// This variant is never meant to be constructed; formatting it with
    /// `Display` panics.
    #[doc(hidden)]
    __Nonexhaustive,
}
  259 
impl ::std::error::Error for ConfigError {
    /// Returns a fixed, generic description that is the same for every
    /// variant. The variant-specific message is produced by `Display`.
    fn description(&self) -> &str { "grep-searcher configuration error" }
}
  263 
  264 impl fmt::Display for ConfigError {
  265     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  266         match *self {
  267             ConfigError::SearchUnavailable => {
  268                 write!(f, "grep config error: no available searchers")
  269             }
  270             ConfigError::MismatchedLineTerminators { matcher, searcher } => {
  271                 write!(
  272                     f,
  273                     "grep config error: mismatched line terminators, \
  274                      matcher has {:?} but searcher has {:?}",
  275                     matcher,
  276                     searcher
  277                 )
  278             }
  279             ConfigError::UnknownEncoding { ref label } => {
  280                 write!(
  281                     f,
  282                     "grep config error: unknown encoding: {}",
  283                     String::from_utf8_lossy(label),
  284                 )
  285             }
  286             _ => panic!("BUG: unexpected variant found"),
  287         }
  288     }
  289 }
  290 
/// A builder for configuring a searcher.
///
/// A search builder permits specifying the configuration of a searcher,
/// including options like whether to invert the search or to enable multi
/// line search.
///
/// Once a searcher has been built, it is beneficial to reuse that searcher
/// for multiple searches, if possible.
#[derive(Clone, Debug)]
pub struct SearcherBuilder {
    /// The configuration accumulated by the builder's setter methods. It is
    /// cloned into each searcher produced by `build`.
    config: Config,
}
  303 
  304 impl Default for SearcherBuilder {
  305     fn default() -> SearcherBuilder {
  306         SearcherBuilder::new()
  307     }
  308 }
  309 
  310 impl SearcherBuilder {
  311     /// Create a new searcher builder with a default configuration.
  312     pub fn new() -> SearcherBuilder {
  313         SearcherBuilder {
  314             config: Config::default(),
  315         }
  316     }
  317 
  318     /// Build a searcher with the given matcher.
  319     pub fn build(&self) -> Searcher {
  320         let mut config = self.config.clone();
  321         if config.passthru {
  322             config.before_context = 0;
  323             config.after_context = 0;
  324         }
  325 
  326         let mut decode_builder = DecodeReaderBytesBuilder::new();
  327         decode_builder
  328             .encoding(self.config.encoding.as_ref().map(|e| e.0))
  329             .utf8_passthru(true)
  330             .strip_bom(self.config.bom_sniffing)
  331             .bom_override(true)
  332             .bom_sniffing(self.config.bom_sniffing);
  333 
  334         Searcher {
  335             config: config,
  336             decode_builder: decode_builder,
  337             decode_buffer: RefCell::new(vec![0; 8 * (1<<10)]),
  338             line_buffer: RefCell::new(self.config.line_buffer()),
  339             multi_line_buffer: RefCell::new(vec![]),
  340         }
  341     }
  342 
  343     /// Set the line terminator that is used by the searcher.
  344     ///
  345     /// When using a searcher, if the matcher provided has a line terminator
  346     /// set, then it must be the same as this one. If they aren't, building
  347     /// a searcher will return an error.
  348     ///
  349     /// By default, this is set to `b'\n'`.
  350     pub fn line_terminator(
  351         &mut self,
  352         line_term: LineTerminator,
  353     ) -> &mut SearcherBuilder {
  354         self.config.line_term = line_term;
  355         self
  356     }
  357 
  358     /// Whether to invert matching, whereby lines that don't match are reported
  359     /// instead of reporting lines that do match.
  360     ///
  361     /// By default, this is disabled.
  362     pub fn invert_match(&mut self, yes: bool) -> &mut SearcherBuilder {
  363         self.config.invert_match = yes;
  364         self
  365     }
  366 
  367     /// Whether to count and include line numbers with matching lines.
  368     ///
  369     /// This is enabled by default. There is a small performance penalty
  370     /// associated with computing line numbers, so this can be disabled when
  371     /// this isn't desirable.
  372     pub fn line_number(&mut self, yes: bool) -> &mut SearcherBuilder {
  373         self.config.line_number = yes;
  374         self
  375     }
  376 
  377     /// Whether to enable multi line search or not.
  378     ///
  379     /// When multi line search is enabled, matches *may* match across multiple
  380     /// lines. Conversely, when multi line search is disabled, it is impossible
  381     /// for any match to span more than one line.
  382     ///
  383     /// **Warning:** multi line search requires having the entire contents to
  384     /// search mapped in memory at once. When searching files, memory maps
  385     /// will be used if possible and if they are enabled, which avoids using
  386     /// your program's heap. However, if memory maps cannot be used (e.g.,
  387     /// for searching streams like `stdin` or if transcoding is necessary),
  388     /// then the entire contents of the stream are read on to the heap before
  389     /// starting the search.
  390     ///
  391     /// This is disabled by default.
  392     pub fn multi_line(&mut self, yes: bool) -> &mut SearcherBuilder {
  393         self.config.multi_line = yes;
  394         self
  395     }
  396 
  397     /// Whether to include a fixed number of lines after every match.
  398     ///
  399     /// When this is set to a non-zero number, then the searcher will report
  400     /// `line_count` contextual lines after every match.
  401     ///
  402     /// This is set to `0` by default.
  403     pub fn after_context(
  404         &mut self,
  405         line_count: usize,
  406     ) -> &mut SearcherBuilder {
  407         self.config.after_context = line_count;
  408         self
  409     }
  410 
  411     /// Whether to include a fixed number of lines before every match.
  412     ///
  413     /// When this is set to a non-zero number, then the searcher will report
  414     /// `line_count` contextual lines before every match.
  415     ///
  416     /// This is set to `0` by default.
  417     pub fn before_context(
  418         &mut self,
  419         line_count: usize,
  420     ) -> &mut SearcherBuilder {
  421         self.config.before_context = line_count;
  422         self
  423     }
  424 
  425     /// Whether to enable the "passthru" feature or not.
  426     ///
  427     /// When passthru is enabled, it effectively treats all non-matching lines
  428     /// as contextual lines. In other words, enabling this is akin to
  429     /// requesting an unbounded number of before and after contextual lines.
  430     ///
  431     /// When passthru mode is enabled, any `before_context` or `after_context`
  432     /// settings are ignored by setting them to `0`.
  433     ///
  434     /// This is disabled by default.
  435     pub fn passthru(&mut self, yes: bool) -> &mut SearcherBuilder {
  436         self.config.passthru = yes;
  437         self
  438     }
  439 
  440     /// Set an approximate limit on the amount of heap space used by a
  441     /// searcher.
  442     ///
  443     /// The heap limit is enforced in two scenarios:
  444     ///
  445     /// * When searching using a fixed size buffer, the heap limit controls
  446     ///   how big this buffer is allowed to be. Assuming contexts are disabled,
  447     ///   the minimum size of this buffer is the length (in bytes) of the
  448     ///   largest single line in the contents being searched. If any line
  449     ///   exceeds the heap limit, then an error will be returned.
  450     /// * When performing a multi line search, a fixed size buffer cannot be
  451     ///   used. Thus, the only choices are to read the entire contents on to
  452     ///   the heap, or use memory maps. In the former case, the heap limit set
  453     ///   here is enforced.
  454     ///
  455     /// If a heap limit is set to `0`, then no heap space is used. If there are
  456     /// no alternative strategies available for searching without heap space
  457     /// (e.g., memory maps are disabled), then the searcher wil return an error
  458     /// immediately.
  459     ///
  460     /// By default, no limit is set.
  461     pub fn heap_limit(
  462         &mut self,
  463         bytes: Option<usize>,
  464     ) -> &mut SearcherBuilder {
  465         self.config.heap_limit = bytes;
  466         self
  467     }
  468 
  469     /// Set the strategy to employ use of memory maps.
  470     ///
  471     /// Currently, there are only two strategies that can be employed:
  472     ///
  473     /// * **Automatic** - A searcher will use heuristics, including but not
  474     ///   limited to file size and platform, to determine whether to use memory
  475     ///   maps or not.
  476     /// * **Never** - Memory maps will never be used. If multi line search is
  477     ///   enabled, then the entire contents will be read on to the heap before
  478     ///   searching begins.
  479     ///
  480     /// The default behavior is **never**. Generally speaking, and perhaps
  481     /// against conventional wisdom, memory maps don't necessarily enable
  482     /// faster searching. For example, depending on the platform, using memory
  483     /// maps while searching a large directory can actually be quite a bit
  484     /// slower than using normal read calls because of the overhead of managing
  485     /// the memory maps.
  486     ///
  487     /// Memory maps can be faster in some cases however. On some platforms,
  488     /// when searching a very large file that *is already in memory*, it can
  489     /// be slightly faster to search it as a memory map instead of using
  490     /// normal read calls.
  491     ///
  492     /// Finally, memory maps have a somewhat complicated safety story in Rust.
  493     /// If you aren't sure whether enabling memory maps is worth it, then just
  494     /// don't bother with it.
  495     ///
  496     /// **WARNING**: If your process is searching a file backed memory map
  497     /// at the same time that file is truncated, then it's possible for the
  498     /// process to terminate with a bus error.
  499     pub fn memory_map(
  500         &mut self,
  501         strategy: MmapChoice,
  502     ) -> &mut SearcherBuilder {
  503         self.config.mmap = strategy;
  504         self
  505     }
  506 
  507     /// Set the binary detection strategy.
  508     ///
  509     /// The binary detection strategy determines not only how the searcher
  510     /// detects binary data, but how it responds to the presence of binary
  511     /// data. See the [`BinaryDetection`](struct.BinaryDetection.html) type
  512     /// for more information.
  513     ///
  514     /// By default, binary detection is disabled.
  515     pub fn binary_detection(
  516         &mut self,
  517         detection: BinaryDetection,
  518     ) -> &mut SearcherBuilder {
  519         self.config.binary = detection;
  520         self
  521     }
  522 
  523     /// Set the encoding used to read the source data before searching.
  524     ///
  525     /// When an encoding is provided, then the source data is _unconditionally_
  526     /// transcoded using the encoding, unless a BOM is present. If a BOM is
  527     /// present, then the encoding indicated by the BOM is used instead. If the
  528     /// transcoding process encounters an error, then bytes are replaced with
  529     /// the Unicode replacement codepoint.
  530     ///
  531     /// When no encoding is specified (the default), then BOM sniffing is
  532     /// used (if it's enabled, which it is, by default) to determine whether
  533     /// the source data is UTF-8 or UTF-16, and transcoding will be performed
  534     /// automatically. If no BOM could be found, then the source data is
  535     /// searched _as if_ it were UTF-8. However, so long as the source data is
  536     /// at least ASCII compatible, then it is possible for a search to produce
  537     /// useful results.
  538     pub fn encoding(
  539         &mut self,
  540         encoding: Option<Encoding>,
  541     ) -> &mut SearcherBuilder {
  542         self.config.encoding = encoding;
  543         self
  544     }
  545 
  546     /// Enable automatic transcoding based on BOM sniffing.
  547     ///
  548     /// When this is enabled and an explicit encoding is not set, then this
  549     /// searcher will try to detect the encoding of the bytes being searched
  550     /// by sniffing its byte-order mark (BOM). In particular, when this is
  551     /// enabled, UTF-16 encoded files will be searched seamlessly.
  552     ///
  553     /// When this is disabled and if an explicit encoding is not set, then
  554     /// the bytes from the source stream will be passed through unchanged,
  555     /// including its BOM, if one is present.
  556     ///
  557     /// This is enabled by default.
  558     pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder {
  559         self.config.bom_sniffing = yes;
  560         self
  561     }
  562 }
  563 
/// A searcher executes searches over a haystack and writes results to a caller
/// provided sink.
///
/// Matches are detected via implementations of the `Matcher` trait, which must
/// be provided by the caller when executing a search.
///
/// When possible, a searcher should be reused.
#[derive(Clone, Debug)]
pub struct Searcher {
    /// The configuration for this searcher.
    ///
    /// We make most of these settings available to users of `Searcher` via
    /// public API methods, which can be queried in implementations of `Sink`
    /// if necessary.
    config: Config,
    /// A builder for constructing a streaming reader that transcodes source
    /// data according to either an explicitly specified encoding or via an
    /// automatically detected encoding via BOM sniffing.
    ///
    /// When no transcoding is needed, then the transcoder built will pass
    /// through the underlying bytes with no additional overhead.
    decode_builder: DecodeReaderBytesBuilder,
    /// A buffer that is used for transcoding scratch space.
    ///
    /// `SearcherBuilder::build` allocates this as 8 KiB of zeroed bytes.
    decode_buffer: RefCell<Vec<u8>>,
    /// A line buffer for use in line oriented searching.
    ///
    /// We wrap it in a RefCell to permit lending out borrows of `Searcher`
    /// to sinks. We still require a mutable borrow to execute a search, so
    /// we statically prevent callers from causing RefCell to panic at runtime
    /// due to a borrowing violation.
    line_buffer: RefCell<LineBuffer>,
    /// A buffer in which to store the contents of a reader when performing a
    /// multi line search. In particular, multi line searches cannot be
    /// performed incrementally, and need the entire haystack in memory at
    /// once.
    ///
    /// This starts out empty when the searcher is built.
    multi_line_buffer: RefCell<Vec<u8>>,
}
  601 
  602 impl Searcher {
  603     /// Create a new searcher with a default configuration.
  604     ///
  605     /// To configure the searcher (e.g., invert matching, enable memory maps,
  606     /// enable contexts, etc.), use the
  607     /// [`SearcherBuilder`](struct.SearcherBuilder.html).
  608     pub fn new() -> Searcher {
  609         SearcherBuilder::new().build()
  610     }
  611 
  612     /// Execute a search over the file with the given path and write the
  613     /// results to the given sink.
  614     ///
  615     /// If memory maps are enabled and the searcher heuristically believes
  616     /// memory maps will help the search run faster, then this will use
  617     /// memory maps. For this reason, callers should prefer using this method
  618     /// or `search_file` over the more generic `search_reader` when possible.
  619     pub fn search_path<P, M, S>(
  620         &mut self,
  621         matcher: M,
  622         path: P,
  623         write_to: S,
  624     ) -> Result<(), S::Error>
  625     where P: AsRef<Path>,
  626           M: Matcher,
  627           S: Sink,
  628     {
  629         let path = path.as_ref();
  630         let file = File::open(path).map_err(S::Error::error_io)?;
  631         self.search_file_maybe_path(matcher, Some(path), &file, write_to)
  632     }
  633 
    /// Execute a search over a file and write the results to the given sink.
    ///
    /// If memory maps are enabled and the searcher heuristically believes
    /// memory maps will help the search run faster, then this will use
    /// memory maps. For this reason, callers should prefer using this method
    /// or `search_path` over the more generic `search_reader` when possible.
    ///
    /// This delegates to the same internal routine as `search_path`, except
    /// that no file path is supplied to it.
    pub fn search_file<M, S>(
        &mut self,
        matcher: M,
        file: &File,
        write_to: S,
    ) -> Result<(), S::Error>
    where M: Matcher,
          S: Sink,
    {
        self.search_file_maybe_path(matcher, None, file, write_to)
    }
  651 
    /// Shared implementation behind `search_path` and `search_file`.
    ///
    /// `path` is optional. It is passed to the memory map opener and is
    /// included in trace log messages.
    ///
    /// The strategy is chosen in this order:
    ///
    /// 1. If the memory map configuration yields a map for the file, search
    ///    the mapped bytes via `search_slice`.
    /// 2. Otherwise, if this is a multi line search, read the file into the
    ///    multi line buffer and search that buffer.
    /// 3. Otherwise, fall back to `search_reader` over the file.
    fn search_file_maybe_path<M, S>(
        &mut self,
        matcher: M,
        path: Option<&Path>,
        file: &File,
        write_to: S,
    ) -> Result<(), S::Error>
    where M: Matcher,
          S: Sink,
    {
        if let Some(mmap) = self.config.mmap.open(file, path) {
            trace!("{:?}: searching via memory map", path);
            return self.search_slice(matcher, &mmap, write_to);
        }
        // Fast path for multi-line searches of files when memory maps are
        // not enabled. This pre-allocates a buffer roughly the size of the
        // file, which isn't possible when searching an arbitrary io::Read.
        //
        // NOTE(review): unlike `search_reader` and `search_slice`, this
        // branch does not call `check_config` itself. Confirm that this is
        // intended.
        if self.multi_line_with_matcher(&matcher) {
            trace!("{:?}: reading entire file on to heap for mulitline", path);
            self.fill_multi_line_buffer_from_file::<S>(file)?;
            trace!("{:?}: searching via multiline strategy", path);
            MultiLine::new(
                self,
                matcher,
                &*self.multi_line_buffer.borrow(),
                write_to,
            ).run()
        } else {
            trace!("{:?}: searching using generic reader", path);
            self.search_reader(matcher, file, write_to)
        }
    }
  684 
    /// Execute a search over any implementation of `io::Read` and write the
    /// results to the given sink.
    ///
    /// When possible, this implementation will search the reader incrementally
    /// without reading it into memory. In some cases---for example, if multi
    /// line search is enabled---an incremental search isn't possible and the
    /// given reader is consumed completely and placed on the heap before
    /// searching begins. For this reason, when multi line search is enabled,
    /// one should try to use higher level APIs (e.g., searching by file or
    /// file path) so that memory maps can be used if they are available and
    /// enabled.
    ///
    /// # Errors
    ///
    /// A configuration error reported by `check_config` is converted with
    /// `S::Error::error_config`. A failure to build the decoding reader is
    /// converted with `S::Error::error_io`. Errors from the search itself
    /// are returned as-is.
    pub fn search_reader<M, R, S>(
        &mut self,
        matcher: M,
        read_from: R,
        write_to: S,
    ) -> Result<(), S::Error>
    where M: Matcher,
          R: io::Read,
          S: Sink,
    {
        // Validate the configuration against the matcher before doing any
        // work.
        self.check_config(&matcher).map_err(S::Error::error_config)?;

        // Wrap the reader in the decoder produced by `decode_builder`, using
        // the searcher's decode buffer as its scratch space.
        let mut decode_buffer = self.decode_buffer.borrow_mut();
        let read_from = self.decode_builder
            .build_with_buffer(read_from, &mut *decode_buffer)
            .map_err(S::Error::error_io)?;

        if self.multi_line_with_matcher(&matcher) {
            // Multi line search: read the whole (decoded) stream into the
            // multi line buffer first, then search that buffer.
            trace!("generic reader: reading everything to heap for multiline");
            self.fill_multi_line_buffer_from_reader::<_, S>(read_from)?;
            trace!("generic reader: searching via multiline strategy");
            MultiLine::new(
                self,
                matcher,
                &*self.multi_line_buffer.borrow(),
                write_to,
            ).run()
        } else {
            // Line oriented search: read through the searcher's line buffer.
            let mut line_buffer = self.line_buffer.borrow_mut();
            let rdr = LineBufferReader::new(read_from, &mut *line_buffer);
            trace!("generic reader: searching via roll buffer strategy");
            ReadByLine::new(self, matcher, rdr, write_to).run()
        }
    }
  730 
  731     /// Execute a search over the given slice and write the results to the
  732     /// given sink.
  733     pub fn search_slice<M, S>(
  734         &mut self,
  735         matcher: M,
  736         slice: &[u8],
  737         write_to: S,
  738     ) -> Result<(), S::Error>
  739     where M: Matcher,
  740           S: Sink,
  741     {
  742         self.check_config(&matcher).map_err(S::Error::error_config)?;
  743 
  744         // We can search the slice directly, unless we need to do transcoding.
  745         if self.slice_needs_transcoding(slice) {
  746             trace!("slice reader: needs transcoding, using generic reader");
  747             return self.search_reader(matcher, slice, write_to);
  748         }
  749         if self.multi_line_with_matcher(&matcher) {
  750             trace!("slice reader: searching via multiline strategy");
  751             MultiLine::new(self, matcher, slice, write_to).run()
  752         } else {
  753             trace!("slice reader: searching via slice-by-line strategy");
  754             SliceByLine::new(self, matcher, slice, write_to).run()
  755         }
  756     }
  757 
  758     /// Set the binary detection method used on this searcher.
  759     pub fn set_binary_detection(&mut self, detection: BinaryDetection) {
  760         self.config.binary = detection.clone();
  761         self.line_buffer.borrow_mut().set_binary_detection(detection.0);
  762     }
  763 
  764     /// Check that the searcher's configuration and the matcher are consistent
  765     /// with each other.
  766     fn check_config<M: Matcher>(&self, matcher: M) -> Result<(), ConfigError> {
  767         if self.config.heap_limit == Some(0)
  768             && !self.config.mmap.is_enabled()
  769         {
  770             return Err(ConfigError::SearchUnavailable);
  771         }
  772         let matcher_line_term = match matcher.line_terminator() {
  773             None => return Ok(()),
  774             Some(line_term) => line_term,
  775         };
  776         if matcher_line_term != self.config.line_term {
  777             return Err(ConfigError::MismatchedLineTerminators {
  778                 matcher: matcher_line_term,
  779                 searcher: self.config.line_term,
  780             });
  781         }
  782         Ok(())
  783     }
  784 
  785     /// Returns true if and only if the given slice needs to be transcoded.
  786     fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
  787         self.config.encoding.is_some()
  788         || (self.config.bom_sniffing && slice_has_utf16_bom(slice))
  789     }
  790 }
  791 
  792 /// The following methods permit querying the configuration of a searcher.
  793 /// These can be useful in generic implementations of
  794 /// [`Sink`](trait.Sink.html),
  795 /// where the output may be tailored based on how the searcher is configured.
  796 impl Searcher {
  797     /// Returns the line terminator used by this searcher.
  798     #[inline]
  799     pub fn line_terminator(&self) -> LineTerminator {
  800         self.config.line_term
  801     }
  802 
  803     /// Returns the type of binary detection configured on this searcher.
  804     #[inline]
  805     pub fn binary_detection(&self) -> &BinaryDetection {
  806         &self.config.binary
  807     }
  808 
  809     /// Returns true if and only if this searcher is configured to invert its
  810     /// search results. That is, matching lines are lines that do **not** match
  811     /// the searcher's matcher.
  812     #[inline]
  813     pub fn invert_match(&self) -> bool {
  814         self.config.invert_match
  815     }
  816 
  817     /// Returns true if and only if this searcher is configured to count line
  818     /// numbers.
  819     #[inline]
  820     pub fn line_number(&self) -> bool {
  821         self.config.line_number
  822     }
  823 
  824     /// Returns true if and only if this searcher is configured to perform
  825     /// multi line search.
  826     #[inline]
  827     pub fn multi_line(&self) -> bool {
  828         self.config.multi_line
  829     }
  830 
  831     /// Returns true if and only if this searcher will choose a multi-line
  832     /// strategy given the provided matcher.
  833     ///
  834     /// This may diverge from the result of `multi_line` in cases where the
  835     /// searcher has been configured to execute a search that can report
  836     /// matches over multiple lines, but where the matcher guarantees that it
  837     /// will never produce a match over multiple lines.
  838     pub fn multi_line_with_matcher<M: Matcher>(&self, matcher: M) -> bool {
  839         if !self.multi_line() {
  840             return false;
  841         }
  842         if let Some(line_term) = matcher.line_terminator() {
  843             if line_term == self.line_terminator() {
  844                 return false;
  845             }
  846         }
  847         if let Some(non_matching) = matcher.non_matching_bytes() {
  848             // If the line terminator is CRLF, we don't actually need to care
  849             // whether the regex can match `\r` or not. Namely, a `\r` is
  850             // neither necessary nor sufficient to terminate a line. A `\n` is
  851             // always required.
  852             if non_matching.contains(self.line_terminator().as_byte()) {
  853                 return false;
  854             }
  855         }
  856         true
  857     }
  858 
  859     /// Returns the number of "after" context lines to report. When context
  860     /// reporting is not enabled, this returns `0`.
  861     #[inline]
  862     pub fn after_context(&self) -> usize {
  863         self.config.after_context
  864     }
  865 
  866     /// Returns the number of "before" context lines to report. When context
  867     /// reporting is not enabled, this returns `0`.
  868     #[inline]
  869     pub fn before_context(&self) -> usize {
  870         self.config.before_context
  871     }
  872 
  873     /// Returns true if and only if the searcher has "passthru" mode enabled.
  874     #[inline]
  875     pub fn passthru(&self) -> bool {
  876         self.config.passthru
  877     }
  878 
  879     /// Fill the buffer for use with multi-line searching from the given file.
  880     /// This reads from the file until EOF or until an error occurs. If the
  881     /// contents exceed the configured heap limit, then an error is returned.
  882     fn fill_multi_line_buffer_from_file<S: Sink>(
  883         &self,
  884         file: &File,
  885     ) -> Result<(), S::Error> {
  886         assert!(self.config.multi_line);
  887 
  888         let mut decode_buffer = self.decode_buffer.borrow_mut();
  889         let mut read_from = self.decode_builder
  890             .build_with_buffer(file, &mut *decode_buffer)
  891             .map_err(S::Error::error_io)?;
  892 
  893         // If we don't have a heap limit, then we can defer to std's
  894         // read_to_end implementation. fill_multi_line_buffer_from_reader will
  895         // do this too, but since we have a File, we can be a bit smarter about
  896         // pre-allocating here.
  897         //
  898         // If we're transcoding, then our pre-allocation might not be exact,
  899         // but is probably still better than nothing.
  900         if self.config.heap_limit.is_none() {
  901             let mut buf = self.multi_line_buffer.borrow_mut();
  902             buf.clear();
  903             let cap = file
  904                 .metadata()
  905                 .map(|m| m.len() as usize + 1)
  906                 .unwrap_or(0);
  907             buf.reserve(cap);
  908             read_from.read_to_end(&mut *buf).map_err(S::Error::error_io)?;
  909             return Ok(());
  910         }
  911         self.fill_multi_line_buffer_from_reader::<_, S>(read_from)
  912     }
  913 
  914     /// Fill the buffer for use with multi-line searching from the given
  915     /// reader. This reads from the reader until EOF or until an error occurs.
  916     /// If the contents exceed the configured heap limit, then an error is
  917     /// returned.
  918     fn fill_multi_line_buffer_from_reader<R: io::Read, S: Sink>(
  919         &self,
  920         mut read_from: R,
  921     ) -> Result<(), S::Error> {
  922         assert!(self.config.multi_line);
  923 
  924         let mut buf = self.multi_line_buffer.borrow_mut();
  925         buf.clear();
  926 
  927         // If we don't have a heap limit, then we can defer to std's
  928         // read_to_end implementation...
  929         let heap_limit = match self.config.heap_limit {
  930             Some(heap_limit) => heap_limit,
  931             None => {
  932                 read_from.read_to_end(&mut *buf).map_err(S::Error::error_io)?;
  933                 return Ok(());
  934             }
  935         };
  936         if heap_limit == 0 {
  937             return Err(S::Error::error_io(alloc_error(heap_limit)));
  938         }
  939 
  940         // ... otherwise we need to roll our own. This is likely quite a bit
  941         // slower than what is optimal, but we avoid worry about memory safety
  942         // until there's a compelling reason to speed this up.
  943         buf.resize(cmp::min(DEFAULT_BUFFER_CAPACITY, heap_limit), 0);
  944         let mut pos = 0;
  945         loop {
  946             let nread = match read_from.read(&mut buf[pos..]) {
  947                 Ok(nread) => nread,
  948                 Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {
  949                     continue;
  950                 }
  951                 Err(err) => return Err(S::Error::error_io(err)),
  952             };
  953             if nread == 0 {
  954                 buf.resize(pos, 0);
  955                 return Ok(());
  956             }
  957 
  958             pos += nread;
  959             if buf[pos..].is_empty() {
  960                 let additional = heap_limit - buf.len();
  961                 if additional == 0 {
  962                     return Err(S::Error::error_io(alloc_error(heap_limit)));
  963                 }
  964                 let limit = buf.len() + additional;
  965                 let doubled = 2 * buf.len();
  966                 buf.resize(cmp::min(doubled, limit), 0);
  967             }
  968         }
  969     }
  970 }
  971 
  972 /// Returns true if and only if the given slice begins with a UTF-16 BOM.
  973 ///
  974 /// This is used by the searcher to determine if a transcoder is necessary.
  975 /// Otherwise, it is advantageous to search the slice directly.
  976 fn slice_has_utf16_bom(slice: &[u8]) -> bool {
  977     let enc = match encoding_rs::Encoding::for_bom(slice) {
  978         None => return false,
  979         Some((enc, _)) => enc,
  980     };
  981     [encoding_rs::UTF_16LE, encoding_rs::UTF_16BE].contains(&enc)
  982 }
  983 
  984 #[cfg(test)]
  985 mod tests {
  986     use testutil::{KitchenSink, RegexMatcher};
  987     use super::*;
  988 
  989     #[test]
  990     fn config_error_heap_limit() {
  991         let matcher = RegexMatcher::new("");
  992         let sink = KitchenSink::new();
  993         let mut searcher = SearcherBuilder::new()
  994             .heap_limit(Some(0))
  995             .build();
  996         let res = searcher.search_slice(matcher, &[], sink);
  997         assert!(res.is_err());
  998     }
  999 
 1000     #[test]
 1001     fn config_error_line_terminator() {
 1002         let mut matcher = RegexMatcher::new("");
 1003         matcher.set_line_term(Some(LineTerminator::byte(b'z')));
 1004 
 1005         let sink = KitchenSink::new();
 1006         let mut searcher = Searcher::new();
 1007         let res = searcher.search_slice(matcher, &[], sink);
 1008         assert!(res.is_err());
 1009     }
 1010 }