chromium/third_party/rust/chromium_crates_io/vendor/regex-automata-0.4.7/src/util/prefilter/aho_corasick.rs

use crate::util::{
    prefilter::PrefilterI,
    search::{MatchKind, Span},
};

#[derive(Clone, Debug)]
pub(crate) struct AhoCorasick {
    #[cfg(not(feature = "perf-literal-multisubstring"))]
    _unused: (),
    #[cfg(feature = "perf-literal-multisubstring")]
    ac: aho_corasick::AhoCorasick,
}

impl AhoCorasick {
    pub(crate) fn new<B: AsRef<[u8]>>(
        kind: MatchKind,
        needles: &[B],
    ) -> Option<AhoCorasick> {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            None
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            // We used to use `aho_corasick::MatchKind::Standard` here when
            // `kind` was `MatchKind::All`, but this is not correct. The
            // "standard" Aho-Corasick match semantics are to report a match
            // immediately as soon as it is seen, but `All` isn't like that.
            // In particular, with "standard" semantics, given the needles
            // "abc" and "b" and the haystack "abc," it would report a match
            // at offset 1 before a match at offset 0. This is never what we
            // want in the context of the regex engine, regardless of whether
            // we have leftmost-first or 'all' semantics. Namely, we always
            // want the leftmost match.
            let ac_match_kind = match kind {
                MatchKind::LeftmostFirst | MatchKind::All => {
                    aho_corasick::MatchKind::LeftmostFirst
                }
            };
            // This is kind of just an arbitrary number, but basically, if we
            // have a small enough set of literals, then we try to use the VERY
            // memory hungry DFA. Otherwise, we whimp out and use an NFA. The
            // upshot is that the NFA is quite lean and decently fast. Faster
            // than a naive Aho-Corasick NFA anyway.
            let ac_kind = if needles.len() <= 500 {
                aho_corasick::AhoCorasickKind::DFA
            } else {
                aho_corasick::AhoCorasickKind::ContiguousNFA
            };
            let result = aho_corasick::AhoCorasick::builder()
                .kind(Some(ac_kind))
                .match_kind(ac_match_kind)
                .start_kind(aho_corasick::StartKind::Both)
                // We try to handle all of the prefilter cases in the super
                // module, and only use Aho-Corasick for the actual automaton.
                // The aho-corasick crate does have some extra prefilters,
                // namely, looking for rare bytes to feed to memchr{,2,3}
                // instead of just the first byte. If we end up wanting
                // those---and they are somewhat tricky to implement---then
                // we could port them to this crate.
                //
                // The main reason for doing things this way is so we have a
                // complete and easy to understand picture of which prefilters
                // are available and how they work. Otherwise it seems too
                // easy to get into a situation where we have a prefilter
                // layered on top of prefilter, and that might have unintended
                // consequences.
                .prefilter(false)
                .build(needles);
            let ac = match result {
                Ok(ac) => ac,
                Err(_err) => {
                    debug!("aho-corasick prefilter failed to build: {}", _err);
                    return None;
                }
            };
            Some(AhoCorasick { ac })
        }
    }
}

impl PrefilterI for AhoCorasick {
    fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            let input =
                aho_corasick::Input::new(haystack).span(span.start..span.end);
            self.ac
                .find(input)
                .map(|m| Span { start: m.start(), end: m.end() })
        }
    }

    fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            let input = aho_corasick::Input::new(haystack)
                .anchored(aho_corasick::Anchored::Yes)
                .span(span.start..span.end);
            self.ac
                .find(input)
                .map(|m| Span { start: m.start(), end: m.end() })
        }
    }

    fn memory_usage(&self) -> usize {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            self.ac.memory_usage()
        }
    }

    fn is_fast(&self) -> bool {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            // Aho-Corasick is never considered "fast" because it's never
            // going to be even close to an order of magnitude faster than the
            // regex engine itself (assuming a DFA is used). In fact, it is
            // usually slower. The magic of Aho-Corasick is that it can search
            // a *large* number of literals with a relatively small amount of
            // memory. The regex engines are far more wasteful.
            //
            // Aho-Corasick may be "fast" when the regex engine corresponds
            // to, say, the PikeVM. That happens when the lazy DFA couldn't be
            // built or used for some reason. But in these cases, the regex
            // itself is likely quite big and we're probably hosed no matter
            // what we do. (In this case, the best bet is for the caller to
            // increase some of the memory limits on the hybrid cache capacity
            // and hope that's enough.)
            false
        }
    }
}