teddy.rs | Explore in Territory

use crate::util::{
    prefilter::PrefilterI,
    search::{MatchKind, Span},
};

#[derive(Clone, Debug)]
pub(crate) struct Teddy {
    #[cfg(not(feature = "perf-literal-multisubstring"))]
    _unused: (),
    /// The actual Teddy searcher.
    ///
    /// Technically, it's possible that Teddy doesn't actually get used, since
    /// Teddy does require its haystack to at least be of a certain size
    /// (usually around the size of whatever vector is being used, so ~16
    /// or ~32 bytes). For haystacks shorter than that, the implementation
    /// currently uses Rabin-Karp.
    #[cfg(feature = "perf-literal-multisubstring")]
    searcher: aho_corasick::packed::Searcher,
    /// When running an anchored search, the packed searcher can't handle it so
    /// we defer to Aho-Corasick itself. Kind of sad, but changing the packed
    /// searchers to support anchored search would be difficult at worst and
    /// annoying at best. Since packed searchers only apply to small numbers of
    /// literals, we content ourselves that this is not much of an added cost.
    /// (That packed searchers only work with a small number of literals is
    /// also why we use a DFA here. Otherwise, the memory usage of a DFA would
    /// likely be unacceptable.)
    #[cfg(feature = "perf-literal-multisubstring")]
    anchored_ac: aho_corasick::dfa::DFA,
    /// The length of the smallest literal we look for.
    ///
    /// We use this as a heuristic to figure out whether this will be "fast" or
    /// not. Generally, the longer the better, because longer needles are more
    /// discriminating and thus reduce false positive rate.
    #[cfg(feature = "perf-literal-multisubstring")]
    minimum_len: usize,
}

impl Teddy {
    pub(crate) fn new<B: AsRef<[u8]>>(
        kind: MatchKind,
        needles: &[B],
    ) -> Option<Teddy> {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            None
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            // We only really support leftmost-first semantics. In
            // theory we could at least support leftmost-longest, as the
            // aho-corasick crate does, but regex-automata doesn't know about
            // leftmost-longest currently.
            //
            // And like the aho-corasick prefilter, if we're using `All`
            // semantics, then we can still use leftmost semantics for a
            // prefilter. (This might be a suspicious choice for the literal
            // engine, which uses a prefilter as a regex engine directly, but
            // that only happens when using leftmost-first semantics.)
            let (packed_match_kind, ac_match_kind) = match kind {
                MatchKind::LeftmostFirst | MatchKind::All => (
                    aho_corasick::packed::MatchKind::LeftmostFirst,
                    aho_corasick::MatchKind::LeftmostFirst,
                ),
            };
            let minimum_len =
                needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0);
            let packed = aho_corasick::packed::Config::new()
                .match_kind(packed_match_kind)
                .builder()
                .extend(needles)
                .build()?;
            let anchored_ac = aho_corasick::dfa::DFA::builder()
                .match_kind(ac_match_kind)
                .start_kind(aho_corasick::StartKind::Anchored)
                .prefilter(false)
                .build(needles)
                .ok()?;
            Some(Teddy { searcher: packed, anchored_ac, minimum_len })
        }
    }
}

impl PrefilterI for Teddy {
    fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            let ac_span =
                aho_corasick::Span { start: span.start, end: span.end };
            self.searcher
                .find_in(haystack, ac_span)
                .map(|m| Span { start: m.start(), end: m.end() })
        }
    }

    fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            use aho_corasick::automaton::Automaton;
            let input = aho_corasick::Input::new(haystack)
                .anchored(aho_corasick::Anchored::Yes)
                .span(span.start..span.end);
            self.anchored_ac
                .try_find(&input)
                // OK because we build the DFA with anchored support.
                .expect("aho-corasick DFA should never fail")
                .map(|m| Span { start: m.start(), end: m.end() })
        }
    }

    fn memory_usage(&self) -> usize {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            use aho_corasick::automaton::Automaton;
            self.searcher.memory_usage() + self.anchored_ac.memory_usage()
        }
    }

    fn is_fast(&self) -> bool {
        #[cfg(not(feature = "perf-literal-multisubstring"))]
        {
            unreachable!()
        }
        #[cfg(feature = "perf-literal-multisubstring")]
        {
            // Teddy is usually quite fast, but I have seen some cases where
            // a large number of literals can overwhelm it and make it not so
            // fast. We make an educated but conservative guess at a limit, at
            // which point, we're not so comfortable thinking Teddy is "fast."
            //
            // Well... this used to incorporate a "limit" on the *number*
            // of literals, but I have since changed it to a minimum on the
            // *smallest* literal. Namely, when there is a very small literal
            // (1 or 2 bytes), it is far more likely that it leads to a higher
            // false positive rate. (Although, of course, not always. For
            // example, 'zq' is likely to have a very low false positive rate.)
            // But when we have 3 bytes, we have a really good chance of being
            // quite discriminatory and thus fast.
            //
            // We may still want to add some kind of limit on the number of
            // literals here, but keep in mind that Teddy already has its own
            // somewhat small limit (64 at time of writing). The main issue
            // here is that if 'is_fast' is false, it opens the door for the
            // reverse inner optimization to kick in. We really only want to
            // resort to the reverse inner optimization if we absolutely must.
            self.minimum_len >= 3
        }
    }
}
chromium/third_party/rust/chromium_crates_io/vendor/regex-automata-0.4.7/src/util/prefilter/teddy.rs