Skip to main content

regex_automata/util/prefilter/
aho_corasick.rs

1use crate::util::{
2    prefilter::PrefilterI,
3    search::{MatchKind, Span},
4};
5
6#[derive(Clone, Debug)]
7pub(crate) struct AhoCorasick {
8    #[cfg(not(feature = "perf-literal-multisubstring"))]
9    _unused: (),
10    #[cfg(feature = "perf-literal-multisubstring")]
11    ac: aho_corasick::AhoCorasick,
12}
13
14impl AhoCorasick {
15    pub(crate) fn new<B: AsRef<[u8]>>(
16        kind: MatchKind,
17        needles: &[B],
18    ) -> Option<AhoCorasick> {
19        #[cfg(not(feature = "perf-literal-multisubstring"))]
20        {
21            None
22        }
23        #[cfg(feature = "perf-literal-multisubstring")]
24        {
25            // We used to use `aho_corasick::MatchKind::Standard` here when
26            // `kind` was `MatchKind::All`, but this is not correct. The
27            // "standard" Aho-Corasick match semantics are to report a match
28            // immediately as soon as it is seen, but `All` isn't like that.
29            // In particular, with "standard" semantics, given the needles
30            // "abc" and "b" and the haystack "abc," it would report a match
31            // at offset 1 before a match at offset 0. This is never what we
32            // want in the context of the regex engine, regardless of whether
33            // we have leftmost-first or 'all' semantics. Namely, we always
34            // want the leftmost match.
35            let ac_match_kind = match kind {
36                MatchKind::LeftmostFirst | MatchKind::All => {
37                    aho_corasick::MatchKind::LeftmostFirst
38                }
39            };
40            // This is kind of just an arbitrary number, but basically, if we
41            // have a small enough set of literals, then we try to use the VERY
42            // memory hungry DFA. Otherwise, we wimp out and use an NFA. The
43            // upshot is that the NFA is quite lean and decently fast. Faster
44            // than a naive Aho-Corasick NFA anyway.
45            let ac_kind = if needles.len() <= 500 {
46                aho_corasick::AhoCorasickKind::DFA
47            } else {
48                aho_corasick::AhoCorasickKind::ContiguousNFA
49            };
50            let result = aho_corasick::AhoCorasick::builder()
51                .kind(Some(ac_kind))
52                .match_kind(ac_match_kind)
53                .start_kind(aho_corasick::StartKind::Both)
54                // We try to handle all of the prefilter cases in the super
55                // module, and only use Aho-Corasick for the actual automaton.
56                // The aho-corasick crate does have some extra prefilters,
57                // namely, looking for rare bytes to feed to memchr{,2,3}
58                // instead of just the first byte. If we end up wanting
59                // those---and they are somewhat tricky to implement---then
60                // we could port them to this crate.
61                //
62                // The main reason for doing things this way is so we have a
63                // complete and easy to understand picture of which prefilters
64                // are available and how they work. Otherwise it seems too
65                // easy to get into a situation where we have a prefilter
66                // layered on top of prefilter, and that might have unintended
67                // consequences.
68                .prefilter(false)
69                .build(needles);
70            let ac = match result {
71                Ok(ac) => ac,
72                Err(_err) => {
73                    debug!("aho-corasick prefilter failed to build: {_err}");
74                    return None;
75                }
76            };
77            Some(AhoCorasick { ac })
78        }
79    }
80}
81
82impl PrefilterI for AhoCorasick {
83    fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
84        #[cfg(not(feature = "perf-literal-multisubstring"))]
85        {
86            unreachable!()
87        }
88        #[cfg(feature = "perf-literal-multisubstring")]
89        {
90            let input =
91                aho_corasick::Input::new(haystack).span(span.start..span.end);
92            self.ac
93                .find(input)
94                .map(|m| Span { start: m.start(), end: m.end() })
95        }
96    }
97
98    fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
99        #[cfg(not(feature = "perf-literal-multisubstring"))]
100        {
101            unreachable!()
102        }
103        #[cfg(feature = "perf-literal-multisubstring")]
104        {
105            let input = aho_corasick::Input::new(haystack)
106                .anchored(aho_corasick::Anchored::Yes)
107                .span(span.start..span.end);
108            self.ac
109                .find(input)
110                .map(|m| Span { start: m.start(), end: m.end() })
111        }
112    }
113
114    fn memory_usage(&self) -> usize {
115        #[cfg(not(feature = "perf-literal-multisubstring"))]
116        {
117            unreachable!()
118        }
119        #[cfg(feature = "perf-literal-multisubstring")]
120        {
121            self.ac.memory_usage()
122        }
123    }
124
125    fn is_fast(&self) -> bool {
126        #[cfg(not(feature = "perf-literal-multisubstring"))]
127        {
128            unreachable!()
129        }
130        #[cfg(feature = "perf-literal-multisubstring")]
131        {
132            // Aho-Corasick is never considered "fast" because it's never
133            // going to be even close to an order of magnitude faster than the
134            // regex engine itself (assuming a DFA is used). In fact, it is
135            // usually slower. The magic of Aho-Corasick is that it can search
136            // a *large* number of literals with a relatively small amount of
137            // memory. The regex engines are far more wasteful.
138            //
139            // Aho-Corasick may be "fast" when the regex engine corresponds
140            // to, say, the PikeVM. That happens when the lazy DFA couldn't be
141            // built or used for some reason. But in these cases, the regex
142            // itself is likely quite big and we're probably hosed no matter
143            // what we do. (In this case, the best bet is for the caller to
144            // increase some of the memory limits on the hybrid cache capacity
145            // and hope that's enough.)
146            false
147        }
148    }
149}