regex_automata/hybrid/dfa.rs
1/*!
2Types and routines specific to lazy DFAs.
3
4This module is the home of [`hybrid::dfa::DFA`](DFA).
5
6This module also contains a [`hybrid::dfa::Builder`](Builder) and a
7[`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA.
8*/
9
10use core::{iter, mem::size_of};
11
12use alloc::vec::Vec;
13
14use crate::{
15 hybrid::{
16 error::{BuildError, CacheError, StartError},
17 id::{LazyStateID, LazyStateIDError},
18 search,
19 },
20 nfa::thompson,
21 util::{
22 alphabet::{self, ByteClasses, ByteSet},
23 determinize::{self, State, StateBuilderEmpty, StateBuilderNFA},
24 empty,
25 prefilter::Prefilter,
26 primitives::{PatternID, StateID as NFAStateID},
27 search::{
28 Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet,
29 },
30 sparse_set::SparseSets,
31 start::{self, Start, StartByteMap},
32 },
33};
34
35/// The minimum number of states that a lazy DFA's cache size must support.
36///
37/// This is checked at time of construction to ensure that at least some small
38/// number of states can fit in the given capacity allotment. If we can't fit
39/// at least this number of states, then the thinking is that it's pretty
40/// senseless to use the lazy DFA. More to the point, parts of the code do
41/// assume that the cache can fit at least some small number of states.
42const MIN_STATES: usize = SENTINEL_STATES + 2;
43
44/// The number of "sentinel" states that get added to every lazy DFA.
45///
46/// These are special states indicating status conditions of a search: unknown,
47/// dead and quit. These states in particular also use zero NFA states, so
48/// their memory usage is quite small. This is relevant for computing the
49/// minimum memory needed for a lazy DFA cache.
50const SENTINEL_STATES: usize = 3;
51
52/// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching.
53///
54/// A lazy DFA is a DFA that builds itself at search time. It otherwise has
55/// very similar characteristics to a [`dense::DFA`](crate::dfa::dense::DFA).
56/// Indeed, both support precisely the same regex features with precisely the
57/// same semantics.
58///
59/// Whereas a `dense::DFA` must be completely built to handle any input before
60/// it may be used for search, a lazy DFA starts off effectively empty. During
61/// a search, a lazy DFA will build itself depending on whether it has already
62/// computed the next transition or not. If it has, then it looks a lot like
63/// a `dense::DFA` internally: it does a very fast table based access to find
64/// the next transition. Otherwise, if the state hasn't been computed, then it
65/// does determinization _for that specific transition_ to compute the next DFA
66/// state.
67///
68/// The main selling point of a lazy DFA is that, in practice, it has
69/// the performance profile of a `dense::DFA` without the weakness of it
70/// taking worst case exponential time to build. Indeed, for each byte of
71/// input, the lazy DFA will construct at most one new DFA state. Thus, a
72/// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~
73/// pattern.len()` and `n ~ haystack.len()`).
74///
75/// The main downsides of a lazy DFA are:
76///
77/// 1. It requires mutable "cache" space during search. This is where the
78/// transition table, among other things, is stored.
79/// 2. In pathological cases (e.g., if the cache is too small), it will run
80/// out of room and either require a bigger cache capacity or will repeatedly
81/// clear the cache and thus repeatedly regenerate DFA states. Overall, this
82/// will tend to be slower than a typical NFA simulation. (Raising the cache
/// capacity, as sketched below, can help with this.)
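///
/// For (2), the cache capacity can be raised when the lazy DFA is built. A
/// brief sketch, assuming the `cache_capacity` option on [`Config`]:
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::builder()
///     // Set the total amount of space, in bytes, that the transition
///     // table cache is allowed to use.
///     .configure(DFA::config().cache_capacity(10 * (1 << 20)))
///     .build("foo[0-9]+")?;
/// let mut cache = dfa.create_cache();
///
/// let expected = Some(HalfMatch::must(0, 8));
/// assert_eq!(expected, dfa.try_search_fwd(
///     &mut cache, &Input::new("foo12345"))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```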
83///
84/// # Capabilities
85///
86/// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following
87/// operations:
88///
89/// 1. Detection of a match.
90/// 2. Location of the end of a match.
91/// 3. In the case of a lazy DFA with multiple patterns, which pattern matched
92/// is reported as well.
93///
94/// A notable absence from the above list of capabilities is the location of
95/// the *start* of a match. In order to provide both the start and end of
96/// a match, *two* lazy DFAs are required. This functionality is provided by a
97/// [`Regex`](crate::hybrid::regex::Regex).
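///
/// For instance, a brief sketch using the higher level
/// [`Regex`](crate::hybrid::regex::Regex) API, which pairs a forward and a
/// reverse lazy DFA in order to report both offsets:
///
/// ```
/// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re = Regex::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
///
/// // Both the start and the end of the match are available here.
/// let m = re.find(&mut cache, "zzzfoo12345zzz");
/// assert_eq!(Some(Match::must(0, 3..11)), m);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```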
98///
99/// # Example
100///
101/// This shows how to build a lazy DFA with the default configuration and
102/// execute a search. Notice how, in contrast to a `dense::DFA`, we must create
103/// a cache and pass it to our search routine.
104///
105/// ```
106/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
107///
108/// let dfa = DFA::new("foo[0-9]+")?;
109/// let mut cache = dfa.create_cache();
110///
111/// let expected = Some(HalfMatch::must(0, 8));
112/// assert_eq!(expected, dfa.try_search_fwd(
113/// &mut cache, &Input::new("foo12345"))?,
114/// );
115/// # Ok::<(), Box<dyn std::error::Error>>(())
116/// ```
117#[derive(Clone, Debug)]
118pub struct DFA {
119 config: Config,
120 nfa: thompson::NFA,
121 stride2: usize,
122 start_map: StartByteMap,
123 classes: ByteClasses,
124 quitset: ByteSet,
125 cache_capacity: usize,
126}
127
128impl DFA {
129 /// Parse the given regular expression using a default configuration and
130 /// return the corresponding lazy DFA.
131 ///
132 /// If you want a non-default configuration, then use the [`Builder`] to
133 /// set your own configuration.
134 ///
135 /// # Example
136 ///
137 /// ```
138 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
139 ///
140 /// let dfa = DFA::new("foo[0-9]+bar")?;
141 /// let mut cache = dfa.create_cache();
142 ///
143 /// let expected = HalfMatch::must(0, 11);
144 /// assert_eq!(
145 /// Some(expected),
146 /// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
147 /// );
148 /// # Ok::<(), Box<dyn std::error::Error>>(())
149 /// ```
150 #[cfg(feature = "syntax")]
151 pub fn new(pattern: &str) -> Result<DFA, BuildError> {
152 DFA::builder().build(pattern)
153 }
154
155 /// Parse the given regular expressions using a default configuration and
156 /// return the corresponding lazy multi-DFA.
157 ///
158 /// If you want a non-default configuration, then use the [`Builder`] to
159 /// set your own configuration.
160 ///
161 /// # Example
162 ///
163 /// ```
164 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
165 ///
166 /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?;
167 /// let mut cache = dfa.create_cache();
168 ///
169 /// let expected = HalfMatch::must(1, 3);
170 /// assert_eq!(
171 /// Some(expected),
172 /// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
173 /// );
174 /// # Ok::<(), Box<dyn std::error::Error>>(())
175 /// ```
176 #[cfg(feature = "syntax")]
177 pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
178 DFA::builder().build_many(patterns)
179 }
180
181 /// Create a new lazy DFA that matches every input.
182 ///
183 /// # Example
184 ///
185 /// ```
186 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
187 ///
188 /// let dfa = DFA::always_match()?;
189 /// let mut cache = dfa.create_cache();
190 ///
191 /// let expected = HalfMatch::must(0, 0);
192 /// assert_eq!(Some(expected), dfa.try_search_fwd(
193 /// &mut cache, &Input::new(""))?,
194 /// );
195 /// assert_eq!(Some(expected), dfa.try_search_fwd(
196 /// &mut cache, &Input::new("foo"))?,
197 /// );
198 /// # Ok::<(), Box<dyn std::error::Error>>(())
199 /// ```
200 pub fn always_match() -> Result<DFA, BuildError> {
201 let nfa = thompson::NFA::always_match();
202 Builder::new().build_from_nfa(nfa)
203 }
204
205 /// Create a new lazy DFA that never matches any input.
206 ///
207 /// # Example
208 ///
209 /// ```
210 /// use regex_automata::{hybrid::dfa::DFA, Input};
211 ///
212 /// let dfa = DFA::never_match()?;
213 /// let mut cache = dfa.create_cache();
214 ///
215 /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new(""))?);
216 /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new("foo"))?);
217 /// # Ok::<(), Box<dyn std::error::Error>>(())
218 /// ```
219 pub fn never_match() -> Result<DFA, BuildError> {
220 let nfa = thompson::NFA::never_match();
221 Builder::new().build_from_nfa(nfa)
222 }
223
224 /// Return a default configuration for a `DFA`.
225 ///
226 /// This is a convenience routine to avoid needing to import the [`Config`]
227 /// type when customizing the construction of a lazy DFA.
228 ///
229 /// # Example
230 ///
231 /// This example shows how to build a lazy DFA that heuristically supports
232 /// Unicode word boundaries.
233 ///
234 /// ```
235 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
236 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError, Input};
237 ///
238 /// let re = DFA::builder()
239 /// .configure(DFA::config().unicode_word_boundary(true))
240 /// .build(r"\b\w+\b")?;
241 /// let mut cache = re.create_cache();
242 ///
243 /// // Since our haystack is all ASCII, the DFA search sees that and knows
244 /// // it is legal to interpret Unicode word boundaries as ASCII word
245 /// // boundaries.
246 /// let input = Input::new("!!foo!!");
247 /// let expected = HalfMatch::must(0, 5);
248 /// assert_eq!(Some(expected), re.try_search_fwd(&mut cache, &input)?);
249 ///
250 /// // But if our haystack contains non-ASCII, then the search will fail
251 /// // with an error.
252 /// let input = Input::new("!!βββ!!");
253 /// let expected = MatchError::quit(b'\xCE', 2);
254 /// assert_eq!(Err(expected), re.try_search_fwd(&mut cache, &input));
255 ///
256 /// # Ok::<(), Box<dyn std::error::Error>>(())
257 /// ```
258 pub fn config() -> Config {
259 Config::new()
260 }
261
262 /// Return a builder for configuring the construction of a `DFA`.
263 ///
264 /// This is a convenience routine to avoid needing to import the
265 /// [`Builder`] type in common cases.
266 ///
267 /// # Example
268 ///
269 /// This example shows how to use the builder to disable UTF-8 mode
270 /// everywhere for lazy DFAs.
271 ///
272 /// ```
273 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
274 /// use regex_automata::{hybrid::dfa::DFA, util::syntax, HalfMatch, Input};
275 ///
276 /// let re = DFA::builder()
277 /// .syntax(syntax::Config::new().utf8(false))
278 /// .build(r"foo(?-u:[^b])ar.*")?;
279 /// let mut cache = re.create_cache();
280 ///
281 /// let input = Input::new(b"\xFEfoo\xFFarzz\xE2\x98\xFF\n");
282 /// let expected = Some(HalfMatch::must(0, 9));
283 /// let got = re.try_search_fwd(&mut cache, &input)?;
284 /// assert_eq!(expected, got);
285 ///
286 /// # Ok::<(), Box<dyn std::error::Error>>(())
287 /// ```
288 pub fn builder() -> Builder {
289 Builder::new()
290 }
291
292 /// Create a new cache for this lazy DFA.
293 ///
294 /// The cache returned should only be used for searches for this
295 /// lazy DFA. If you want to reuse the cache for another DFA, then
296 /// you must call [`Cache::reset`] with that DFA (or, equivalently,
297 /// [`DFA::reset_cache`]).
298 pub fn create_cache(&self) -> Cache {
299 Cache::new(self)
300 }
301
302 /// Reset the given cache such that it can be used for searching with
303 /// this lazy DFA (and only this DFA).
304 ///
305 /// A cache reset permits reusing memory already allocated in this cache
306 /// with a different lazy DFA.
307 ///
308 /// Resetting a cache sets its "clear count" to 0. This is relevant if the
309 /// lazy DFA has been configured to "give up" after it has cleared the
310 /// cache a certain number of times.
311 ///
312 /// Any lazy state ID generated by the cache prior to resetting it is
313 /// invalid after the reset.
314 ///
315 /// # Example
316 ///
317 /// This shows how to re-purpose a cache for use with a different DFA.
318 ///
319 /// ```
320 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
321 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
322 ///
323 /// let dfa1 = DFA::new(r"\w")?;
324 /// let dfa2 = DFA::new(r"\W")?;
325 ///
326 /// let mut cache = dfa1.create_cache();
327 /// assert_eq!(
328 /// Some(HalfMatch::must(0, 2)),
329 /// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
330 /// );
331 ///
332 /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
333 /// // incorrect results. In order to re-purpose the cache, we must reset
334 /// // it with the DFA we'd like to use it with.
335 /// //
336 /// // Similarly, after this reset, using the cache with 'dfa1' is also not
337 /// // allowed.
338 /// dfa2.reset_cache(&mut cache);
339 /// assert_eq!(
340 /// Some(HalfMatch::must(0, 3)),
341 /// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
342 /// );
343 ///
344 /// # Ok::<(), Box<dyn std::error::Error>>(())
345 /// ```
346 pub fn reset_cache(&self, cache: &mut Cache) {
347 Lazy::new(self, cache).reset_cache()
348 }
349
350 /// Returns the total number of patterns compiled into this lazy DFA.
351 ///
352 /// In the case of a DFA that contains no patterns, this returns `0`.
353 ///
354 /// # Example
355 ///
356 /// This example shows the pattern length for a DFA that never matches:
357 ///
358 /// ```
359 /// use regex_automata::hybrid::dfa::DFA;
360 ///
361 /// let dfa = DFA::never_match()?;
362 /// assert_eq!(dfa.pattern_len(), 0);
363 /// # Ok::<(), Box<dyn std::error::Error>>(())
364 /// ```
365 ///
366 /// And another example for a DFA that matches at every position:
367 ///
368 /// ```
369 /// use regex_automata::hybrid::dfa::DFA;
370 ///
371 /// let dfa = DFA::always_match()?;
372 /// assert_eq!(dfa.pattern_len(), 1);
373 /// # Ok::<(), Box<dyn std::error::Error>>(())
374 /// ```
375 ///
376 /// And finally, a DFA that was constructed from multiple patterns:
377 ///
378 /// ```
379 /// use regex_automata::hybrid::dfa::DFA;
380 ///
381 /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
382 /// assert_eq!(dfa.pattern_len(), 3);
383 /// # Ok::<(), Box<dyn std::error::Error>>(())
384 /// ```
385 pub fn pattern_len(&self) -> usize {
386 self.nfa.pattern_len()
387 }
388
389 /// Returns the equivalence classes that make up the alphabet for this DFA.
390 ///
391 /// Unless [`Config::byte_classes`] was disabled, it is possible that
392 /// multiple distinct bytes are grouped into the same equivalence class
393 /// if it is impossible for them to discriminate between a match and a
394 /// non-match. This has the effect of reducing the overall alphabet size
395 /// and in turn potentially substantially reducing the size of the DFA's
396 /// transition table.
397 ///
398 /// The downside of using equivalence classes like this is that every state
399 /// transition will automatically use this map to convert an arbitrary
400 /// byte to its corresponding equivalence class. In practice this has a
401 /// negligible impact on performance.
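 ///
 /// # Example
 ///
 /// A brief sketch of inspecting the classes. The exact grouping is an
 /// implementation detail, but bytes that the pattern can never
 /// distinguish, like `a` and `b` below, are expected to share a class.
 ///
 /// ```
 /// use regex_automata::hybrid::dfa::DFA;
 ///
 /// let dfa = DFA::new("[a-z]+")?;
 /// let classes = dfa.byte_classes();
 ///
 /// // The alphabet includes a special end-of-input sentinel, so its length
 /// // is at most 257 (and usually far smaller than that).
 /// assert!(classes.alphabet_len() <= 257);
 /// // 'a' and 'b' behave identically for this pattern, so they belong to
 /// // the same equivalence class.
 /// assert_eq!(classes.get(b'a'), classes.get(b'b'));
 /// # Ok::<(), Box<dyn std::error::Error>>(())
 /// ```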
402 pub fn byte_classes(&self) -> &ByteClasses {
403 &self.classes
404 }
405
406 /// Returns this lazy DFA's configuration.
407 pub fn get_config(&self) -> &Config {
408 &self.config
409 }
410
411 /// Returns a reference to the underlying NFA.
412 pub fn get_nfa(&self) -> &thompson::NFA {
413 &self.nfa
414 }
415
416 /// Returns the stride, as a base-2 exponent, required for these
417 /// equivalence classes.
418 ///
419 /// The stride is always the smallest power of 2 that is greater than or
420 /// equal to the alphabet length. This is done so that converting between
421 /// state IDs and indices can be done with shifts alone, which is much
422/// faster than integer division. (For example, an alphabet length of 5 is
/// rounded up to a stride of 8, i.e., a `stride2` of 3.)
423 fn stride2(&self) -> usize {
424 self.stride2
425 }
426
427 /// Returns the total stride for every state in this lazy DFA. This
428 /// corresponds to the total number of transitions used by each state in
429 /// this DFA's transition table.
430 fn stride(&self) -> usize {
431 1 << self.stride2()
432 }
433
434 /// Returns the memory usage, in bytes, of this lazy DFA.
435 ///
436 /// This does **not** include the stack size used up by this lazy DFA. To
437 /// compute that, use `std::mem::size_of::<DFA>()`. This also does not
438 /// include the size of the `Cache` used.
439 ///
440 /// This also does not include any heap memory used by the NFA inside of
441 /// this hybrid NFA/DFA. This is because the NFA's ownership is shared, and
442 /// thus not owned by this hybrid NFA/DFA. More practically, several regex
443 /// engines in this crate embed an NFA, and reporting the NFA's memory
444 /// usage in all of them would likely result in reporting higher heap
445 /// memory than is actually used.
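 ///
 /// # Example
 ///
 /// A brief sketch of how the accounting works out in practice, assuming a
 /// corresponding `memory_usage` method on [`Cache`]:
 ///
 /// ```
 /// use regex_automata::hybrid::dfa::DFA;
 ///
 /// let dfa = DFA::new("foo[0-9]+")?;
 /// let cache = dfa.create_cache();
 ///
 /// // The lazy DFA itself reports no heap usage of its own, since its
 /// // transition table lives in the cache and its NFA is shared.
 /// assert_eq!(0, dfa.memory_usage());
 /// // The cache is where the heap memory actually goes.
 /// assert!(cache.memory_usage() > 0);
 /// # Ok::<(), Box<dyn std::error::Error>>(())
 /// ```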
446 pub fn memory_usage(&self) -> usize {
447 // The only thing that uses heap memory in a DFA is the NFA. But the
448 // NFA has shared ownership, so reporting its memory as part of the
449 // hybrid DFA is likely to lead to double-counting the NFA memory
450 // somehow. In particular, this DFA does not really own an NFA, so
451 // including it in the DFA's memory usage doesn't seem semantically
452 // correct.
453 0
454 }
455}
456
457impl DFA {
458 /// Executes a forward search and returns the end position of the leftmost
459 /// match that is found. If no match exists, then `None` is returned.
460 ///
461 /// In particular, this method continues searching even after it enters
462 /// a match state. The search only terminates once it has reached the
463 /// end of the input or when it has entered a dead or quit state. Upon
464 /// termination, the position of the last byte seen while still in a match
465 /// state is returned.
466 ///
467 /// # Errors
468 ///
469 /// This routine errors if the search could not complete. This can occur
470 /// in a number of circumstances:
471 ///
472 /// * The configuration of the lazy DFA may permit it to "quit" the search.
473 /// For example, setting quit bytes or enabling heuristic support for
474 /// Unicode word boundaries. The default configuration does not enable any
475 /// option that could result in the lazy DFA quitting.
476 /// * The configuration of the lazy DFA may also permit it to "give up"
477 /// on a search if it makes ineffective use of its transition table
478 /// cache. The default configuration does not enable this, although it is
479 /// typically a good idea to do so.
480 /// * When the provided `Input` configuration is not supported. For
481 /// example, by providing an unsupported anchor mode.
482 ///
483 /// When a search returns an error, callers cannot know whether a match
484 /// exists or not.
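 ///
 /// For instance, the "give up" behavior mentioned above is opt-in. A brief
 /// sketch of enabling it, assuming the `minimum_cache_clear_count` option
 /// on [`Config`]:
 ///
 /// ```
 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
 ///
 /// let dfa = DFA::builder()
 ///     // Permit the lazy DFA to quit with an error if it has to clear
 ///     // its cache too many times.
 ///     .configure(DFA::config().minimum_cache_clear_count(Some(5)))
 ///     .build("foo[0-9]+")?;
 /// let mut cache = dfa.create_cache();
 ///
 /// // A small regex like this never needs to clear its cache, so the
 /// // search completes as usual.
 /// let expected = Some(HalfMatch::must(0, 8));
 /// assert_eq!(expected, dfa.try_search_fwd(
 ///     &mut cache, &Input::new("foo12345"))?,
 /// );
 /// # Ok::<(), Box<dyn std::error::Error>>(())
 /// ```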
485 ///
486 /// # Example
487 ///
488 /// This example shows how to run a basic search.
489 ///
490 /// ```
491 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
492 ///
493 /// let dfa = DFA::new("foo[0-9]+")?;
494 /// let mut cache = dfa.create_cache();
495 /// let expected = HalfMatch::must(0, 8);
496 /// assert_eq!(Some(expected), dfa.try_search_fwd(
497 /// &mut cache, &Input::new("foo12345"))?,
498 /// );
499 ///
500 /// // Even though a match is found after reading the first byte (`a`),
501 /// // the leftmost first match semantics demand that we find the earliest
502 /// // match that prefers earlier parts of the pattern over later parts.
503 /// let dfa = DFA::new("abc|a")?;
504 /// let mut cache = dfa.create_cache();
505 /// let expected = HalfMatch::must(0, 3);
506 /// assert_eq!(Some(expected), dfa.try_search_fwd(
507 /// &mut cache, &Input::new("abc"))?,
508 /// );
509 ///
510 /// # Ok::<(), Box<dyn std::error::Error>>(())
511 /// ```
512 ///
513 /// # Example: specific pattern search
514 ///
515 /// This example shows how to build a lazy multi-DFA that permits searching
516 /// for specific patterns.
517 ///
518 /// ```
519 /// use regex_automata::{
520 /// hybrid::dfa::DFA,
521 /// Anchored, HalfMatch, PatternID, Input,
522 /// };
523 ///
524 /// let dfa = DFA::builder()
525 /// .configure(DFA::config().starts_for_each_pattern(true))
526 /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
527 /// let mut cache = dfa.create_cache();
528 /// let haystack = "foo123";
529 ///
530 /// // Since we are using the default leftmost-first match and both
531 /// // patterns match at the same starting position, only the first pattern
532 /// // will be returned in this case when doing a search for any of the
533 /// // patterns.
534 /// let expected = Some(HalfMatch::must(0, 6));
535 /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
536 /// assert_eq!(expected, got);
537 ///
538 /// // But if we want to check whether some other pattern matches, then we
539 /// // can provide its pattern ID.
540 /// let expected = Some(HalfMatch::must(1, 6));
541 /// let input = Input::new(haystack)
542 /// .anchored(Anchored::Pattern(PatternID::must(1)));
543 /// let got = dfa.try_search_fwd(&mut cache, &input)?;
544 /// assert_eq!(expected, got);
545 ///
546 /// # Ok::<(), Box<dyn std::error::Error>>(())
547 /// ```
548 ///
549 /// # Example: specifying the bounds of a search
550 ///
551 /// This example shows how providing the bounds of a search can produce
552 /// different results than simply sub-slicing the haystack.
553 ///
554 /// ```
555 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
556 ///
557 /// // N.B. We disable Unicode here so that we use a simple ASCII word
558 /// // boundary. Alternatively, we could enable heuristic support for
559 /// // Unicode word boundaries since our haystack is pure ASCII.
560 /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?;
561 /// let mut cache = dfa.create_cache();
562 /// let haystack = "foo123bar";
563 ///
564 /// // Since we sub-slice the haystack, the search doesn't know about the
565 /// // larger context and assumes that `123` is surrounded by word
566 /// // boundaries. And of course, the match position is reported relative
567 /// // to the sub-slice as well, which means we get `3` instead of `6`.
568 /// let expected = Some(HalfMatch::must(0, 3));
569 /// let got = dfa.try_search_fwd(
570 /// &mut cache,
571 /// &Input::new(&haystack[3..6]),
572 /// )?;
573 /// assert_eq!(expected, got);
574 ///
575 /// // But if we provide the bounds of the search within the context of the
576 /// // entire haystack, then the search can take the surrounding context
577 /// // into account. (And if we did find a match, it would be reported
578 /// // as a valid offset into `haystack` instead of its sub-slice.)
579 /// let expected = None;
580 /// let got = dfa.try_search_fwd(
581 /// &mut cache,
582 /// &Input::new(haystack).range(3..6),
583 /// )?;
584 /// assert_eq!(expected, got);
585 ///
586 /// # Ok::<(), Box<dyn std::error::Error>>(())
587 /// ```
588 #[inline]
589 pub fn try_search_fwd(
590 &self,
591 cache: &mut Cache,
592 input: &Input<'_>,
593 ) -> Result<Option<HalfMatch>, MatchError> {
594 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
595 let hm = match search::find_fwd(self, cache, input)? {
596 None => return Ok(None),
597 Some(hm) if !utf8empty => return Ok(Some(hm)),
598 Some(hm) => hm,
599 };
600 // We get to this point when we know our DFA can match the empty string
601 // AND when UTF-8 mode is enabled. In this case, we skip any matches
602 // whose offset splits a codepoint. Such a match is necessarily a
603 // zero-width match, because UTF-8 mode requires the underlying NFA
604 // to be built such that all non-empty matches span valid UTF-8.
605 // Therefore, any match that ends in the middle of a codepoint cannot
606 // be part of a span of valid UTF-8 and thus must be an empty match.
607 // In such cases, we skip it, so as not to report matches that split a
608 // codepoint.
609 //
610 // Note that this is not a checked assumption. Callers *can* provide an
611 // NFA with UTF-8 mode enabled but produces non-empty matches that span
612 // invalid UTF-8. But doing so is documented to result in unspecified
613 // behavior.
614 empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
615 let got = search::find_fwd(self, cache, input)?;
616 Ok(got.map(|hm| (hm, hm.offset())))
617 })
618 }
619
620 /// Executes a reverse search and returns the start position of the
621 /// leftmost match that is found. If no match exists, then `None` is
622 /// returned.
623 ///
624 /// # Errors
625 ///
626 /// This routine errors if the search could not complete. This can occur
627 /// in a number of circumstances:
628 ///
629 /// * The configuration of the lazy DFA may permit it to "quit" the search.
630 /// For example, setting quit bytes or enabling heuristic support for
631 /// Unicode word boundaries. The default configuration does not enable any
632 /// option that could result in the lazy DFA quitting.
633 /// * The configuration of the lazy DFA may also permit it to "give up"
634 /// on a search if it makes ineffective use of its transition table
635 /// cache. The default configuration does not enable this, although it is
636 /// typically a good idea to do so.
637 /// * When the provided `Input` configuration is not supported. For
638 /// example, by providing an unsupported anchor mode.
639 ///
640 /// When a search returns an error, callers cannot know whether a match
641 /// exists or not.
642 ///
643 /// # Example
644 ///
645 /// This routine is principally useful when used in
646 /// conjunction with the
647 /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
648 /// configuration. In general, it's unlikely to be correct to use both
649 /// `try_search_fwd` and `try_search_rev` with the same DFA since any
650 /// particular DFA will only support searching in one direction with
651 /// respect to the pattern.
652 ///
653 /// ```
654 /// use regex_automata::{
655 /// nfa::thompson,
656 /// hybrid::dfa::DFA,
657 /// HalfMatch, Input,
658 /// };
659 ///
660 /// let dfa = DFA::builder()
661 /// .thompson(thompson::Config::new().reverse(true))
662 /// .build("foo[0-9]+")?;
663 /// let mut cache = dfa.create_cache();
664 /// let expected = HalfMatch::must(0, 0);
665 /// assert_eq!(
666 /// Some(expected),
667 /// dfa.try_search_rev(&mut cache, &Input::new("foo12345"))?,
668 /// );
669 ///
670 /// // Even though a match is found after reading the last byte (`c`),
671 /// // the leftmost first match semantics demand that we find the earliest
672 /// // match that prefers earlier parts of the pattern over later parts.
673 /// let dfa = DFA::builder()
674 /// .thompson(thompson::Config::new().reverse(true))
675 /// .build("abc|c")?;
676 /// let mut cache = dfa.create_cache();
677 /// let expected = HalfMatch::must(0, 0);
678 /// assert_eq!(Some(expected), dfa.try_search_rev(
679 /// &mut cache, &Input::new("abc"))?,
680 /// );
681 ///
682 /// # Ok::<(), Box<dyn std::error::Error>>(())
683 /// ```
684 ///
685 /// # Example: UTF-8 mode
686 ///
687 /// This example demonstrates that UTF-8 mode applies to reverse
688 /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
689 /// matches reported must correspond to valid UTF-8 spans. This includes
690 /// prohibiting zero-width matches that split a codepoint.
691 ///
692 /// UTF-8 mode is enabled by default. Notice below how the only zero-width
693 /// matches reported are those at UTF-8 boundaries:
694 ///
695 /// ```
696 /// use regex_automata::{
697 /// hybrid::dfa::DFA,
698 /// nfa::thompson,
699 /// HalfMatch, Input, MatchKind,
700 /// };
701 ///
702 /// let dfa = DFA::builder()
703 /// .thompson(thompson::Config::new().reverse(true))
704 /// .build(r"")?;
705 /// let mut cache = dfa.create_cache();
706 ///
707 /// // Run the reverse DFA to collect all matches.
708 /// let mut input = Input::new("☃");
709 /// let mut matches = vec![];
710 /// loop {
711 /// match dfa.try_search_rev(&mut cache, &input)? {
712 /// None => break,
713 /// Some(hm) => {
714 /// matches.push(hm);
715 /// if hm.offset() == 0 || input.end() == 0 {
716 /// break;
717 /// } else if hm.offset() < input.end() {
718 /// input.set_end(hm.offset());
719 /// } else {
720 /// // This is only necessary to handle zero-width
721 /// // matches, which of course occur in this example.
722 /// // Without this, the search would never advance
723 /// // backwards beyond the initial match.
724 /// input.set_end(input.end() - 1);
725 /// }
726 /// }
727 /// }
728 /// }
729 ///
730 /// // No matches split a codepoint.
731 /// let expected = vec![
732 /// HalfMatch::must(0, 3),
733 /// HalfMatch::must(0, 0),
734 /// ];
735 /// assert_eq!(expected, matches);
736 ///
737 /// # Ok::<(), Box<dyn std::error::Error>>(())
738 /// ```
739 ///
740 /// Now let's look at the same example, but with UTF-8 mode on the
741 /// underlying NFA disabled:
742 ///
743 /// ```
744 /// use regex_automata::{
745 /// hybrid::dfa::DFA,
746 /// nfa::thompson,
747 /// HalfMatch, Input, MatchKind,
748 /// };
749 ///
750 /// let dfa = DFA::builder()
751 /// .thompson(thompson::Config::new().reverse(true).utf8(false))
752 /// .build(r"")?;
753 /// let mut cache = dfa.create_cache();
754 ///
755 /// // Run the reverse DFA to collect all matches.
756 /// let mut input = Input::new("☃");
757 /// let mut matches = vec![];
758 /// loop {
759 /// match dfa.try_search_rev(&mut cache, &input)? {
760 /// None => break,
761 /// Some(hm) => {
762 /// matches.push(hm);
763 /// if hm.offset() == 0 || input.end() == 0 {
764 /// break;
765 /// } else if hm.offset() < input.end() {
766 /// input.set_end(hm.offset());
767 /// } else {
768 /// // This is only necessary to handle zero-width
769 /// // matches, which of course occur in this example.
770 /// // Without this, the search would never advance
771 /// // backwards beyond the initial match.
772 /// input.set_end(input.end() - 1);
773 /// }
774 /// }
775 /// }
776 /// }
777 ///
778 /// // No matches split a codepoint.
779 /// let expected = vec![
780 /// HalfMatch::must(0, 3),
781 /// HalfMatch::must(0, 2),
782 /// HalfMatch::must(0, 1),
783 /// HalfMatch::must(0, 0),
784 /// ];
785 /// assert_eq!(expected, matches);
786 ///
787 /// # Ok::<(), Box<dyn std::error::Error>>(())
788 /// ```
789 #[inline]
790 pub fn try_search_rev(
791 &self,
792 cache: &mut Cache,
793 input: &Input<'_>,
794 ) -> Result<Option<HalfMatch>, MatchError> {
795 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
796 let hm = match search::find_rev(self, cache, input)? {
797 None => return Ok(None),
798 Some(hm) if !utf8empty => return Ok(Some(hm)),
799 Some(hm) => hm,
800 };
801 empty::skip_splits_rev(input, hm, hm.offset(), |input| {
802 let got = search::find_rev(self, cache, input)?;
803 Ok(got.map(|hm| (hm, hm.offset())))
804 })
805 }
806
807 /// Executes an overlapping forward search and returns the end position of
808 /// matches as they are found. If no match exists, then `None` is returned.
809 ///
810 /// This routine is principally only useful when searching for multiple
811 /// patterns on inputs where multiple patterns may match the same regions
812 /// of text. In particular, callers must preserve the automaton's search
813 /// state from prior calls so that the implementation knows where the last
814 /// match occurred.
815 ///
816 /// When using this routine to implement an iterator of overlapping
817 /// matches, the `start` of the search should remain invariant throughout
818 /// iteration. The `OverlappingState` given to the search will keep track
819 /// of the current position of the search. (This is because multiple
820 /// matches may be reported at the same position, so only the search
821 /// implementation itself knows when to advance the position.)
822 ///
823 /// If for some reason you want the search to forget about its previous
824 /// state and restart the search at a particular position, then setting the
825 /// state to [`OverlappingState::start`] will accomplish that.
826 ///
827 /// # Errors
828 ///
829 /// This routine errors if the search could not complete. This can occur
830 /// in a number of circumstances:
831 ///
832 /// * The configuration of the lazy DFA may permit it to "quit" the search.
833 /// For example, setting quit bytes or enabling heuristic support for
834 /// Unicode word boundaries. The default configuration does not enable any
835 /// option that could result in the lazy DFA quitting.
836 /// * The configuration of the lazy DFA may also permit it to "give up"
837 /// on a search if it makes ineffective use of its transition table
838 /// cache. The default configuration does not enable this, although it is
839 /// typically a good idea to do so.
840 /// * When the provided `Input` configuration is not supported. For
841 /// example, by providing an unsupported anchor mode.
842 ///
843 /// When a search returns an error, callers cannot know whether a match
844 /// exists or not.
845 ///
846 /// # Example
847 ///
848 /// This example shows how to run a basic overlapping search. Notice
849 /// that we build the automaton with a `MatchKind::All` configuration.
850 /// Overlapping searches are unlikely to work as one would expect when
851 /// using the default `MatchKind::LeftmostFirst` match semantics, since
852 /// leftmost-first matching is fundamentally incompatible with overlapping
853 /// searches. Namely, overlapping searches need to report matches as they
854 /// are seen, whereas leftmost-first searches will continue searching even
855 /// after a match has been observed in order to find the conventional end
856 /// position of the match. More concretely, leftmost-first searches use
857 /// dead states to terminate a search after a specific match can no longer
858 /// be extended. Overlapping searches instead do the opposite by continuing
859 /// the search to find totally new matches (potentially of other patterns).
860 ///
861 /// ```
862 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
863 /// use regex_automata::{
864 /// hybrid::dfa::{DFA, OverlappingState},
865 /// HalfMatch, Input, MatchKind,
866 /// };
867 ///
868 /// let dfa = DFA::builder()
869 /// .configure(DFA::config().match_kind(MatchKind::All))
870 /// .build_many(&[r"\w+$", r"\S+$"])?;
871 /// let mut cache = dfa.create_cache();
872 ///
873 /// let haystack = "@foo";
874 /// let mut state = OverlappingState::start();
875 ///
876 /// let expected = Some(HalfMatch::must(1, 4));
877 /// dfa.try_search_overlapping_fwd(
878 /// &mut cache, &Input::new(haystack), &mut state,
879 /// )?;
880 /// assert_eq!(expected, state.get_match());
881 ///
882 /// // The first pattern also matches at the same position, so re-running
883 /// // the search will yield another match. Notice also that the first
884 /// // pattern is returned after the second. This is because the second
885 /// // pattern begins its match before the first, is therefore an earlier
886 /// // match and is thus reported first.
887 /// let expected = Some(HalfMatch::must(0, 4));
888 /// dfa.try_search_overlapping_fwd(
889 /// &mut cache, &Input::new(haystack), &mut state,
890 /// )?;
891 /// assert_eq!(expected, state.get_match());
892 ///
893 /// # Ok::<(), Box<dyn std::error::Error>>(())
894 /// ```
895 #[inline]
896 pub fn try_search_overlapping_fwd(
897 &self,
898 cache: &mut Cache,
899 input: &Input<'_>,
900 state: &mut OverlappingState,
901 ) -> Result<(), MatchError> {
902 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
903 search::find_overlapping_fwd(self, cache, input, state)?;
904 match state.get_match() {
905 None => Ok(()),
906 Some(_) if !utf8empty => Ok(()),
907 Some(_) => skip_empty_utf8_splits_overlapping(
908 input,
909 state,
910 |input, state| {
911 search::find_overlapping_fwd(self, cache, input, state)
912 },
913 ),
914 }
915 }
916
917 /// Executes a reverse overlapping search and returns the start position
918 /// of the leftmost match that is found. If no match exists, then
919 /// `None` is returned.
920 ///
921 /// When using this routine to implement an iterator of overlapping
922 /// matches, the `start` of the search should remain invariant throughout
923 /// iteration. The `OverlappingState` given to the search will keep track
924 /// of the current position of the search. (This is because multiple
925 /// matches may be reported at the same position, so only the search
926 /// implementation itself knows when to advance the position.)
927 ///
928 /// If for some reason you want the search to forget about its previous
929 /// state and restart the search at a particular position, then setting the
930 /// state to [`OverlappingState::start`] will accomplish that.
931 ///
932 /// # Errors
933 ///
934 /// This routine errors if the search could not complete. This can occur
935 /// in a number of circumstances:
936 ///
937 /// * The configuration of the lazy DFA may permit it to "quit" the search.
938 /// For example, setting quit bytes or enabling heuristic support for
939 /// Unicode word boundaries. The default configuration does not enable any
940 /// option that could result in the lazy DFA quitting.
941 /// * The configuration of the lazy DFA may also permit it to "give up"
942 /// on a search if it makes ineffective use of its transition table
943 /// cache. The default configuration does not enable this, although it is
944 /// typically a good idea to do so.
945 /// * When the provided `Input` configuration is not supported. For
946 /// example, by providing an unsupported anchor mode.
947 ///
948 /// When a search returns an error, callers cannot know whether a match
949 /// exists or not.
950 ///
951 /// # Example: UTF-8 mode
952 ///
953 /// This example demonstrates that UTF-8 mode applies to reverse
954 /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
955 /// matches reported must correspond to valid UTF-8 spans. This includes
956 /// prohibiting zero-width matches that split a codepoint.
957 ///
958 /// UTF-8 mode is enabled by default. Notice below how the only zero-width
959 /// matches reported are those at UTF-8 boundaries:
960 ///
961 /// ```
962 /// use regex_automata::{
963 /// hybrid::dfa::{DFA, OverlappingState},
964 /// nfa::thompson,
965 /// HalfMatch, Input, MatchKind,
966 /// };
967 ///
968 /// let dfa = DFA::builder()
969 /// .configure(DFA::config().match_kind(MatchKind::All))
970 /// .thompson(thompson::Config::new().reverse(true))
971 /// .build_many(&[r"", r"☃"])?;
972 /// let mut cache = dfa.create_cache();
973 ///
974 /// // Run the reverse DFA to collect all matches.
975 /// let input = Input::new("☃");
976 /// let mut state = OverlappingState::start();
977 /// let mut matches = vec![];
978 /// loop {
979 /// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
980 /// match state.get_match() {
981 /// None => break,
982 /// Some(hm) => matches.push(hm),
983 /// }
984 /// }
985 ///
986 /// // No matches split a codepoint.
987 /// let expected = vec![
988 /// HalfMatch::must(0, 3),
989 /// HalfMatch::must(1, 0),
990 /// HalfMatch::must(0, 0),
991 /// ];
992 /// assert_eq!(expected, matches);
993 ///
994 /// # Ok::<(), Box<dyn std::error::Error>>(())
995 /// ```
996 ///
997 /// Now let's look at the same example, but with UTF-8 mode on the
998 /// underlying NFA disabled:
999 ///
1000 /// ```
1001 /// use regex_automata::{
1002 /// hybrid::dfa::{DFA, OverlappingState},
1003 /// nfa::thompson,
1004 /// HalfMatch, Input, MatchKind,
1005 /// };
1006 ///
1007 /// let dfa = DFA::builder()
1008 /// .configure(DFA::config().match_kind(MatchKind::All))
1009 /// .thompson(thompson::Config::new().reverse(true).utf8(false))
1010 /// .build_many(&[r"", r"☃"])?;
1011 /// let mut cache = dfa.create_cache();
1012 ///
1013 /// // Run the reverse DFA to collect all matches.
1014 /// let input = Input::new("☃");
1015 /// let mut state = OverlappingState::start();
1016 /// let mut matches = vec![];
1017 /// loop {
1018 /// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
1019 /// match state.get_match() {
1020 /// None => break,
1021 /// Some(hm) => matches.push(hm),
1022 /// }
1023 /// }
1024 ///
1025 /// // Now *all* positions match, even within a codepoint,
1026 /// // because we lifted the requirement that matches
1027 /// // correspond to valid UTF-8 spans.
1028 /// let expected = vec![
1029 /// HalfMatch::must(0, 3),
1030 /// HalfMatch::must(0, 2),
1031 /// HalfMatch::must(0, 1),
1032 /// HalfMatch::must(1, 0),
1033 /// HalfMatch::must(0, 0),
1034 /// ];
1035 /// assert_eq!(expected, matches);
1036 ///
1037 /// # Ok::<(), Box<dyn std::error::Error>>(())
1038 /// ```
1039 #[inline]
1040 pub fn try_search_overlapping_rev(
1041 &self,
1042 cache: &mut Cache,
1043 input: &Input<'_>,
1044 state: &mut OverlappingState,
1045 ) -> Result<(), MatchError> {
1046 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1047 search::find_overlapping_rev(self, cache, input, state)?;
1048 match state.get_match() {
1049 None => Ok(()),
1050 Some(_) if !utf8empty => Ok(()),
1051 Some(_) => skip_empty_utf8_splits_overlapping(
1052 input,
1053 state,
1054 |input, state| {
1055 search::find_overlapping_rev(self, cache, input, state)
1056 },
1057 ),
1058 }
1059 }
1060
1061 /// Writes the set of patterns that match anywhere in the given search
1062 /// configuration to `patset`. If multiple patterns match at the same
1063 /// position and the underlying DFA supports overlapping matches, then all
1064 /// matching patterns are written to the given set.
1065 ///
1066 /// Unless all of the patterns in this DFA are anchored, generally
1067 /// speaking, this will visit every byte in the haystack.
1068 ///
1069 /// This search routine *does not* clear the pattern set. This gives some
1070 /// flexibility to the caller (e.g., running multiple searches with the
1071 /// same pattern set), but does make the API bug-prone if you're reusing
1072 /// the same pattern set for multiple searches but intend them to be
1073 /// independent.
1074 ///
1075 /// If a pattern ID matched but the given `PatternSet` does not have
1076 /// sufficient capacity to store it, then it is not inserted and silently
1077 /// dropped.
1078 ///
1079 /// # Errors
1080 ///
1081 /// This routine errors if the search could not complete. This can occur
1082 /// in a number of circumstances:
1083 ///
1084 /// * The configuration of the lazy DFA may permit it to "quit" the search.
1085 /// For example, setting quit bytes or enabling heuristic support for
1086 /// Unicode word boundaries. The default configuration does not enable any
1087 /// option that could result in the lazy DFA quitting.
1088 /// * The configuration of the lazy DFA may also permit it to "give up"
1089 /// on a search if it makes ineffective use of its transition table
1090 /// cache. The default configuration does not enable this, although it is
1091 /// typically a good idea to do so.
1092 /// * When the provided `Input` configuration is not supported. For
1093 /// example, by providing an unsupported anchor mode.
1094 ///
1095 /// When a search returns an error, callers cannot know whether a match
1096 /// exists or not.
1097 ///
1098 /// # Example
1099 ///
1100 /// This example shows how to find all matching patterns in a haystack,
1101 /// even when some patterns match at the same position as other patterns.
1102 ///
1103 /// ```
1104 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1105 /// use regex_automata::{
1106 /// hybrid::dfa::DFA,
1107 /// Input, MatchKind, PatternSet,
1108 /// };
1109 ///
1110 /// let patterns = &[
1111 /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
1112 /// ];
1113 /// let dfa = DFA::builder()
1114 /// .configure(DFA::config().match_kind(MatchKind::All))
1115 /// .build_many(patterns)?;
1116 /// let mut cache = dfa.create_cache();
1117 ///
1118 /// let input = Input::new("foobar");
1119 /// let mut patset = PatternSet::new(dfa.pattern_len());
1120 /// dfa.try_which_overlapping_matches(&mut cache, &input, &mut patset)?;
1121 /// let expected = vec![0, 2, 3, 4, 6];
1122 /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
1123 /// assert_eq!(expected, got);
1124 ///
1125 /// # Ok::<(), Box<dyn std::error::Error>>(())
1126 /// ```
1127 #[inline]
1128 pub fn try_which_overlapping_matches(
1129 &self,
1130 cache: &mut Cache,
1131 input: &Input<'_>,
1132 patset: &mut PatternSet,
1133 ) -> Result<(), MatchError> {
1134 let mut state = OverlappingState::start();
1135 while let Some(m) = {
1136 self.try_search_overlapping_fwd(cache, input, &mut state)?;
1137 state.get_match()
1138 } {
1139 let _ = patset.try_insert(m.pattern());
1140 // There's nothing left to find, so we can stop. Or the caller
1141 // asked us to.
1142 if patset.is_full() || input.get_earliest() {
1143 break;
1144 }
1145 }
1146 Ok(())
1147 }
1148}
1149
1150impl DFA {
1151 /// Transitions from the current state to the next state, given the next
1152 /// byte of input.
1153 ///
1154 /// The given cache is used to either reuse pre-computed state
1155 /// transitions, or to store this newly computed transition for future
1156 /// reuse. Thus, this routine guarantees that it will never return a state
1157 /// ID that has an "unknown" tag.
1158 ///
1159 /// # State identifier validity
1160 ///
1161 /// The only valid value for `current` is the lazy state ID returned
1162 /// by the most recent call to `next_state`, `next_state_untagged`,
1163 /// `next_state_untagged_unchecked`, `start_state_forward` or
1164 /// `start_state_reverse` for the given `cache`. Any state ID returned from
1165 /// prior calls to these routines (with the same `cache`) is considered
1166 /// invalid (even if it gives an appearance of working). State IDs returned
1167 /// from _any_ prior call for different `cache` values are also always
1168 /// invalid.
1169 ///
1170 /// The returned ID is always a valid ID when `current` refers to a valid
1171 /// ID. Moreover, this routine is defined for all possible values of
1172 /// `input`.
1173 ///
1174 /// These validity rules are not checked, even in debug mode. Callers are
1175 /// required to uphold these rules themselves.
1176 ///
1177 /// Violating these state ID validity rules will not sacrifice memory
1178 /// safety, but _may_ produce an incorrect result or a panic.
1179 ///
1180 /// # Panics
1181 ///
1182 /// If the given ID does not refer to a valid state, then this routine
1183 /// may panic but it also may not panic and instead return an invalid or
1184 /// incorrect ID.
1185 ///
1186 /// # Example
1187 ///
1188 /// This shows a simplistic example for walking a lazy DFA for a given
1189 /// haystack by using the `next_state` method.
1190 ///
1191 /// ```
1192 /// use regex_automata::{hybrid::dfa::DFA, Input};
1193 ///
1194 /// let dfa = DFA::new(r"[a-z]+r")?;
1195 /// let mut cache = dfa.create_cache();
1196 /// let haystack = "bar".as_bytes();
1197 ///
1198 /// // The start state is determined by inspecting the position and the
1199 /// // initial bytes of the haystack.
1200 /// let mut sid = dfa.start_state_forward(
1201 /// &mut cache, &Input::new(haystack),
1202 /// )?;
1203 /// // Walk all the bytes in the haystack.
1204 /// for &b in haystack {
1205 /// sid = dfa.next_state(&mut cache, sid, b)?;
1206 /// }
1207 /// // Matches are always delayed by 1 byte, so we must explicitly walk the
1208 /// // special "EOI" transition at the end of the search.
1209 /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1210 /// assert!(sid.is_match());
1211 ///
1212 /// # Ok::<(), Box<dyn std::error::Error>>(())
1213 /// ```
1214 #[inline]
1215 pub fn next_state(
1216 &self,
1217 cache: &mut Cache,
1218 current: LazyStateID,
1219 input: u8,
1220 ) -> Result<LazyStateID, CacheError> {
1221 let class = usize::from(self.classes.get(input));
1222 let offset = current.as_usize_untagged() + class;
1223 let sid = cache.trans[offset];
1224 if !sid.is_unknown() {
1225 return Ok(sid);
1226 }
1227 let unit = alphabet::Unit::u8(input);
1228 Lazy::new(self, cache).cache_next_state(current, unit)
1229 }
1230
1231 /// Transitions from the current state to the next state, given the next
1232 /// byte of input and a state ID that is not tagged.
1233 ///
1234 /// The only reason to use this routine is performance. In particular, the
1235 /// `next_state` method needs to do some additional checks, among them is
1236 /// to account for identifiers to states that are not yet computed. In
1237 /// such a case, the transition is computed on the fly. However, if it is
1238 /// known that the `current` state ID is untagged, then these checks can be
1239 /// omitted.
1240 ///
1241 /// Since this routine does not compute states on the fly, it does not
1242 /// modify the cache and thus cannot return an error. Consequently, `cache`
1243 /// does not need to be mutable and it is possible for this routine to
1244 /// return a state ID corresponding to the special "unknown" state. In
1245 /// this case, it is the caller's responsibility to use the prior state
1246 /// ID and `input` with `next_state` in order to force the computation of
1247 /// the unknown transition. Otherwise, trying to use the "unknown" state
1248 /// ID will just result in transitioning back to itself, and thus never
1249 /// terminating. (This is technically a special exemption to the state ID
1250 /// validity rules, but is permissible since this routine is guaranteed to
1251 /// never mutate the given `cache`, and thus the identifier is guaranteed
1252 /// to remain valid.)
1253 ///
1254 /// See [`LazyStateID`] for more details on what it means for a state ID
1255 /// to be tagged. Also, see
1256 /// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked)
1257 /// for this same idea, but with bounds checks forcefully elided.
1258 ///
1259 /// # State identifier validity
1260 ///
1261 /// The only valid value for `current` is an **untagged** lazy
1262 /// state ID returned by the most recent call to `next_state`,
1263 /// `next_state_untagged`, `next_state_untagged_unchecked`,
1264 /// `start_state_forward` or `start_state_reverse` for the given `cache`.
1265 /// Any state ID returned from prior calls to these routines (with the
1266 /// same `cache`) is considered invalid (even if it gives an appearance
1267 /// of working). State IDs returned from _any_ prior call for different
1268 /// `cache` values are also always invalid.
1269 ///
1270 /// The returned ID is always a valid ID when `current` refers to a valid
1271 /// ID, although it may be tagged. Moreover, this routine is defined for
1272 /// all possible values of `input`.
1273 ///
1274 /// Not all validity rules are checked, even in debug mode. Callers are
1275 /// required to uphold these rules themselves.
1276 ///
1277 /// Violating these state ID validity rules will not sacrifice memory
1278 /// safety, but _may_ produce an incorrect result or a panic.
1279 ///
1280 /// # Panics
1281 ///
1282 /// If the given ID does not refer to a valid state, then this routine
1283 /// may panic but it also may not panic and instead return an invalid or
1284 /// incorrect ID.
1285 ///
1286 /// # Example
1287 ///
1288 /// This shows a simplistic example for walking a lazy DFA for a given
1289 /// haystack by using the `next_state_untagged` method where possible.
1290 ///
1291 /// ```
1292 /// use regex_automata::{hybrid::dfa::DFA, Input};
1293 ///
1294 /// let dfa = DFA::new(r"[a-z]+r")?;
1295 /// let mut cache = dfa.create_cache();
1296 /// let haystack = "bar".as_bytes();
1297 ///
1298 /// // The start state is determined by inspecting the position and the
1299 /// // initial bytes of the haystack.
1300 /// let mut sid = dfa.start_state_forward(
1301 /// &mut cache, &Input::new(haystack),
1302 /// )?;
1303 /// // Walk all the bytes in the haystack.
1304 /// let mut at = 0;
1305 /// while at < haystack.len() {
1306 /// if sid.is_tagged() {
1307 /// sid = dfa.next_state(&mut cache, sid, haystack[at])?;
 /// at += 1;
1308 /// } else {
1309 /// let mut prev_sid = sid;
1310 /// // We attempt to chew through as much as we can while moving
1311 /// // through untagged state IDs. Thus, the transition function
1312 /// // does less work on average per byte. (Unrolling this loop
1313 /// // may help even more.)
1314 /// while at < haystack.len() {
1315 /// prev_sid = sid;
1316 /// sid = dfa.next_state_untagged(
1317 /// &mut cache, sid, haystack[at],
1318 /// );
1319 /// at += 1;
1320 /// if sid.is_tagged() {
1321 /// break;
1322 /// }
1323 /// }
1324 /// // We must ensure that we never proceed to the next iteration
1325 /// // with an unknown state ID. If we don't account for this
1326 /// // case, then search isn't guaranteed to terminate since all
1327 /// // transitions on unknown states loop back to itself.
1328 /// if sid.is_unknown() {
1329 /// sid = dfa.next_state(
1330 /// &mut cache, prev_sid, haystack[at - 1],
1331 /// )?;
1332 /// }
1333 /// }
1334 /// }
1335 /// // Matches are always delayed by 1 byte, so we must explicitly walk the
1336 /// // special "EOI" transition at the end of the search.
1337 /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1338 /// assert!(sid.is_match());
1339 ///
1340 /// # Ok::<(), Box<dyn std::error::Error>>(())
1341 /// ```
1342 #[inline]
1343 pub fn next_state_untagged(
1344 &self,
1345 cache: &Cache,
1346 current: LazyStateID,
1347 input: u8,
1348 ) -> LazyStateID {
1349 debug_assert!(!current.is_tagged());
1350 let class = usize::from(self.classes.get(input));
1351 let offset = current.as_usize_unchecked() + class;
1352 cache.trans[offset]
1353 }
1354
1355 /// Transitions from the current state to the next state, eliding bounds
1356 /// checks, given the next byte of input and a state ID that is not tagged.
1357 ///
1358 /// The only reason to use this routine is performance. In particular, the
1359 /// `next_state` method needs to do some additional checks, among them is
1360 /// to account for identifiers to states that are not yet computed. In
1361 /// such a case, the transition is computed on the fly. However, if it is
1362 /// known that the `current` state ID is untagged, then these checks can be
1363 /// omitted.
1364 ///
1365 /// Since this routine does not compute states on the fly, it does not
1366 /// modify the cache and thus cannot return an error. Consequently, `cache`
1367 /// does not need to be mutable and it is possible for this routine to
1368 /// return a state ID corresponding to the special "unknown" state. In
1369 /// this case, it is the caller's responsibility to use the prior state
1370 /// ID and `input` with `next_state` in order to force the computation of
1371 /// the unknown transition. Otherwise, trying to use the "unknown" state
1372 /// ID will just result in transitioning back to itself, and thus never
1373 /// terminating. (This is technically a special exemption to the state ID
1374 /// validity rules, but is permissible since this routine is guaranteed to
1375 /// never mutate the given `cache`, and thus the identifier is guaranteed
1376 /// to remain valid.)
1377 ///
1378 /// See [`LazyStateID`] for more details on what it means for a state ID
1379 /// to be tagged. Also, see
1380 /// [`next_state_untagged`](DFA::next_state_untagged)
1381 /// for this same idea, but with memory safety guaranteed by retaining
1382 /// bounds checks.
1383 ///
1384 /// # State identifier validity
1385 ///
1386 /// The only valid value for `current` is an **untagged** lazy
1387 /// state ID returned by the most recent call to `next_state`,
1388 /// `next_state_untagged`, `next_state_untagged_unchecked`,
1389 /// `start_state_forward` or `start_state_reverse` for the given `cache`.
1390 /// Any state ID returned from prior calls to these routines (with the
1391 /// same `cache`) is considered invalid (even if it gives an appearance
1392 /// of working). State IDs returned from _any_ prior call for different
1393 /// `cache` values are also always invalid.
1394 ///
1395 /// The returned ID is always a valid ID when `current` refers to a valid
1396 /// ID, although it may be tagged. Moreover, this routine is defined for
1397 /// all possible values of `input`.
1398 ///
1399 /// Not all validity rules are checked, even in debug mode. Callers are
1400 /// required to uphold these rules themselves.
1401 ///
1402 /// Violating these state ID validity rules will not sacrifice memory
1403 /// safety, but _may_ produce an incorrect result or a panic.
1404 ///
1405 /// # Safety
1406 ///
1407 /// Callers of this method must guarantee that `current` refers to a valid
1408 /// state ID according to the rules described above. If `current` is not a
1409 /// valid state ID for this automaton, then calling this routine may result
1410 /// in undefined behavior.
1411 ///
1412 /// If `current` is valid, then the ID returned is valid for all possible
1413 /// values of `input`.
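 ///
 /// # Example
 ///
 /// A sketch of a search loop that mirrors the
 /// [`next_state_untagged`](DFA::next_state_untagged) example, but with the
 /// bounds check elided via `unsafe`:
 ///
 /// ```
 /// use regex_automata::{hybrid::dfa::DFA, Input};
 ///
 /// let dfa = DFA::new(r"[a-z]+r")?;
 /// let mut cache = dfa.create_cache();
 /// let haystack = "bar".as_bytes();
 ///
 /// let mut sid = dfa.start_state_forward(
 ///     &mut cache, &Input::new(haystack),
 /// )?;
 /// let mut at = 0;
 /// while at < haystack.len() {
 ///     if sid.is_tagged() {
 ///         sid = dfa.next_state(&mut cache, sid, haystack[at])?;
 ///         at += 1;
 ///     } else {
 ///         let mut prev_sid = sid;
 ///         while at < haystack.len() {
 ///             prev_sid = sid;
 ///             // SAFETY: 'sid' was returned by the most recent transition
 ///             // or start state call on this same cache, and the checks
 ///             // above guarantee that it is untagged.
 ///             sid = unsafe {
 ///                 dfa.next_state_untagged_unchecked(
 ///                     &cache, sid, haystack[at],
 ///                 )
 ///             };
 ///             at += 1;
 ///             if sid.is_tagged() {
 ///                 break;
 ///             }
 ///         }
 ///         // Never continue with an "unknown" state ID, since transitions
 ///         // on unknown states just loop back to themselves. Recompute the
 ///         // transition with the checked routine instead.
 ///         if sid.is_unknown() {
 ///             sid = dfa.next_state(
 ///                 &mut cache, prev_sid, haystack[at - 1],
 ///             )?;
 ///         }
 ///     }
 /// }
 /// // Matches are always delayed by 1 byte, so we must explicitly walk the
 /// // special "EOI" transition at the end of the search.
 /// sid = dfa.next_eoi_state(&mut cache, sid)?;
 /// assert!(sid.is_match());
 ///
 /// # Ok::<(), Box<dyn std::error::Error>>(())
 /// ```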
1414 #[inline]
1415 pub unsafe fn next_state_untagged_unchecked(
1416 &self,
1417 cache: &Cache,
1418 current: LazyStateID,
1419 input: u8,
1420 ) -> LazyStateID {
1421 debug_assert!(!current.is_tagged());
1422 let class = usize::from(self.classes.get(input));
1423 let offset = current.as_usize_unchecked() + class;
1424 *cache.trans.get_unchecked(offset)
1425 }
1426
1427 /// Transitions from the current state to the next state for the special
1428 /// EOI symbol.
1429 ///
1430 /// The given cache is used to either reuse pre-computed state
1431 /// transitions, or to store this newly computed transition for future
1432 /// reuse. Thus, this routine guarantees that it will never return a state
1433 /// ID that has an "unknown" tag.
1434 ///
1435 /// This routine must be called at the end of every search in a correct
1436 /// implementation of search. Namely, lazy DFAs in this crate delay matches
1437 /// by one byte in order to support look-around operators. Thus, after
1438 /// reaching the end of a haystack, a search implementation must follow one
1439 /// last EOI transition.
1440 ///
1441 /// It is best to think of EOI as an additional symbol in the alphabet of a
1442 /// DFA that is distinct from every other symbol. That is, the alphabet of
1443 /// lazy DFAs in this crate has a logical size of 257 instead of 256, where
1444 /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
1445 /// physical alphabet size may be smaller because of alphabet compression
1446 /// via equivalence classes, but EOI is always represented somehow in the
1447 /// alphabet.)
1448 ///
1449 /// # State identifier validity
1450 ///
1451 /// The only valid value for `current` is the lazy state ID returned
1452 /// by the most recent call to `next_state`, `next_state_untagged`,
1453 /// `next_state_untagged_unchecked`, `start_state_forward` or
1454    /// `start_state_reverse` for the given `cache`. Any state ID returned from
1455 /// prior calls to these routines (with the same `cache`) is considered
1456 /// invalid (even if it gives an appearance of working). State IDs returned
1457 /// from _any_ prior call for different `cache` values are also always
1458 /// invalid.
1459 ///
1460 /// The returned ID is always a valid ID when `current` refers to a valid
1461 /// ID.
1462 ///
1463 /// These validity rules are not checked, even in debug mode. Callers are
1464 /// required to uphold these rules themselves.
1465 ///
1466 /// Violating these state ID validity rules will not sacrifice memory
1467 /// safety, but _may_ produce an incorrect result or a panic.
1468 ///
1469 /// # Panics
1470 ///
1471 /// If the given ID does not refer to a valid state, then this routine
1472 /// may panic but it also may not panic and instead return an invalid or
1473 /// incorrect ID.
1474 ///
1475 /// # Example
1476 ///
1477 /// This shows a simplistic example for walking a DFA for a given haystack,
1478 /// and then finishing the search with the final EOI transition.
1479 ///
1480 /// ```
1481 /// use regex_automata::{hybrid::dfa::DFA, Input};
1482 ///
1483 /// let dfa = DFA::new(r"[a-z]+r")?;
1484 /// let mut cache = dfa.create_cache();
1485 /// let haystack = "bar".as_bytes();
1486 ///
1487 /// // The start state is determined by inspecting the position and the
1488 /// // initial bytes of the haystack.
1489 /// let mut sid = dfa.start_state_forward(
1490 /// &mut cache, &Input::new(haystack),
1491 /// )?;
1492 /// // Walk all the bytes in the haystack.
1493 /// for &b in haystack {
1494 /// sid = dfa.next_state(&mut cache, sid, b)?;
1495 /// }
1496 /// // Matches are always delayed by 1 byte, so we must explicitly walk
1497 /// // the special "EOI" transition at the end of the search. Without this
1498 /// // final transition, the assert below will fail since the DFA will not
1499 /// // have entered a match state yet!
1500 /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1501 /// assert!(sid.is_match());
1502 ///
1503 /// # Ok::<(), Box<dyn std::error::Error>>(())
1504 /// ```
1505 #[inline]
1506 pub fn next_eoi_state(
1507 &self,
1508 cache: &mut Cache,
1509 current: LazyStateID,
1510 ) -> Result<LazyStateID, CacheError> {
1511 let eoi = self.classes.eoi().as_usize();
1512 let offset = current.as_usize_untagged() + eoi;
1513 let sid = cache.trans[offset];
1514 if !sid.is_unknown() {
1515 return Ok(sid);
1516 }
1517 let unit = self.classes.eoi();
1518 Lazy::new(self, cache).cache_next_state(current, unit)
1519 }
1520
1521 /// Return the ID of the start state for this lazy DFA for the given
1522 /// starting configuration.
1523 ///
1524 /// Unlike typical DFA implementations, the start state for DFAs in this
1525 /// crate is dependent on a few different factors:
1526 ///
1527 /// * The [`Anchored`] mode of the search. Unanchored, anchored and
1528 /// anchored searches for a specific [`PatternID`] all use different start
1529 /// states.
1530 /// * Whether a "look-behind" byte exists. For example, the `^` anchor
1531 /// matches if and only if there is no look-behind byte.
1532 /// * The specific value of that look-behind byte. For example, a `(?m:^)`
1533 /// assertion only matches when there is either no look-behind byte, or
1534 /// when the look-behind byte is a line terminator.
1535 ///
1536 /// The [starting configuration](start::Config) provides the above
1537 /// information.
1538 ///
1539 /// This routine can be used for either forward or reverse searches.
1540 /// Although, as a convenience, if you have an [`Input`], then it
1541 /// may be more succinct to use [`DFA::start_state_forward`] or
1542 /// [`DFA::start_state_reverse`]. Note, for example, that the convenience
1543    /// routines return a [`MatchError`] on failure whereas this routine
1544 /// returns a [`StartError`].
1545 ///
1546 /// # Errors
1547 ///
1548 /// This may return a [`StartError`] if the search needs to give up when
1549 /// determining the start state (for example, if it sees a "quit" byte
1550 /// or if the cache has become inefficient). This can also return an
1551 /// error if the given configuration contains an unsupported [`Anchored`]
1552 /// configuration.
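    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore` since it assumes a `dfa` and a
    /// `cache` are already in scope) of building a starting configuration by
    /// hand and resolving its start state:
    ///
    /// ```ignore
    /// use regex_automata::{util::start, Anchored};
    ///
    /// // An anchored search that begins just after an ASCII space in the
    /// // haystack. Since a look-behind byte exists and it is not a line
    /// // terminator, neither `^` nor `(?m:^)` can match at this position.
    /// let config = start::Config::new()
    ///     .anchored(Anchored::Yes)
    ///     .look_behind(Some(b' '));
    /// let sid = dfa.start_state(&mut cache, &config)?;
    /// ```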
1553 #[cfg_attr(feature = "perf-inline", inline(always))]
1554 pub fn start_state(
1555 &self,
1556 cache: &mut Cache,
1557 config: &start::Config,
1558 ) -> Result<LazyStateID, StartError> {
1559 let lazy = LazyRef::new(self, cache);
1560 let anchored = config.get_anchored();
1561 let start = match config.get_look_behind() {
1562 None => Start::Text,
1563 Some(byte) => {
1564 if !self.quitset.is_empty() && self.quitset.contains(byte) {
1565 return Err(StartError::quit(byte));
1566 }
1567 self.start_map.get(byte)
1568 }
1569 };
1570 let start_id = lazy.get_cached_start_id(anchored, start)?;
1571 if !start_id.is_unknown() {
1572 return Ok(start_id);
1573 }
1574 Lazy::new(self, cache).cache_start_group(anchored, start)
1575 }
1576
1577 /// Return the ID of the start state for this lazy DFA when executing a
1578 /// forward search.
1579 ///
1580 /// This is a convenience routine for calling [`DFA::start_state`] that
1581 /// converts the given [`Input`] to a [start configuration](start::Config).
1582 /// Additionally, if an error occurs, it is converted from a [`StartError`]
1583 /// to a [`MatchError`] using the offset information in the given
1584 /// [`Input`].
1585 ///
1586 /// # Errors
1587 ///
1588 /// This may return a [`MatchError`] if the search needs to give up when
1589 /// determining the start state (for example, if it sees a "quit" byte or
1590 /// if the cache has become inefficient). This can also return an error if
1591 /// the given `Input` contains an unsupported [`Anchored`] configuration.
1592 #[cfg_attr(feature = "perf-inline", inline(always))]
1593 pub fn start_state_forward(
1594 &self,
1595 cache: &mut Cache,
1596 input: &Input<'_>,
1597 ) -> Result<LazyStateID, MatchError> {
1598 let config = start::Config::from_input_forward(input);
1599 self.start_state(cache, &config).map_err(|err| match err {
1600 StartError::Cache { .. } => MatchError::gave_up(input.start()),
1601 StartError::Quit { byte } => {
1602 let offset = input
1603 .start()
1604 .checked_sub(1)
1605 .expect("no quit in start without look-behind");
1606 MatchError::quit(byte, offset)
1607 }
1608 StartError::UnsupportedAnchored { mode } => {
1609 MatchError::unsupported_anchored(mode)
1610 }
1611 })
1612 }
1613
1614 /// Return the ID of the start state for this lazy DFA when executing a
1615 /// reverse search.
1616 ///
1617 /// This is a convenience routine for calling [`DFA::start_state`] that
1618 /// converts the given [`Input`] to a [start configuration](start::Config).
1619 /// Additionally, if an error occurs, it is converted from a [`StartError`]
1620 /// to a [`MatchError`] using the offset information in the given
1621 /// [`Input`].
1622 ///
1623 /// # Errors
1624 ///
1625 /// This may return a [`MatchError`] if the search needs to give up when
1626 /// determining the start state (for example, if it sees a "quit" byte or
1627 /// if the cache has become inefficient). This can also return an error if
1628 /// the given `Input` contains an unsupported [`Anchored`] configuration.
1629 #[cfg_attr(feature = "perf-inline", inline(always))]
1630 pub fn start_state_reverse(
1631 &self,
1632 cache: &mut Cache,
1633 input: &Input<'_>,
1634 ) -> Result<LazyStateID, MatchError> {
1635 let config = start::Config::from_input_reverse(input);
1636 self.start_state(cache, &config).map_err(|err| match err {
1637 StartError::Cache { .. } => MatchError::gave_up(input.end()),
1638 StartError::Quit { byte } => {
1639 let offset = input.end();
1640 MatchError::quit(byte, offset)
1641 }
1642 StartError::UnsupportedAnchored { mode } => {
1643 MatchError::unsupported_anchored(mode)
1644 }
1645 })
1646 }
1647
1648 /// Returns the total number of patterns that match in this state.
1649 ///
1650 /// If the lazy DFA was compiled with one pattern, then this must
1651 /// necessarily always return `1` for all match states.
1652 ///
1653 /// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with
1654 /// indices up to (but not including) the length returned by this routine
1655 /// without panicking.
1656 ///
1657 /// # Panics
1658 ///
1659 /// If the given state is not a match state, then this may either panic
1660 /// or return an incorrect result.
1661 ///
1662 /// # Example
1663 ///
1664 /// This example shows a simple instance of implementing overlapping
1665 /// matches. In particular, it shows not only how to determine how many
1666 /// patterns have matched in a particular state, but also how to access
1667 /// which specific patterns have matched.
1668 ///
1669 /// Notice that we must use [`MatchKind::All`] when building the DFA. If we
1670 /// used [`MatchKind::LeftmostFirst`] instead, then the DFA would not be
1671 /// constructed in a way that supports overlapping matches. (It would only
1672 /// report a single pattern that matches at any particular point in time.)
1673 ///
1674 /// Another thing to take note of is the patterns used and the order in
1675 /// which the pattern IDs are reported. In the example below, pattern `3`
1676 /// is yielded first. Why? Because it corresponds to the match that
1677 /// appears first. Namely, the `@` symbol is part of `\S+` but not part
1678 /// of any of the other patterns. Since the `\S+` pattern has a match that
1679 /// starts to the left of any other pattern, its ID is returned before any
1680 /// other.
1681 ///
1682 /// ```
1683 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1684 /// use regex_automata::{hybrid::dfa::DFA, Input, MatchKind};
1685 ///
1686 /// let dfa = DFA::builder()
1687 /// .configure(DFA::config().match_kind(MatchKind::All))
1688 /// .build_many(&[
1689 /// r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+",
1690 /// ])?;
1691 /// let mut cache = dfa.create_cache();
1692 /// let haystack = "@bar".as_bytes();
1693 ///
1694 /// // The start state is determined by inspecting the position and the
1695 /// // initial bytes of the haystack.
1696 /// let mut sid = dfa.start_state_forward(
1697 /// &mut cache, &Input::new(haystack),
1698 /// )?;
1699 /// // Walk all the bytes in the haystack.
1700 /// for &b in haystack {
1701 /// sid = dfa.next_state(&mut cache, sid, b)?;
1702 /// }
1703 /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1704 ///
1705 /// assert!(sid.is_match());
1706 /// assert_eq!(dfa.match_len(&mut cache, sid), 3);
1707 /// // The following calls are guaranteed to not panic since `match_len`
1708 /// // returned `3` above.
1709 /// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3);
1710 /// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0);
1711 /// assert_eq!(dfa.match_pattern(&mut cache, sid, 2).as_usize(), 1);
1712 ///
1713 /// # Ok::<(), Box<dyn std::error::Error>>(())
1714 /// ```
1715 #[inline]
1716 pub fn match_len(&self, cache: &Cache, id: LazyStateID) -> usize {
1717 assert!(id.is_match());
1718 LazyRef::new(self, cache).get_cached_state(id).match_len()
1719 }
1720
1721 /// Returns the pattern ID corresponding to the given match index in the
1722 /// given state.
1723 ///
1724 /// See [`DFA::match_len`] for an example of how to use this method
1725 /// correctly. Note that if you know your lazy DFA is configured with a
1726 /// single pattern, then this routine is never necessary since it will
1727 /// always return a pattern ID of `0` for an index of `0` when `id`
1728 /// corresponds to a match state.
1729 ///
1730 /// Typically, this routine is used when implementing an overlapping
1731 /// search, as the example for `DFA::match_len` does.
1732 ///
1733 /// # Panics
1734 ///
1735 /// If the state ID is not a match state or if the match index is out
1736 /// of bounds for the given state, then this routine may either panic
1737 /// or produce an incorrect result. If the state ID is correct and the
1738 /// match index is correct, then this routine always produces a valid
1739 /// `PatternID`.
1740 #[inline]
1741 pub fn match_pattern(
1742 &self,
1743 cache: &Cache,
1744 id: LazyStateID,
1745 match_index: usize,
1746 ) -> PatternID {
1747 // This is an optimization for the very common case of a DFA with a
1748 // single pattern. This conditional avoids a somewhat more costly path
1749 // that finds the pattern ID from the corresponding `State`, which
1750 // requires a bit of slicing/pointer-chasing. This optimization tends
1751 // to only matter when matches are frequent.
1752 if self.pattern_len() == 1 {
1753 return PatternID::ZERO;
1754 }
1755 LazyRef::new(self, cache)
1756 .get_cached_state(id)
1757 .match_pattern(match_index)
1758 }
1759}
1760
1761/// A cache represents a partially computed DFA.
1762///
1763/// A cache is the key component that differentiates a classical DFA and a
1764/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
1765/// complete transition table that can handle all possible inputs, a hybrid
1766/// NFA/DFA starts with an empty transition table and builds only the parts
1767/// required during search. The parts that are built are stored in a cache. For
1768/// this reason, a cache is a required parameter for nearly every operation on
1769/// a [`DFA`].
1770///
1771/// Caches can be created from their corresponding DFA via
1772/// [`DFA::create_cache`]. A cache can only be used with either the DFA that
1773/// created it, or the DFA that was most recently used to reset it with
1774/// [`Cache::reset`]. Using a cache with any other DFA may result in panics
1775/// or incorrect results.
1776#[derive(Clone, Debug)]
1777pub struct Cache {
1778 // N.B. If you're looking to understand how determinization works, it
1779 // is probably simpler to first grok src/dfa/determinize.rs, since that
1780 // doesn't have the "laziness" component.
1781 /// The transition table.
1782 ///
1783 /// Given a `current` LazyStateID and an `input` byte, the next state can
1784 /// be computed via `trans[untagged(current) + equiv_class(input)]`. Notice
1785 /// that no multiplication is used. That's because state identifiers are
1786 /// "premultiplied."
1787 ///
1788 /// Note that the next state may be the "unknown" state. In this case, the
1789 /// next state is not known and determinization for `current` on `input`
1790 /// must be performed.
1791 trans: Vec<LazyStateID>,
1792 /// The starting states for this DFA.
1793 ///
1794 /// These are computed lazily. Initially, these are all set to "unknown"
1795 /// lazy state IDs.
1796 ///
1797 /// When 'starts_for_each_pattern' is disabled (the default), then the size
1798 /// of this is constrained to the possible starting configurations based
1799 /// on the search parameters. (At time of writing, that's 4.) However,
1800 /// when starting states for each pattern is enabled, then there are N
1801 /// additional groups of starting states, where each group reflects the
1802 /// different possible configurations and N is the number of patterns.
1803 starts: Vec<LazyStateID>,
1804 /// A sequence of NFA/DFA powerset states that have been computed for this
1805 /// lazy DFA. This sequence is indexable by untagged LazyStateIDs. (Every
1806 /// tagged LazyStateID can be used to index this sequence by converting it
1807 /// to its untagged form.)
1808 states: Vec<State>,
1809 /// A map from states to their corresponding IDs. This map may be accessed
1810 /// via the raw byte representation of a state, which means that a `State`
1811 /// does not need to be allocated to determine whether it already exists
1812 /// in this map. Indeed, the existence of such a state is what determines
1813 /// whether we allocate a new `State` or not.
1814 ///
1815 /// The higher level idea here is that we do just enough determinization
1816 /// for a state to check whether we've already computed it. If we have,
1817 /// then we can save a little (albeit not much) work. The real savings is
1818 /// in memory usage. If we never checked for trivially duplicate states,
1819 /// then our memory usage would explode to unreasonable levels.
1820 states_to_id: StateMap,
1821 /// Sparse sets used to track which NFA states have been visited during
1822 /// various traversals.
1823 sparses: SparseSets,
1824 /// Scratch space for traversing the NFA graph. (We use space on the heap
1825 /// instead of the call stack.)
1826 stack: Vec<NFAStateID>,
1827 /// Scratch space for building a NFA/DFA powerset state. This is used to
1828 /// help amortize allocation since not every powerset state generated is
1829 /// added to the cache. In particular, if it already exists in the cache,
1830 /// then there is no need to allocate a new `State` for it.
1831 scratch_state_builder: StateBuilderEmpty,
1832 /// A simple abstraction for handling the saving of at most a single state
1833 /// across a cache clearing. This is required for correctness. Namely, if
1834 /// adding a new state after clearing the cache fails, then the caller
1835 /// must retain the ability to continue using the state ID given. The
1836 /// state corresponding to the state ID is what we preserve across cache
1837 /// clearings.
1838 state_saver: StateSaver,
1839 /// The memory usage, in bytes, used by 'states' and 'states_to_id'. We
1840 /// track this as new states are added since states use a variable amount
1841 /// of heap. Tracking this as we add states makes it possible to compute
1842 /// the total amount of memory used by the determinizer in constant time.
1843 memory_usage_state: usize,
1844 /// The number of times the cache has been cleared. When a minimum cache
1845 /// clear count is set, then the cache will return an error instead of
1846 /// clearing the cache if the count has been exceeded.
1847 clear_count: usize,
1848 /// The total number of bytes searched since the last time this cache was
1849 /// cleared, not including the current search.
1850 ///
1851 /// This can be added to the length of the current search to get the true
1852 /// total number of bytes searched.
1853 ///
1854 /// This is generally only non-zero when the
1855 /// `Cache::search_{start,update,finish}` APIs are used to track search
1856 /// progress.
1857 bytes_searched: usize,
1858 /// The progress of the current search.
1859 ///
1860 /// This is only non-`None` when callers utilize the `Cache::search_start`,
1861 /// `Cache::search_update` and `Cache::search_finish` APIs.
1862 ///
1863 /// The purpose of recording search progress is to be able to make a
1864 /// determination about the efficiency of the cache. Namely, by keeping
1865    /// track of the number of bytes searched per state, the lazy DFA can decide whether to give up (see [`Config::minimum_bytes_per_state`]).
1866 progress: Option<SearchProgress>,
1867}
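
// A worked example of the premultiplied indexing scheme used by the `trans`
// and `states` fields above (illustrative numbers only): suppose the DFA's
// alphabet compresses to a stride of 512, the worst case where every byte is
// its own equivalence class plus EOI, rounded up to a power of two (so
// `stride2` is 9). Every untagged state ID is then a multiple of 512, and:
//
//     next       = trans[untagged(current) + equiv_class(input)]
//     state(sid) = states[untagged(sid) >> stride2]
//
// So the state with untagged ID 1024 owns trans[1024..1536] and lives at
// states[2]. This is why the hot path needs no multiplication.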
1868
1869impl Cache {
1870 /// Create a new cache for the given lazy DFA.
1871 ///
1872 /// The cache returned should only be used for searches for the given DFA.
1873 /// If you want to reuse the cache for another DFA, then you must call
1874 /// [`Cache::reset`] with that DFA.
1875 pub fn new(dfa: &DFA) -> Cache {
1876 let mut cache = Cache {
1877 trans: alloc::vec![],
1878 starts: alloc::vec![],
1879 states: alloc::vec![],
1880 states_to_id: StateMap::new(),
1881 sparses: SparseSets::new(dfa.get_nfa().states().len()),
1882 stack: alloc::vec![],
1883 scratch_state_builder: StateBuilderEmpty::new(),
1884 state_saver: StateSaver::none(),
1885 memory_usage_state: 0,
1886 clear_count: 0,
1887 bytes_searched: 0,
1888 progress: None,
1889 };
1890 debug!("pre-init lazy DFA cache size: {}", cache.memory_usage());
1891 Lazy { dfa, cache: &mut cache }.init_cache();
1892 debug!("post-init lazy DFA cache size: {}", cache.memory_usage());
1893 cache
1894 }
1895
1896 /// Reset this cache such that it can be used for searching with the given
1897 /// lazy DFA (and only that DFA).
1898 ///
1899 /// A cache reset permits reusing memory already allocated in this cache
1900 /// with a different lazy DFA.
1901 ///
1902 /// Resetting a cache sets its "clear count" to 0. This is relevant if the
1903 /// lazy DFA has been configured to "give up" after it has cleared the
1904 /// cache a certain number of times.
1905 ///
1906 /// Any lazy state ID generated by the cache prior to resetting it is
1907 /// invalid after the reset.
1908 ///
1909 /// # Example
1910 ///
1911 /// This shows how to re-purpose a cache for use with a different DFA.
1912 ///
1913 /// ```
1914 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1915 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
1916 ///
1917 /// let dfa1 = DFA::new(r"\w")?;
1918 /// let dfa2 = DFA::new(r"\W")?;
1919 ///
1920 /// let mut cache = dfa1.create_cache();
1921 /// assert_eq!(
1922 /// Some(HalfMatch::must(0, 2)),
1923 /// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
1924 /// );
1925 ///
1926 /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
1927 /// // incorrect results. In order to re-purpose the cache, we must reset
1928 /// // it with the DFA we'd like to use it with.
1929 /// //
1930 /// // Similarly, after this reset, using the cache with 'dfa1' is also not
1931 /// // allowed.
1932 /// cache.reset(&dfa2);
1933 /// assert_eq!(
1934 /// Some(HalfMatch::must(0, 3)),
1935 /// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
1936 /// );
1937 ///
1938 /// # Ok::<(), Box<dyn std::error::Error>>(())
1939 /// ```
1940 pub fn reset(&mut self, dfa: &DFA) {
1941 Lazy::new(dfa, self).reset_cache()
1942 }
1943
1944 /// Initializes a new search starting at the given position.
1945 ///
1946 /// If a previous search was unfinished, then it is finished automatically
1947 /// and a new search is begun.
1948 ///
1949 /// Note that keeping track of search progress is _not necessary_
1950 /// for correct implementations of search using a lazy DFA. Keeping
1951 /// track of search progress is only necessary if you want the
1952 /// [`Config::minimum_bytes_per_state`] configuration knob to work.
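    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore`; it assumes a hand-rolled search
    /// loop with `dfa`, `cache`, `input` and `haystack` in scope, and omits
    /// match/dead/quit handling) of how the progress routines are meant to
    /// be threaded through a search:
    ///
    /// ```ignore
    /// cache.search_start(input.start());
    /// let mut sid = dfa.start_state_forward(&mut cache, &input)?;
    /// for at in input.start()..input.end() {
    ///     sid = dfa.next_state(&mut cache, sid, haystack[at])?;
    ///     // Report how far the search has gotten so that the cache can
    ///     // compute its bytes-searched-per-state efficiency heuristic.
    ///     cache.search_update(at);
    /// }
    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
    /// cache.search_finish(input.end());
    /// ```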
1953 #[inline]
1954 pub fn search_start(&mut self, at: usize) {
1955 // If a previous search wasn't marked as finished, then finish it
1956 // now automatically.
1957 if let Some(p) = self.progress.take() {
1958 self.bytes_searched += p.len();
1959 }
1960 self.progress = Some(SearchProgress { start: at, at });
1961 }
1962
1963    /// Updates the current search to indicate that it has searched to the
1964 /// current position.
1965 ///
1966 /// No special care needs to be taken for reverse searches. Namely, the
1967 /// position given may be _less than_ the starting position of the search.
1968 ///
1969 /// # Panics
1970 ///
1971 /// This panics if no search has been started by [`Cache::search_start`].
1972 #[inline]
1973 pub fn search_update(&mut self, at: usize) {
1974 let p =
1975 self.progress.as_mut().expect("no in-progress search to update");
1976 p.at = at;
1977 }
1978
1979 /// Indicates that a search has finished at the given position.
1980 ///
1981 /// # Panics
1982 ///
1983 /// This panics if no search has been started by [`Cache::search_start`].
1984 #[inline]
1985 pub fn search_finish(&mut self, at: usize) {
1986 let mut p =
1987 self.progress.take().expect("no in-progress search to finish");
1988 p.at = at;
1989 self.bytes_searched += p.len();
1990 }
1991
1992 /// Returns the total number of bytes that have been searched since this
1993 /// cache was last cleared.
1994 ///
1995 /// This is useful for determining the efficiency of the cache. For
1996 /// example, the lazy DFA uses this value in conjunction with the
1997 /// [`Config::minimum_bytes_per_state`] knob to help determine whether it
1998 /// should quit searching.
1999 ///
2000 /// This always returns `0` if search progress isn't being tracked. Note
2001 /// that the lazy DFA search routines in this crate always track search
2002 /// progress.
2003 pub fn search_total_len(&self) -> usize {
2004 self.bytes_searched + self.progress.as_ref().map_or(0, |p| p.len())
2005 }
2006
2007 /// Returns the total number of times this cache has been cleared since it
2008 /// was either created or last reset.
2009 ///
2010 /// This is useful for informational purposes or if you want to change
2011 /// search strategies based on the number of times the cache has been
2012 /// cleared.
2013 pub fn clear_count(&self) -> usize {
2014 self.clear_count
2015 }
2016
2017 /// Returns the heap memory usage, in bytes, of this cache.
2018 ///
2019 /// This does **not** include the stack size used up by this cache. To
2020 /// compute that, use `std::mem::size_of::<Cache>()`.
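    ///
    /// # Example
    ///
    /// A tiny sketch (marked `ignore`) of computing the full footprint of a
    /// cache, i.e., its heap usage plus the size of the `Cache` value
    /// itself:
    ///
    /// ```ignore
    /// let total = cache.memory_usage() + std::mem::size_of::<Cache>();
    /// ```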
2021 pub fn memory_usage(&self) -> usize {
2022 const ID_SIZE: usize = size_of::<LazyStateID>();
2023 const STATE_SIZE: usize = size_of::<State>();
2024
2025 // NOTE: If you make changes to the below, then
2026 // 'minimum_cache_capacity' should be updated correspondingly.
2027
2028 self.trans.len() * ID_SIZE
2029 + self.starts.len() * ID_SIZE
2030 + self.states.len() * STATE_SIZE
2031 // Maps likely use more memory than this, but it's probably close.
2032 + self.states_to_id.len() * (STATE_SIZE + ID_SIZE)
2033 + self.sparses.memory_usage()
2034 + self.stack.capacity() * ID_SIZE
2035 + self.scratch_state_builder.capacity()
2036 // Heap memory used by 'State' in both 'states' and 'states_to_id'.
2037 + self.memory_usage_state
2038 }
2039}
2040
2041/// Keeps track of the progress of the current search.
2042///
2043/// This is updated via the `Cache::search_{start,update,finish}` APIs to
2044/// record how many bytes have been searched. This permits computing a
2045/// heuristic that represents the efficiency of a cache, and thus helps inform
2046/// whether the lazy DFA should give up or not.
2047#[derive(Clone, Debug)]
2048struct SearchProgress {
2049 start: usize,
2050 at: usize,
2051}
2052
2053impl SearchProgress {
2054 /// Returns the length, in bytes, of this search so far.
2055 ///
2056 /// This automatically handles the case of a reverse search, where `at`
2057 /// is likely to be less than `start`.
2058 fn len(&self) -> usize {
2059 if self.start <= self.at {
2060 self.at - self.start
2061 } else {
2062 self.start - self.at
2063 }
2064 }
2065}
2066
2067/// A map from states to state identifiers. When using std, we use a standard
2068/// hashmap, since it's a bit faster for this use case. (Other maps, like
2069/// ones based on FNV, have not yet been benchmarked.)
2070///
2071/// The main purpose of this map is to reuse states where possible. This won't
2072/// fully minimize the DFA, but it works well in a lot of cases.
2073#[cfg(feature = "std")]
2074type StateMap = std::collections::HashMap<State, LazyStateID>;
2075#[cfg(not(feature = "std"))]
2076type StateMap = alloc::collections::BTreeMap<State, LazyStateID>;
2077
2078/// A type that groups methods that require the base NFA/DFA and writable
2079/// access to the cache.
2080#[derive(Debug)]
2081struct Lazy<'i, 'c> {
2082 dfa: &'i DFA,
2083 cache: &'c mut Cache,
2084}
2085
2086impl<'i, 'c> Lazy<'i, 'c> {
2087 /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
2088 fn new(dfa: &'i DFA, cache: &'c mut Cache) -> Lazy<'i, 'c> {
2089 Lazy { dfa, cache }
2090 }
2091
2092 /// Return an immutable view by downgrading a writable cache to a read-only
2093 /// cache.
2094 fn as_ref<'a>(&'a self) -> LazyRef<'i, 'a> {
2095 LazyRef::new(self.dfa, self.cache)
2096 }
2097
2098 /// This is marked as 'inline(never)' to avoid bloating methods on 'DFA'
2099 /// like 'next_state' and 'next_eoi_state' that are called in critical
2100 /// areas. The idea is to let the optimizer focus on the other areas of
2101 /// those methods as the hot path.
2102 ///
2103    /// Here's an example that justifies 'inline(never)':
2104 ///
2105 /// ```ignore
2106 /// regex-cli find match hybrid \
2107 /// --cache-capacity 100000000 \
2108 /// -p '\pL{100}'
2109 /// all-codepoints-utf8-100x
2110 /// ```
2111 ///
2112 /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
2113 /// codepoint, in sequence, repeated 100 times.
2114 ///
2115 /// With 'inline(never)' hyperfine reports 1.1s per run. With
2116 /// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement.
2117 #[cold]
2118 #[inline(never)]
2119 fn cache_next_state(
2120 &mut self,
2121 mut current: LazyStateID,
2122 unit: alphabet::Unit,
2123 ) -> Result<LazyStateID, CacheError> {
2124 let stride2 = self.dfa.stride2();
2125 let empty_builder = self.get_state_builder();
2126 let builder = determinize::next(
2127 self.dfa.get_nfa(),
2128 self.dfa.get_config().get_match_kind(),
2129 &mut self.cache.sparses,
2130 &mut self.cache.stack,
2131 &self.cache.states[current.as_usize_untagged() >> stride2],
2132 unit,
2133 empty_builder,
2134 );
2135 // This is subtle, but if we *might* clear the cache, then we should
2136 // try to save the current state so that we can re-map its ID after
2137 // cache clearing. We *might* clear the cache when either the new
2138 // state can't fit in the cache or when the number of transitions has
2139 // reached the maximum. Even if either of these conditions is true,
2140 // the cache might not be cleared if we can reuse an existing state.
2141 // But we don't know that at this point. Moreover, we don't save the
2142 // current state every time because it is costly.
2143 //
2144 // TODO: We should try to find a way to make this less subtle and error
2145 // prone. ---AG
2146 let save_state = !self.as_ref().state_builder_fits_in_cache(&builder)
2147 || self.cache.trans.len() >= LazyStateID::MAX;
2148 if save_state {
2149 self.save_state(current);
2150 }
2151 let next = self.add_builder_state(builder, |sid| sid)?;
2152 if save_state {
2153 current = self.saved_state_id();
2154 }
2155 // This is the payoff. The next time 'next_state' is called with this
2156 // state and alphabet unit, it will find this transition and avoid
2157 // having to re-determinize this transition.
2158 self.set_transition(current, unit, next);
2159 Ok(next)
2160 }
2161
2162 /// Compute and cache the starting state for the given pattern ID (if
2163 /// present) and the starting configuration.
2164 ///
2165 /// This panics if a pattern ID is given and the DFA isn't configured to
2166 /// build anchored start states for each pattern.
2167 ///
2168 /// This will never return an unknown lazy state ID.
2169 ///
2170 /// If caching this state would otherwise result in a cache that has been
2171 /// cleared too many times, then an error is returned.
2172 #[cold]
2173 #[inline(never)]
2174 fn cache_start_group(
2175 &mut self,
2176 anchored: Anchored,
2177 start: Start,
2178 ) -> Result<LazyStateID, StartError> {
2179 let nfa_start_id = match anchored {
2180 Anchored::No => self.dfa.get_nfa().start_unanchored(),
2181 Anchored::Yes => self.dfa.get_nfa().start_anchored(),
2182 Anchored::Pattern(pid) => {
2183 if !self.dfa.get_config().get_starts_for_each_pattern() {
2184 return Err(StartError::unsupported_anchored(anchored));
2185 }
2186 match self.dfa.get_nfa().start_pattern(pid) {
2187 None => return Ok(self.as_ref().dead_id()),
2188 Some(sid) => sid,
2189 }
2190 }
2191 };
2192
2193 let id = self
2194 .cache_start_one(nfa_start_id, start)
2195 .map_err(StartError::cache)?;
2196 self.set_start_state(anchored, start, id);
2197 Ok(id)
2198 }
2199
2200 /// Compute and cache the starting state for the given NFA state ID and the
2201 /// starting configuration. The NFA state ID might be one of the following:
2202 ///
2203 /// 1) An unanchored start state to match any pattern.
2204 /// 2) An anchored start state to match any pattern.
2205 /// 3) An anchored start state for a particular pattern.
2206 ///
2207 /// This will never return an unknown lazy state ID.
2208 ///
2209 /// If caching this state would otherwise result in a cache that has been
2210 /// cleared too many times, then an error is returned.
2211 fn cache_start_one(
2212 &mut self,
2213 nfa_start_id: NFAStateID,
2214 start: Start,
2215 ) -> Result<LazyStateID, CacheError> {
2216 let mut builder_matches = self.get_state_builder().into_matches();
2217 determinize::set_lookbehind_from_start(
2218 self.dfa.get_nfa(),
2219 &start,
2220 &mut builder_matches,
2221 );
2222 self.cache.sparses.set1.clear();
2223 determinize::epsilon_closure(
2224 self.dfa.get_nfa(),
2225 nfa_start_id,
2226 builder_matches.look_have(),
2227 &mut self.cache.stack,
2228 &mut self.cache.sparses.set1,
2229 );
2230 let mut builder = builder_matches.into_nfa();
2231 determinize::add_nfa_states(
2232 &self.dfa.get_nfa(),
2233 &self.cache.sparses.set1,
2234 &mut builder,
2235 );
2236 let tag_starts = self.dfa.get_config().get_specialize_start_states();
2237 self.add_builder_state(builder, |id| {
2238 if tag_starts {
2239 id.to_start()
2240 } else {
2241 id
2242 }
2243 })
2244 }
2245
2246 /// Either add the given builder state to this cache, or return an ID to an
2247 /// equivalent state already in this cache.
2248 ///
2249 /// In the case where no equivalent state exists, the idmap function given
2250 /// may be used to transform the identifier allocated. This is useful if
2251 /// the caller needs to tag the ID with additional information.
2252 ///
2253 /// This will never return an unknown lazy state ID.
2254 ///
2255 /// If caching this state would otherwise result in a cache that has been
2256 /// cleared too many times, then an error is returned.
2257 fn add_builder_state(
2258 &mut self,
2259 builder: StateBuilderNFA,
2260 idmap: impl Fn(LazyStateID) -> LazyStateID,
2261 ) -> Result<LazyStateID, CacheError> {
2262 if let Some(&cached_id) =
2263 self.cache.states_to_id.get(builder.as_bytes())
2264 {
2265 // Since we have a cached state, put the constructed state's
2266 // memory back into our scratch space, so that it can be reused.
2267 self.put_state_builder(builder);
2268 return Ok(cached_id);
2269 }
2270 let result = self.add_state(builder.to_state(), idmap);
2271 self.put_state_builder(builder);
2272 result
2273 }
2274
2275 /// Allocate a new state ID and add the given state to this cache.
2276 ///
2277 /// The idmap function given may be used to transform the identifier
2278 /// allocated. This is useful if the caller needs to tag the ID with
2279 /// additional information.
2280 ///
2281 /// This will never return an unknown lazy state ID.
2282 ///
2283 /// If caching this state would otherwise result in a cache that has been
2284 /// cleared too many times, then an error is returned.
2285 fn add_state(
2286 &mut self,
2287 state: State,
2288 idmap: impl Fn(LazyStateID) -> LazyStateID,
2289 ) -> Result<LazyStateID, CacheError> {
2290 if !self.as_ref().state_fits_in_cache(&state) {
2291 self.try_clear_cache()?;
2292 }
2293 // It's important for this to come second, since the above may clear
2294 // the cache. If we clear the cache after ID generation, then the ID
2295 // is likely bunk since it would have been generated based on a larger
2296 // transition table.
2297 let mut id = idmap(self.next_state_id()?);
2298 if state.is_match() {
2299 id = id.to_match();
2300 }
2301 // Add room in the transition table. Since this is a fresh state, all
2302 // of its transitions are unknown.
2303 self.cache.trans.extend(
2304 iter::repeat(self.as_ref().unknown_id()).take(self.dfa.stride()),
2305 );
2306 // When we add a sentinel state, we never want to set any quit
2307 // transitions. Technically, this is harmless, since sentinel states
2308 // have all of their transitions set to loop back to themselves. But
2309 // when creating sentinel states before the quit sentinel state,
2310 // this will try to call 'set_transition' on a state ID that doesn't
2311 // actually exist yet, which isn't allowed. So we just skip doing so
2312 // entirely.
2313 if !self.dfa.quitset.is_empty() && !self.as_ref().is_sentinel(id) {
2314 let quit_id = self.as_ref().quit_id();
2315 for b in self.dfa.quitset.iter() {
2316 self.set_transition(id, alphabet::Unit::u8(b), quit_id);
2317 }
2318 }
2319 self.cache.memory_usage_state += state.memory_usage();
2320 self.cache.states.push(state.clone());
2321 self.cache.states_to_id.insert(state, id);
2322 Ok(id)
2323 }
2324
2325 /// Allocate a new state ID.
2326 ///
2327 /// This will never return an unknown lazy state ID.
2328 ///
2329 /// If caching this state would otherwise result in a cache that has been
2330 /// cleared too many times, then an error is returned.
2331 fn next_state_id(&mut self) -> Result<LazyStateID, CacheError> {
2332 let sid = match LazyStateID::new(self.cache.trans.len()) {
2333 Ok(sid) => sid,
2334 Err(_) => {
2335 self.try_clear_cache()?;
2336                // This has to pass since we check at construction time
2337                // that the ID capacity can fit at least MIN_STATES states.
2338 LazyStateID::new(self.cache.trans.len()).unwrap()
2339 }
2340 };
2341 Ok(sid)
2342 }
2343
2344 /// Attempt to clear the cache used by this lazy DFA.
2345 ///
2346 /// If clearing the cache exceeds the minimum number of required cache
2347 /// clearings, then this will return a cache error. In this case,
2348 /// callers should bubble this up as the cache can't be used until it is
2349 /// reset. Implementations of search should convert this error into a
2350 /// [`MatchError::gave_up`].
2351 ///
2352 /// If 'self.state_saver' is set to save a state, then this state is
2353 /// persisted through cache clearing. Otherwise, the cache is returned to
2354 /// its state after initialization with two exceptions: its clear count
2355 /// is incremented and some of its memory likely has additional capacity.
2356 /// That is, clearing a cache does _not_ release memory.
2357 ///
2358    /// In either case, any lazy state ID generated by the cache prior to
2359    /// clearing it is invalid after the clear.
2360 fn try_clear_cache(&mut self) -> Result<(), CacheError> {
2361 let c = self.dfa.get_config();
2362 if let Some(min_count) = c.get_minimum_cache_clear_count() {
2363 if self.cache.clear_count >= min_count {
2364 if let Some(min_bytes_per) = c.get_minimum_bytes_per_state() {
2365 let len = self.cache.search_total_len();
2366 let min_bytes =
2367 min_bytes_per.saturating_mul(self.cache.states.len());
2368 // If we've searched 0 bytes then probably something has
2369 // gone wrong and the lazy DFA search implementation isn't
2370 // correctly updating the search progress state.
2371 if len == 0 {
2372 trace!(
2373 "number of bytes searched is 0, but \
2374 a minimum bytes per state searched ({}) is \
2375 enabled, maybe Cache::search_update \
2376 is not being used?",
2377 min_bytes_per,
2378 );
2379 }
2380 if len < min_bytes {
2381 trace!(
2382 "lazy DFA cache has been cleared {} times, \
2383 which exceeds the limit of {}, \
2384 AND its bytes searched per state is less \
2385 than the configured minimum of {}, \
2386 therefore lazy DFA is giving up \
2387 (bytes searched since cache clear = {}, \
2388 number of states = {})",
2389 self.cache.clear_count,
2390 min_count,
2391 min_bytes_per,
2392 len,
2393 self.cache.states.len(),
2394 );
2395 return Err(CacheError::bad_efficiency());
2396 } else {
2397 trace!(
2398 "lazy DFA cache has been cleared {} times, \
2399 which exceeds the limit of {}, \
2400 AND its bytes searched per state is greater \
2401 than the configured minimum of {}, \
2402 therefore lazy DFA is continuing! \
2403 (bytes searched since cache clear = {}, \
2404 number of states = {})",
2405 self.cache.clear_count,
2406 min_count,
2407 min_bytes_per,
2408 len,
2409 self.cache.states.len(),
2410 );
2411 }
2412 } else {
2413 trace!(
2414 "lazy DFA cache has been cleared {} times, \
2415 which exceeds the limit of {}, \
2416 since there is no configured bytes per state \
2417 minimum, lazy DFA is giving up",
2418 self.cache.clear_count,
2419 min_count,
2420 );
2421 return Err(CacheError::too_many_cache_clears());
2422 }
2423 }
2424 }
2425 self.clear_cache();
2426 Ok(())
2427 }
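
    // A worked example of the efficiency check above (illustrative numbers
    // only): with `minimum_cache_clear_count` set to `Some(3)` and
    // `minimum_bytes_per_state` set to `Some(10)`, suppose the cache holds
    // 5_000 states when it is about to be cleared for the 4th time. The
    // check requires at least 10 * 5_000 = 50_000 bytes to have been
    // searched since the previous clear. If `search_total_len()` reports,
    // say, 20_000, then `CacheError::bad_efficiency()` is returned and the
    // search gives up.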
2428
2429 /// Clears _and_ resets the cache. Resetting the cache means that no
2430 /// states are persisted and the clear count is reset to 0. No heap memory
2431 /// is released.
2432 ///
2433 /// Note that the caller may reset a cache with a different DFA than what
2434 /// it was created from. In which case, the cache can now be used with the
2435 /// new DFA (and not the old DFA).
2436 fn reset_cache(&mut self) {
2437 self.cache.state_saver = StateSaver::none();
2438 self.clear_cache();
2439 // If a new DFA is used, it might have a different number of NFA
2440 // states, so we need to make sure our sparse sets have the appropriate
2441 // size.
2442 self.cache.sparses.resize(self.dfa.get_nfa().states().len());
2443 self.cache.clear_count = 0;
2444 self.cache.progress = None;
2445 }
2446
2447 /// Clear the cache used by this lazy DFA.
2448 ///
2449 /// If 'self.state_saver' is set to save a state, then this state is
2450 /// persisted through cache clearing. Otherwise, the cache is returned to
2451 /// its state after initialization with two exceptions: its clear count
2452 /// is incremented and some of its memory likely has additional capacity.
2453 /// That is, clearing a cache does _not_ release memory.
2454 ///
2455    /// In either case, any lazy state ID generated by the cache prior to
2456    /// clearing it is invalid after the clear.
2457 fn clear_cache(&mut self) {
2458 self.cache.trans.clear();
2459 self.cache.starts.clear();
2460 self.cache.states.clear();
2461 self.cache.states_to_id.clear();
2462 self.cache.memory_usage_state = 0;
2463 self.cache.clear_count += 1;
2464 self.cache.bytes_searched = 0;
2465 if let Some(ref mut progress) = self.cache.progress {
2466 progress.start = progress.at;
2467 }
2468 trace!(
2469 "lazy DFA cache has been cleared (count: {})",
2470 self.cache.clear_count
2471 );
2472 self.init_cache();
2473 // If the state we want to save is one of the sentinel
2474 // (unknown/dead/quit) states, then 'init_cache' adds those back, and
2475        // their identifier values remain invariant. So there's no need to add
2476 // it again. (And indeed, doing so would be incorrect!)
2477 if let Some((old_id, state)) = self.cache.state_saver.take_to_save() {
2478 // If the state is one of the special sentinel states, then it is
2479 // automatically added by cache initialization and its ID always
2480 // remains the same. With that said, this should never occur since
2481 // the sentinel states are all loop states back to themselves. So
2482 // we should never be in a position where we're attempting to save
2483 // a sentinel state since we never compute transitions out of a
2484 // sentinel state.
2485 assert!(
2486 !self.as_ref().is_sentinel(old_id),
2487 "cannot save sentinel state"
2488 );
2489 let new_id = self
2490 .add_state(state, |id| {
2491 if old_id.is_start() {
2492 // We don't need to consult the
2493 // 'specialize_start_states' config knob here, because
2494 // if it's disabled, old_id.is_start() will never
2495 // return true.
2496 id.to_start()
2497 } else {
2498 id
2499 }
2500 })
2501 // The unwrap here is OK because lazy DFA creation ensures that
2502 // we have room in the cache to add MIN_STATES states. Since
2503 // 'init_cache' above adds 3, this adds a 4th.
2504 .expect("adding one state after cache clear must work");
2505 self.cache.state_saver = StateSaver::Saved(new_id);
2506 }
2507 }
2508
2509 /// Initialize this cache from emptiness to a place where it can be used
2510 /// for search.
2511 ///
2512 /// This is called both at cache creation time and after the cache has been
2513 /// cleared.
2514 ///
2515 /// Primarily, this adds the three sentinel states and allocates some
2516 /// initial memory.
2517 fn init_cache(&mut self) {
2518 // Why multiply by 2 here? Because we make room for both the unanchored
2519 // and anchored start states. Unanchored is first and then anchored.
2520 let mut starts_len = Start::len().checked_mul(2).unwrap();
2521 // ... but if we also want start states for every pattern, we make room
2522 // for that too.
2523 if self.dfa.get_config().get_starts_for_each_pattern() {
2524 starts_len += Start::len() * self.dfa.pattern_len();
2525 }
2526 self.cache
2527 .starts
2528 .extend(iter::repeat(self.as_ref().unknown_id()).take(starts_len));
2529 // This is the set of NFA states that corresponds to each of our three
2530 // sentinel states: the empty set.
2531 let dead = State::dead();
2532 // This sets up some states that we use as sentinels that are present
2533 // in every DFA. While it would be technically possible to implement
2534 // this DFA without explicitly putting these states in the transition
2535 // table, this is convenient to do to make `next_state` correct for all
2536 // valid state IDs without needing explicit conditionals to special
2537 // case these sentinel states.
2538 //
2539 // All three of these states are "dead" states. That is, all of
2540 // them transition only to themselves. So once you enter one of
2541 // these states, it's impossible to leave them. Thus, any correct
2542 // search routine must explicitly check for these state types. (Sans
2543 // `unknown`, since that is only used internally to represent missing
2544 // states.)
2545 let unk_id =
2546 self.add_state(dead.clone(), |id| id.to_unknown()).unwrap();
2547 let dead_id = self.add_state(dead.clone(), |id| id.to_dead()).unwrap();
2548 let quit_id = self.add_state(dead.clone(), |id| id.to_quit()).unwrap();
2549 assert_eq!(unk_id, self.as_ref().unknown_id());
2550 assert_eq!(dead_id, self.as_ref().dead_id());
2551 assert_eq!(quit_id, self.as_ref().quit_id());
2552 // The idea here is that if you start in an unknown/dead/quit state and
2553 // try to transition on them, then you should end up where you started.
2554 self.set_all_transitions(unk_id, unk_id);
2555 self.set_all_transitions(dead_id, dead_id);
2556 self.set_all_transitions(quit_id, quit_id);
2557 // All of these states are technically equivalent from the FSM
2558 // perspective, so putting all three of them in the cache isn't
2559 // possible. (They are distinct merely because we use their
2560 // identifiers as sentinels to mean something, as indicated by the
2561 // names.) Moreover, we wouldn't want to do that. Unknown and quit
2562 // states are special in that they are artificial constructions
2563 // this implementation. But dead states are a natural part of
2564        // of this implementation. But dead states are a natural part of
2565 // go anywhere else, a dead state will naturally arise and we MUST
2566 // reuse the canonical dead state that we've created here. Why? Because
2567 // it is the state ID that tells the search routine whether a state is
2568 // dead or not, and thus, whether to stop the search. Having a bunch of
2569 // distinct dead states would be quite wasteful!
2570 self.cache.states_to_id.insert(dead, dead_id);
2571 }
2572
2573 /// Save the state corresponding to the ID given such that the state
2574 /// persists through a cache clearing.
2575 ///
2576 /// While the state may persist, the ID may not. In order to discover the
2577 /// new state ID, one must call 'saved_state_id' after a cache clearing.
2578 fn save_state(&mut self, id: LazyStateID) {
2579 let state = self.as_ref().get_cached_state(id).clone();
2580 self.cache.state_saver = StateSaver::ToSave { id, state };
2581 }
2582
2583 /// Returns the updated lazy state ID for a state that was persisted
2584 /// through a cache clearing.
2585 ///
2586 /// It is only correct to call this routine when both a state has been
2587 /// saved and the cache has just been cleared. Otherwise, this panics.
2588 fn saved_state_id(&mut self) -> LazyStateID {
2589 self.cache
2590 .state_saver
2591 .take_saved()
2592 .expect("state saver does not have saved state ID")
2593 }
2594
2595 /// Set all transitions on the state 'from' to 'to'.
2596 fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) {
2597 for unit in self.dfa.classes.representatives(..) {
2598 self.set_transition(from, unit, to);
2599 }
2600 }
2601
2602 /// Set the transition on 'from' for 'unit' to 'to'.
2603 ///
2604 /// This panics if either 'from' or 'to' is invalid.
2605 ///
2606 /// All unit values are OK.
2607 fn set_transition(
2608 &mut self,
2609 from: LazyStateID,
2610 unit: alphabet::Unit,
2611 to: LazyStateID,
2612 ) {
2613 assert!(self.as_ref().is_valid(from), "invalid 'from' id: {from:?}");
2614 assert!(self.as_ref().is_valid(to), "invalid 'to' id: {to:?}");
2615 let offset =
2616 from.as_usize_untagged() + self.dfa.classes.get_by_unit(unit);
2617 self.cache.trans[offset] = to;
2618 }
2619
2620 /// Set the start ID for the given pattern ID (if given) and starting
2621 /// configuration to the ID given.
2622 ///
2623 /// This panics if 'id' is not valid or if a pattern ID is given and
2624 /// 'starts_for_each_pattern' is not enabled.
2625 fn set_start_state(
2626 &mut self,
2627 anchored: Anchored,
2628 start: Start,
2629 id: LazyStateID,
2630 ) {
2631 assert!(self.as_ref().is_valid(id));
2632 let start_index = start.as_usize();
2633 let index = match anchored {
2634 Anchored::No => start_index,
2635 Anchored::Yes => Start::len() + start_index,
2636 Anchored::Pattern(pid) => {
2637 assert!(
2638 self.dfa.get_config().get_starts_for_each_pattern(),
2639 "attempted to search for a specific pattern \
2640 without enabling starts_for_each_pattern",
2641 );
2642 let pid = pid.as_usize();
2643 (2 * Start::len()) + (Start::len() * pid) + start_index
2644 }
2645 };
2646 self.cache.starts[index] = id;
2647 }
2648
2649 /// Returns a state builder from this DFA that might have existing
2650 /// capacity. This helps avoid allocs in cases where a state is built that
2651 /// turns out to already be cached.
2652 ///
2653 /// Callers must put the state builder back with 'put_state_builder',
2654 /// otherwise the allocation reuse won't work.
2655 fn get_state_builder(&mut self) -> StateBuilderEmpty {
2656 core::mem::replace(
2657 &mut self.cache.scratch_state_builder,
2658 StateBuilderEmpty::new(),
2659 )
2660 }
2661
2662 /// Puts the given state builder back into this DFA for reuse.
2663 ///
2664 /// Note that building a 'State' from a builder always creates a new alloc,
2665 /// so callers should always put the builder back.
2666 fn put_state_builder(&mut self, builder: StateBuilderNFA) {
2667 let _ = core::mem::replace(
2668 &mut self.cache.scratch_state_builder,
2669 builder.clear(),
2670 );
2671 }
2672}
2673
2674/// A type that groups methods that require the base NFA/DFA and read-only
2675/// access to the cache.
2676#[derive(Debug)]
2677struct LazyRef<'i, 'c> {
2678 dfa: &'i DFA,
2679 cache: &'c Cache,
2680}
2681
2682impl<'i, 'c> LazyRef<'i, 'c> {
2683 /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
2684 fn new(dfa: &'i DFA, cache: &'c Cache) -> LazyRef<'i, 'c> {
2685 LazyRef { dfa, cache }
2686 }
2687
2688 /// Return the ID of the start state for the given configuration.
2689 ///
2690 /// If the start state has not yet been computed, then this returns an
2691 /// unknown lazy state ID.
2692 #[cfg_attr(feature = "perf-inline", inline(always))]
2693 fn get_cached_start_id(
2694 &self,
2695 anchored: Anchored,
2696 start: Start,
2697 ) -> Result<LazyStateID, StartError> {
2698 let start_index = start.as_usize();
2699 let index = match anchored {
2700 Anchored::No => start_index,
2701 Anchored::Yes => Start::len() + start_index,
2702 Anchored::Pattern(pid) => {
2703 if !self.dfa.get_config().get_starts_for_each_pattern() {
2704 return Err(StartError::unsupported_anchored(anchored));
2705 }
2706 if pid.as_usize() >= self.dfa.pattern_len() {
2707 return Ok(self.dead_id());
2708 }
2709 (2 * Start::len())
2710 + (Start::len() * pid.as_usize())
2711 + start_index
2712 }
2713 };
2714 Ok(self.cache.starts[index])
2715 }
2716
2717 /// Return the cached NFA/DFA powerset state for the given ID.
2718 ///
2719 /// This panics if the given ID does not address a valid state.
2720 fn get_cached_state(&self, sid: LazyStateID) -> &State {
2721 let index = sid.as_usize_untagged() >> self.dfa.stride2();
2722 &self.cache.states[index]
2723 }
2724
2725 /// Returns true if and only if the given ID corresponds to a "sentinel"
2726 /// state.
2727 ///
2728 /// A sentinel state is a state that signifies a special condition of
2729 /// search, and where every transition maps back to itself. See LazyStateID
2730 /// for more details. Note that start and match states are _not_ sentinels
2731 /// since they may otherwise be real states with non-trivial transitions.
2732    /// The purpose of sentinel states is purely to indicate something. Their
2733 /// transitions are not meant to be followed.
2734 fn is_sentinel(&self, id: LazyStateID) -> bool {
2735 id == self.unknown_id() || id == self.dead_id() || id == self.quit_id()
2736 }
2737
2738 /// Returns the ID of the unknown state for this lazy DFA.
2739 fn unknown_id(&self) -> LazyStateID {
2740 // This unwrap is OK since 0 is always a valid state ID.
2741 LazyStateID::new(0).unwrap().to_unknown()
2742 }
2743
2744 /// Returns the ID of the dead state for this lazy DFA.
2745 fn dead_id(&self) -> LazyStateID {
2746 // This unwrap is OK since the maximum value here is 1 * 512 = 512,
2747 // which is <= 2047 (the maximum state ID on 16-bit systems). Where
2748 // 512 is the worst case for our equivalence classes (every byte is a
2749 // distinct class).
2750 LazyStateID::new(1 << self.dfa.stride2()).unwrap().to_dead()
2751 }
2752
2753 /// Returns the ID of the quit state for this lazy DFA.
2754 fn quit_id(&self) -> LazyStateID {
2755 // This unwrap is OK since the maximum value here is 2 * 512 = 1024,
2756 // which is <= 2047 (the maximum state ID on 16-bit systems). Where
2757 // 512 is the worst case for our equivalence classes (every byte is a
2758 // distinct class).
2759 LazyStateID::new(2 << self.dfa.stride2()).unwrap().to_quit()
2760 }
2761
2762 /// Returns true if and only if the given ID is valid.
2763 ///
2764 /// An ID is valid if it is both a valid index into the transition table
2765 /// and is a multiple of the DFA's stride.
2766 fn is_valid(&self, id: LazyStateID) -> bool {
2767 let id = id.as_usize_untagged();
2768 id < self.cache.trans.len() && id % self.dfa.stride() == 0
2769 }
2770
2771 /// Returns true if adding the state given would fit in this cache.
2772 fn state_fits_in_cache(&self, state: &State) -> bool {
2773 let needed = self.cache.memory_usage()
2774 + self.memory_usage_for_one_more_state(state.memory_usage());
2775 trace!(
2776 "lazy DFA cache capacity state check: {:?} ?<=? {:?}",
2777 needed,
2778 self.dfa.cache_capacity
2779 );
2780 needed <= self.dfa.cache_capacity
2781 }
2782
2783 /// Returns true if adding the state to be built by the given builder would
2784 /// fit in this cache.
2785 fn state_builder_fits_in_cache(&self, state: &StateBuilderNFA) -> bool {
2786 let needed = self.cache.memory_usage()
2787 + self.memory_usage_for_one_more_state(state.as_bytes().len());
2788 trace!(
2789 "lazy DFA cache capacity state builder check: {:?} ?<=? {:?}",
2790 needed,
2791 self.dfa.cache_capacity
2792 );
2793 needed <= self.dfa.cache_capacity
2794 }
2795
2796 /// Returns the additional memory usage, in bytes, required to add one more
2797 /// state to this cache. The given size should be the heap size, in bytes,
2798 /// that would be used by the new state being added.
2799 fn memory_usage_for_one_more_state(
2800 &self,
2801 state_heap_size: usize,
2802 ) -> usize {
2803 const ID_SIZE: usize = size_of::<LazyStateID>();
2804 const STATE_SIZE: usize = size_of::<State>();
2805
2806 self.dfa.stride() * ID_SIZE // additional space needed in trans table
2807 + STATE_SIZE // space in cache.states
2808 + (STATE_SIZE + ID_SIZE) // space in cache.states_to_id
2809 + state_heap_size // heap memory used by state itself
2810 }
2811}
2812
2813/// A simple type that encapsulates the saving of a state ID through a cache
2814/// clearing.
2815///
2816 /// A state ID can be marked for saving with ToSave, while a state ID that
2817 /// has been saved is represented by Saved.
2818#[derive(Clone, Debug)]
2819enum StateSaver {
2820 /// An empty state saver. In this case, no states (other than the special
2821 /// sentinel states) are preserved after clearing the cache.
2822 None,
2823 /// An ID of a state (and the state itself) that should be preserved after
2824 /// the lazy DFA's cache has been cleared. After clearing, the updated ID
2825 /// is stored in 'Saved' since it may have changed.
2826 ToSave { id: LazyStateID, state: State },
2827 /// An ID of a state that has been persisted through a lazy DFA
2828 /// cache clearing. The ID recorded here corresponds to an ID that was
2829 /// once marked as ToSave. The IDs are likely not equivalent even though
2830 /// the states they point to are.
2831 Saved(LazyStateID),
2832}
2833
2834impl StateSaver {
2835 /// Create an empty state saver.
2836 fn none() -> StateSaver {
2837 StateSaver::None
2838 }
2839
2840 /// Replace this state saver with an empty saver, and if this saver is a
2841 /// request to save a state, return that request.
2842 fn take_to_save(&mut self) -> Option<(LazyStateID, State)> {
2843 match core::mem::replace(self, StateSaver::None) {
2844 StateSaver::None | StateSaver::Saved(_) => None,
2845 StateSaver::ToSave { id, state } => Some((id, state)),
2846 }
2847 }
2848
2849 /// Replace this state saver with an empty saver, and if this saver is a
2850 /// saved state (or a request to save a state), return that state's ID.
2851 ///
2852 /// The idea here is that a request to save a state isn't necessarily
2853 /// honored because it might not be needed. For example, some higher level code
2854 /// might request a state to be saved on the off chance that the cache gets
2855 /// cleared when a new state is added at a lower level. But if that new
2856 /// state is never added, then the cache is never cleared and the state and
2857 /// its ID remain unchanged.
2858 fn take_saved(&mut self) -> Option<LazyStateID> {
2859 match core::mem::replace(self, StateSaver::None) {
2860 StateSaver::None => None,
2861 StateSaver::Saved(id) | StateSaver::ToSave { id, .. } => Some(id),
2862 }
2863 }
2864}
2865
2866/// The configuration used for building a lazy DFA.
2867///
2868/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The
2869/// advantage of the former is that it often lets you avoid importing the
2870/// `Config` type directly.
2871///
2872/// A lazy DFA configuration is a simple data object that is typically used
2873/// with [`Builder::configure`].
2874///
2875/// The default configuration guarantees that a search will never return a
2876/// "gave up" or "quit" error, although it is possible for a search to fail
2877/// if [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by
2878/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`].
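///
/// # Example
///
/// A sketch of the failure mode described above: requesting an
/// [`Anchored::Pattern`] search without enabling
/// [`Config::starts_for_each_pattern`] results in an error rather than a
/// match.
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, Anchored, Input, PatternID};
///
/// let dfa = DFA::new(r"[a-z]+")?;
/// let mut cache = dfa.create_cache();
///
/// // 'Anchored::Pattern' requires 'starts_for_each_pattern', which is
/// // disabled by default, so this search reports an error.
/// let input = Input::new("abc")
///     .anchored(Anchored::Pattern(PatternID::must(0)));
/// assert!(dfa.try_search_fwd(&mut cache, &input).is_err());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```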
2879#[derive(Clone, Debug, Default)]
2880pub struct Config {
2881 // As with other configuration types in this crate, we put all our knobs
2882 // in options so that we can distinguish between "default" and "not set."
2883 // This makes it possible to easily combine multiple configurations
2884 // without default values overwriting explicitly specified values. See the
2885 // 'overwrite' method.
2886 //
2887 // For docs on the fields below, see the corresponding method setters.
2888 match_kind: Option<MatchKind>,
2889 pre: Option<Option<Prefilter>>,
2890 starts_for_each_pattern: Option<bool>,
2891 byte_classes: Option<bool>,
2892 unicode_word_boundary: Option<bool>,
2893 quitset: Option<ByteSet>,
2894 specialize_start_states: Option<bool>,
2895 cache_capacity: Option<usize>,
2896 skip_cache_capacity_check: Option<bool>,
2897 minimum_cache_clear_count: Option<Option<usize>>,
2898 minimum_bytes_per_state: Option<Option<usize>>,
2899}
2900
2901impl Config {
2902 /// Return a new default lazy DFA builder configuration.
2903 pub fn new() -> Config {
2904 Config::default()
2905 }
2906
2907 /// Set the desired match semantics.
2908 ///
2909 /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
2910 /// match semantics of Perl-like regex engines. That is, when multiple
2911 /// patterns would match at the same leftmost position, the pattern that
2912 /// appears first in the concrete syntax is chosen.
2913 ///
2914 /// Currently, the only other kind of match semantics supported is
2915 /// [`MatchKind::All`]. This corresponds to classical DFA construction
2916 /// where all possible matches are added to the lazy DFA.
2917 ///
2918 /// Typically, `All` is used when one wants to execute an overlapping
2919 /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
2920 /// sense to use `All` with the various "leftmost" find routines, since the
2921 /// leftmost routines depend on the `LeftmostFirst` automata construction
2922 /// strategy. Specifically, `LeftmostFirst` adds dead states to the
2923 /// lazy DFA as a way to terminate the search and report a match.
2924 /// `LeftmostFirst` also supports non-greedy matches using this strategy
2925 /// whereas `All` does not.
2926 ///
2927 /// # Example: overlapping search
2928 ///
2929 /// This example shows the typical use of `MatchKind::All`, which is to
2930 /// report overlapping matches.
2931 ///
2932 /// ```
2933 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
2934 /// use regex_automata::{
2935 /// hybrid::dfa::{DFA, OverlappingState},
2936 /// HalfMatch, Input, MatchKind,
2937 /// };
2938 ///
2939 /// let dfa = DFA::builder()
2940 /// .configure(DFA::config().match_kind(MatchKind::All))
2941 /// .build_many(&[r"\w+$", r"\S+$"])?;
2942 /// let mut cache = dfa.create_cache();
2943 /// let haystack = "@foo";
2944 /// let mut state = OverlappingState::start();
2945 ///
2946 /// let expected = Some(HalfMatch::must(1, 4));
2947 /// dfa.try_search_overlapping_fwd(
2948 /// &mut cache, &Input::new(haystack), &mut state,
2949 /// )?;
2950 /// assert_eq!(expected, state.get_match());
2951 ///
2952 /// // The first pattern also matches at the same position, so re-running
2953 /// // the search will yield another match. Notice also that the first
2954 /// // pattern is returned after the second. This is because the second
2955 /// // pattern begins its match before the first, is therefore an earlier
2956 /// // match and is thus reported first.
2957 /// let expected = Some(HalfMatch::must(0, 4));
2958 /// dfa.try_search_overlapping_fwd(
2959 /// &mut cache, &Input::new(haystack), &mut state,
2960 /// )?;
2961 /// assert_eq!(expected, state.get_match());
2962 ///
2963 /// # Ok::<(), Box<dyn std::error::Error>>(())
2964 /// ```
2965 ///
2966 /// # Example: reverse automaton to find start of match
2967 ///
2968 /// Another example for using `MatchKind::All` is for constructing a
2969 /// reverse automaton to find the start of a match. `All` semantics are
2970 /// used for this in order to find the longest possible match, which
2971 /// corresponds to the leftmost starting position.
2972 ///
2973 /// Note that if you need the starting position then
2974 /// [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) will handle this
2975 /// for you, so it's usually not necessary to do this yourself.
2976 ///
2977 /// ```
2978 /// use regex_automata::{
2979 /// hybrid::dfa::DFA,
2980 /// nfa::thompson::NFA,
2981 /// Anchored, HalfMatch, Input, MatchKind,
2982 /// };
2983 ///
2984 /// let input = Input::new("123foobar456");
2985 /// let pattern = r"[a-z]+r";
2986 ///
2987 /// let dfa_fwd = DFA::new(pattern)?;
2988 /// let dfa_rev = DFA::builder()
2989 /// .thompson(NFA::config().reverse(true))
2990 /// .configure(DFA::config().match_kind(MatchKind::All))
2991 /// .build(pattern)?;
2992 /// let mut cache_fwd = dfa_fwd.create_cache();
2993 /// let mut cache_rev = dfa_rev.create_cache();
2994 ///
2995 /// let expected_fwd = HalfMatch::must(0, 9);
2996 /// let expected_rev = HalfMatch::must(0, 3);
2997 /// let got_fwd = dfa_fwd.try_search_fwd(&mut cache_fwd, &input)?.unwrap();
2998 /// // Here we don't specify the pattern to search for since there's only
2999 /// // one pattern and we're doing a leftmost search. But if this were an
3000 /// // overlapping search, you'd need to specify the pattern that matched
3001 /// // in the forward direction. (Otherwise, you might wind up finding the
3002 /// // starting position of a match of some other pattern.) That in turn
3003 /// // requires building the reverse automaton with starts_for_each_pattern
3004 /// // enabled.
3005 /// let input = input
3006 /// .clone()
3007 /// .range(..got_fwd.offset())
3008 /// .anchored(Anchored::Yes);
3009 /// let got_rev = dfa_rev.try_search_rev(&mut cache_rev, &input)?.unwrap();
3010 /// assert_eq!(expected_fwd, got_fwd);
3011 /// assert_eq!(expected_rev, got_rev);
3012 ///
3013 /// # Ok::<(), Box<dyn std::error::Error>>(())
3014 /// ```
3015 pub fn match_kind(mut self, kind: MatchKind) -> Config {
3016 self.match_kind = Some(kind);
3017 self
3018 }
3019
3020 /// Set a prefilter to be used whenever a start state is entered.
3021 ///
3022 /// A [`Prefilter`] in this context is meant to accelerate searches by
3023 /// looking for literal prefixes that every match for the corresponding
3024 /// pattern (or patterns) must start with. Once a prefilter produces a
3025 /// match, the underlying search routine continues on to try and confirm
3026 /// the match.
3027 ///
3028 /// Be warned that setting a prefilter does not guarantee that the search
3029 /// will be faster. While it's usually a good bet, if the prefilter
3030 /// produces a lot of false positive candidates (i.e., positions matched
3031 /// by the prefilter but not by the regex), then the overall result can
3032 /// be slower than if you had just executed the regex engine without any
3033 /// prefilters.
3034 ///
3035 /// Note that unless [`Config::specialize_start_states`] has been
3036 /// explicitly set, then setting this will also enable (when `pre` is
3037 /// `Some`) or disable (when `pre` is `None`) start state specialization.
3038 /// This occurs because without start state specialization, a prefilter
3039 /// is likely to be less effective. And without a prefilter, start state
3040 /// specialization is usually pointless.
3041 ///
3042 /// By default no prefilter is set.
3043 ///
3044 /// # Example
3045 ///
3046 /// ```
3047 /// use regex_automata::{
3048 /// hybrid::dfa::DFA,
3049 /// util::prefilter::Prefilter,
3050 /// Input, HalfMatch, MatchKind,
3051 /// };
3052 ///
3053 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
3054 /// let re = DFA::builder()
3055 /// .configure(DFA::config().prefilter(pre))
3056 /// .build(r"(foo|bar)[a-z]+")?;
3057 /// let mut cache = re.create_cache();
3058 /// let input = Input::new("foo1 barfox bar");
3059 /// assert_eq!(
3060 /// Some(HalfMatch::must(0, 11)),
3061 /// re.try_search_fwd(&mut cache, &input)?,
3062 /// );
3063 ///
3064 /// # Ok::<(), Box<dyn std::error::Error>>(())
3065 /// ```
3066 ///
3067 /// Be warned though that an incorrect prefilter can lead to incorrect
3068 /// results!
3069 ///
3070 /// ```
3071 /// use regex_automata::{
3072 /// hybrid::dfa::DFA,
3073 /// util::prefilter::Prefilter,
3074 /// Input, HalfMatch, MatchKind,
3075 /// };
3076 ///
3077 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
3078 /// let re = DFA::builder()
3079 /// .configure(DFA::config().prefilter(pre))
3080 /// .build(r"(foo|bar)[a-z]+")?;
3081 /// let mut cache = re.create_cache();
3082 /// let input = Input::new("foo1 barfox bar");
3083 /// assert_eq!(
3084 /// // No match reported even though there clearly is one!
3085 /// None,
3086 /// re.try_search_fwd(&mut cache, &input)?,
3087 /// );
3088 ///
3089 /// # Ok::<(), Box<dyn std::error::Error>>(())
3090 /// ```
3091 pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
3092 self.pre = Some(pre);
3093 if self.specialize_start_states.is_none() {
3094 self.specialize_start_states =
3095 Some(self.get_prefilter().is_some());
3096 }
3097 self
3098 }
3099
3100 /// Whether to compile a separate start state for each pattern in the
3101 /// lazy DFA.
3102 ///
3103 /// When enabled, a separate **anchored** start state is added for each
3104 /// pattern in the lazy DFA. When this start state is used, then the DFA
3105 /// will only search for matches for the pattern specified, even if there
3106 /// are other patterns in the DFA.
3107 ///
3108 /// The main downside of this option is that it can potentially increase
3109 /// the size of the DFA and/or increase the time it takes to build the
3110 /// DFA at search time. However, since this is configuration for a lazy
3111 /// DFA, these states aren't actually built unless they're used. Enabling
3112 /// this isn't necessarily free, however, as it may result in higher cache
3113 /// usage.
3114 ///
3115 /// There are a few reasons one might want to enable this (it's disabled
3116 /// by default):
3117 ///
3118 /// 1. When looking for the start of an overlapping match (using a reverse
3119 /// DFA), doing it correctly requires starting the reverse search using the
3120 /// starting state of the pattern that matched in the forward direction.
3121 /// Indeed, when building a [`Regex`](crate::hybrid::regex::Regex), it
3122 /// will automatically enable this option when building the reverse DFA
3123 /// internally.
3124 /// 2. When you want to use a DFA with multiple patterns to both search
3125 /// for matches of any pattern or to search for anchored matches of one
3126 /// particular pattern while using the same DFA. (Otherwise, you would need
3127 /// to compile a new DFA for each pattern.)
3128 ///
3129 /// By default this is disabled.
3130 ///
3131 /// # Example
3132 ///
3133 /// This example shows how to use this option to permit the same lazy DFA
3134 /// to run both general searches for any pattern and anchored searches for
3135 /// a specific pattern.
3136 ///
3137 /// ```
3138 /// use regex_automata::{
3139 /// hybrid::dfa::DFA,
3140 /// Anchored, HalfMatch, Input, PatternID,
3141 /// };
3142 ///
3143 /// let dfa = DFA::builder()
3144 /// .configure(DFA::config().starts_for_each_pattern(true))
3145 /// .build_many(&[r"[a-z0-9]{6}", r"[a-z][a-z0-9]{5}"])?;
3146 /// let mut cache = dfa.create_cache();
3147 /// let haystack = "bar foo123";
3148 ///
3149 /// // Here's a normal unanchored search that looks for any pattern.
3150 /// let expected = HalfMatch::must(0, 10);
3151 /// let input = Input::new(haystack);
3152 /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3153 /// // We can also do a normal anchored search for any pattern. Since it's
3154 /// // an anchored search, we position the start of the search where we
3155 /// // know the match will begin.
3156 /// let expected = HalfMatch::must(0, 10);
3157 /// let input = Input::new(haystack).range(4..);
3158 /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3159 /// // Since we compiled anchored start states for each pattern, we can
3160 /// // also look for matches of other patterns explicitly, even if a
3161 /// // different pattern would have normally matched.
3162 /// let expected = HalfMatch::must(1, 10);
3163 /// let input = Input::new(haystack)
3164 /// .range(4..)
3165 /// .anchored(Anchored::Pattern(PatternID::must(1)));
3166 /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3167 ///
3168 /// # Ok::<(), Box<dyn std::error::Error>>(())
3169 /// ```
3170 pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
3171 self.starts_for_each_pattern = Some(yes);
3172 self
3173 }
3174
3175 /// Whether to attempt to shrink the size of the lazy DFA's alphabet or
3176 /// not.
3177 ///
3178 /// This option is enabled by default and should never be disabled unless
3179 /// one is debugging the lazy DFA.
3180 ///
3181 /// When enabled, the lazy DFA will use a map from all possible bytes
3182 /// to their corresponding equivalence class. Each equivalence class
3183 /// represents a set of bytes that does not discriminate between a match
3184 /// and a non-match in the DFA. For example, the pattern `[ab]+` has at
3185 /// least two equivalence classes: a set containing `a` and `b` and a set
3186 /// containing every byte except for `a` and `b`. `a` and `b` are in the
3187 /// same equivalence class because they never discriminate between a
3188 /// match and a non-match.
3189 ///
3190 /// The advantage of this map is that the size of the transition table
3191 /// can be reduced drastically from `#states * 256 * sizeof(LazyStateID)`
3192 /// to `#states * k * sizeof(LazyStateID)` where `k` is the number of
3193 /// equivalence classes (rounded up to the nearest power of 2). As a
3194 /// result, total space usage can decrease substantially. Moreover, since a
3195 /// smaller alphabet is used, DFA compilation during search becomes faster
3196 /// as well since it will potentially be able to reuse a single transition
3197 /// for multiple bytes.
3198 ///
3199 /// **WARNING:** This is only useful for debugging lazy DFAs. Disabling
3200 /// this does not yield any speed advantages. Namely, even when this is
3201 /// disabled, a byte class map is still used while searching. The only
3202 /// difference is that every byte will be forced into its own distinct
3203 /// equivalence class. This is useful for debugging the actual generated
3204 /// transitions because it lets one see the transitions defined on actual
3205 /// bytes instead of the equivalence classes.
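///
/// # Example
///
/// A small sketch showing that disabling byte classes does not change
/// search results; only the internal representation of transitions (and
/// thus cache memory usage) changes.
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::builder()
///     .configure(DFA::config().byte_classes(false))
///     .build(r"[ab]+")?;
/// let mut cache = dfa.create_cache();
///
/// let expected = Some(HalfMatch::must(0, 4));
/// let got = dfa.try_search_fwd(&mut cache, &Input::new("abba"))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```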
3206 pub fn byte_classes(mut self, yes: bool) -> Config {
3207 self.byte_classes = Some(yes);
3208 self
3209 }
3210
3211 /// Heuristically enable Unicode word boundaries.
3212 ///
3213 /// When set, this will attempt to implement Unicode word boundaries as if
3214 /// they were ASCII word boundaries. This only works when the search input
3215 /// is ASCII only. If a non-ASCII byte is observed while searching, then a
3216 /// [`MatchError::quit`] error is returned.
3217 ///
3218 /// A possible alternative to enabling this option is to simply use an
3219 /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
3220 /// option is if you absolutely need Unicode support. This option lets one
3221 /// use a fast search implementation (a DFA) for some potentially very
3222 /// common cases, while providing the option to fall back to some other
3223 /// regex engine to handle the general case when an error is returned.
3224 ///
3225 /// If the pattern provided has no Unicode word boundary in it, then this
3226 /// option has no effect. (That is, quitting on a non-ASCII byte only
3227 /// occurs when this option is enabled _and_ a Unicode word boundary is
3228 /// present in the pattern.)
3229 ///
3230 /// This is almost equivalent to setting all non-ASCII bytes to be quit
3231 /// bytes. The only difference is that this will cause non-ASCII bytes to
3232 /// be quit bytes _only_ when a Unicode word boundary is present in the
3233 /// pattern.
3234 ///
3235 /// When enabling this option, callers _must_ be prepared to
3236 /// handle a [`MatchError`] error during search. When using a
3237 /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
3238 /// `try_` suite of methods. Alternatively, if callers can guarantee that
3239 /// their input is ASCII only, then a [`MatchError::quit`] error will never
3240 /// be returned while searching.
3241 ///
3242 /// This is disabled by default.
3243 ///
3244 /// # Example
3245 ///
3246 /// This example shows how to heuristically enable Unicode word boundaries
3247 /// in a pattern. It also shows what happens when a search comes across a
3248 /// non-ASCII byte.
3249 ///
3250 /// ```
3251 /// use regex_automata::{
3252 /// hybrid::dfa::DFA,
3253 /// HalfMatch, Input, MatchError,
3254 /// };
3255 ///
3256 /// let dfa = DFA::builder()
3257 /// .configure(DFA::config().unicode_word_boundary(true))
3258 /// .build(r"\b[0-9]+\b")?;
3259 /// let mut cache = dfa.create_cache();
3260 ///
3261 /// // The match occurs before the search ever observes the snowman
3262 /// // character, so no error occurs.
3263 /// let haystack = "foo 123 ☃";
3264 /// let expected = Some(HalfMatch::must(0, 7));
3265 /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
3266 /// assert_eq!(expected, got);
3267 ///
3268 /// // Notice that this search fails, even though the snowman character
3269 /// // occurs after the ending match offset. This is because search
3270 /// // routines read one byte past the end of the search to account for
3271 /// // look-around, and indeed, this is required here to determine whether
3272 /// // the trailing \b matches.
3273 /// let haystack = "foo 123 ☃";
3274 /// let expected = MatchError::quit(0xE2, 8);
3275 /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack));
3276 /// assert_eq!(Err(expected), got);
3277 ///
3278 /// // Another example is executing a search where the span of the haystack
3279 /// // we specify is all ASCII, but there is non-ASCII just before it. This
3280 /// // correctly also reports an error.
3281 /// let input = Input::new("β123").range(2..);
3282 /// let expected = MatchError::quit(0xB2, 1);
3283 /// let got = dfa.try_search_fwd(&mut cache, &input);
3284 /// assert_eq!(Err(expected), got);
3285 ///
3286 /// // And similarly for the trailing word boundary.
3287 /// let input = Input::new("123β").range(..3);
3288 /// let expected = MatchError::quit(0xCE, 3);
3289 /// let got = dfa.try_search_fwd(&mut cache, &input);
3290 /// assert_eq!(Err(expected), got);
3291 ///
3292 /// # Ok::<(), Box<dyn std::error::Error>>(())
3293 /// ```
3294 pub fn unicode_word_boundary(mut self, yes: bool) -> Config {
3295 // We have a separate option for this instead of just setting the
3296 // appropriate quit bytes here because we don't want to set quit bytes
3297 // for every regex. We only want to set them when the regex contains a
3298 // Unicode word boundary.
3299 self.unicode_word_boundary = Some(yes);
3300 self
3301 }
3302
3303 /// Add a "quit" byte to the lazy DFA.
3304 ///
3305 /// When a quit byte is seen during search time, then search will return a
3306 /// [`MatchError::quit`] error indicating the offset at which the search
3307 /// stopped.
3308 ///
3309 /// A quit byte will always overrule any other aspects of a regex. For
3310 /// example, if the `x` byte is added as a quit byte and the regex `\w` is
3311 /// used, then observing `x` will cause the search to quit immediately
3312 /// despite the fact that `x` is in the `\w` class.
3313 ///
3314 /// This mechanism is primarily useful for heuristically enabling certain
3315 /// features like Unicode word boundaries in a DFA. Namely, if the input
3316 /// to search is ASCII, then a Unicode word boundary can be implemented
3317 /// via an ASCII word boundary with no change in semantics. Thus, a DFA
3318 /// can attempt to match a Unicode word boundary but give up as soon as it
3319 /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
3320 /// to be quit bytes, then Unicode word boundaries will be permitted when
3321 /// building lazy DFAs. Of course, callers should enable
3322 /// [`Config::unicode_word_boundary`] if they want this behavior instead.
3323 /// (The advantage being that non-ASCII quit bytes will only be added if a
3324 /// Unicode word boundary is in the pattern.)
3325 ///
3326 /// When enabling this option, callers _must_ be prepared to
3327 /// handle a [`MatchError`] error during search. When using a
3328 /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
3329 /// `try_` suite of methods.
3330 ///
3331 /// By default, there are no quit bytes set.
3332 ///
3333 /// # Panics
3334 ///
3335 /// This panics if heuristic Unicode word boundaries are enabled and any
3336 /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
3337 /// Unicode word boundaries requires setting every non-ASCII byte to a quit
3338 /// byte. So if the caller attempts to undo any of that, then this will
3339 /// panic.
3340 ///
3341 /// # Example
3342 ///
3343 /// This example shows how to cause a search to terminate if it sees a
3344 /// `\n` byte. This could be useful if, for example, you wanted to prevent
3345 /// a user supplied pattern from matching across a line boundary.
3346 ///
3347 /// ```
3348 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3349 /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3350 ///
3351 /// let dfa = DFA::builder()
3352 /// .configure(DFA::config().quit(b'\n', true))
3353 /// .build(r"foo\p{any}+bar")?;
3354 /// let mut cache = dfa.create_cache();
3355 ///
3356 /// let haystack = "foo\nbar";
3357 /// // Normally this would produce a match, since \p{any} contains '\n'.
3358 /// // But since we instructed the automaton to enter a quit state if a
3359 /// // '\n' is observed, this produces a match error instead.
3360 /// let expected = MatchError::quit(b'\n', 3);
3361 /// let got = dfa.try_search_fwd(
3362 /// &mut cache,
3363 /// &Input::new(haystack),
3364 /// ).unwrap_err();
3365 /// assert_eq!(expected, got);
3366 ///
3367 /// # Ok::<(), Box<dyn std::error::Error>>(())
3368 /// ```
3369 pub fn quit(mut self, byte: u8, yes: bool) -> Config {
3370 if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
3371 panic!(
3372 "cannot set non-ASCII byte to be non-quit when \
3373 Unicode word boundaries are enabled"
3374 );
3375 }
3376 if self.quitset.is_none() {
3377 self.quitset = Some(ByteSet::empty());
3378 }
3379 if yes {
3380 self.quitset.as_mut().unwrap().add(byte);
3381 } else {
3382 self.quitset.as_mut().unwrap().remove(byte);
3383 }
3384 self
3385 }
3386
3387 /// Enable specializing start states in the lazy DFA.
3388 ///
3389 /// When start states are specialized, an implementor of a search routine
3390 /// using a lazy DFA can tell when the search has entered a starting state.
3391 /// When start states aren't specialized, then it is impossible to know
3392 /// whether the search has entered a start state.
3393 ///
3394 /// Ideally, this option wouldn't need to exist and we could always
3395 /// specialize start states. The problem is that start states can be quite
3396 /// active. This in turn means that an efficient search routine is likely
3397 /// to ping-pong between a heavily optimized hot loop that handles most
3398 /// states and a less optimized, specialized handling of start states.
3399 /// This causes branches to get heavily mispredicted and overall can
3400 /// materially decrease throughput. Therefore, specializing start states
3401 /// should only be enabled when it is needed.
3402 ///
3403 /// Knowing whether a search is in a start state is typically useful when a
3404 /// prefilter is active for the search. A prefilter is typically only run
3405 /// when in a start state and a prefilter can greatly accelerate a search.
3406 /// Therefore, the possible cost of specializing start states is worth it
3407 /// in this case. Otherwise, if you have no prefilter, there is likely no
3408 /// reason to specialize start states.
3409 ///
3410 /// This is disabled by default, but note that it is automatically
3411 /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless
3412 /// `specialize_start_states` has already been set, [`Config::prefilter`]
3413 /// will automatically enable or disable it based on whether a prefilter
3414 /// is present or not, respectively. This is done because a prefilter's
3415 /// effectiveness is rooted in being executed whenever the DFA is in a
3416 /// start state, and that's only possible to do when they are specialized.
3417 ///
3418 /// Note that it is plausibly reasonable to _disable_ this option
3419 /// explicitly while _enabling_ a prefilter. In that case, a prefilter
3420 /// will still be run at the beginning of a search, but never again. This
3421 /// in theory could strike a good balance if you're in a situation where a
3422 /// prefilter is likely to produce many false positive candidates.
3423 ///
3424 /// # Example
3425 ///
3426 /// This example shows how to enable start state specialization and then
3427 /// shows how to check whether a state is a start state or not.
3428 ///
3429 /// ```
3430 /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3431 ///
3432 /// let dfa = DFA::builder()
3433 /// .configure(DFA::config().specialize_start_states(true))
3434 /// .build(r"[a-z]+")?;
3435 /// let mut cache = dfa.create_cache();
3436 ///
3437 /// let haystack = "123 foobar 4567".as_bytes();
3438 /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?;
3439 /// // The ID returned by 'start_state_forward' will always be tagged as
3440 /// // a start state when start state specialization is enabled.
3441 /// assert!(sid.is_tagged());
3442 /// assert!(sid.is_start());
3443 ///
3444 /// # Ok::<(), Box<dyn std::error::Error>>(())
3445 /// ```
3446 ///
3447 /// Compare the above with the default lazy DFA configuration where
3448 /// start states are _not_ specialized. In this case, the start state
3449 /// is not tagged and `sid.is_start()` returns false.
3450 ///
3451 /// ```
3452 /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3453 ///
3454 /// let dfa = DFA::new(r"[a-z]+")?;
3455 /// let mut cache = dfa.create_cache();
3456 ///
3457 /// let haystack = "123 foobar 4567".as_bytes();
3458 /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?;
3459 /// // Start states are not tagged in the default configuration!
3460 /// assert!(!sid.is_tagged());
3461 /// assert!(!sid.is_start());
3462 ///
3463 /// # Ok::<(), Box<dyn std::error::Error>>(())
3464 /// ```
3465 pub fn specialize_start_states(mut self, yes: bool) -> Config {
3466 self.specialize_start_states = Some(yes);
3467 self
3468 }
3469
3470 /// Sets the maximum amount of heap memory, in bytes, to allocate to the
3471 /// cache for use during a lazy DFA search. If the lazy DFA would otherwise
3472 /// use more heap memory, then, depending on other configuration knobs,
3473 /// either stop the search and return an error or clear the cache and
3474 /// continue the search.
3475 ///
3476 /// The default cache capacity is some "reasonable" number that will
3477 /// accommodate most regular expressions. You may find that if you need
3478 /// to build a large DFA then it may be necessary to increase the cache
3479 /// capacity.
3480 ///
3481 /// Note that while building a lazy DFA will do a "minimum" check to ensure
3482 /// the capacity is big enough, this is more or less about correctness.
3483 /// If the cache is bigger than the minimum but still "too small," then the
3484 /// lazy DFA could wind up spending a lot of time clearing the cache and
3485 /// recomputing transitions, thus negating the performance benefits of a
3486 /// lazy DFA. Thus, setting the cache capacity is mostly an experimental
3487 /// endeavor. For most common patterns, however, the default should be
3488 /// sufficient.
3489 ///
3490 /// For more details on how the lazy DFA's cache is used, see the
3491 /// documentation for [`Cache`].
3492 ///
3493 /// # Example
3494 ///
3495 /// This example shows what happens if the configured cache capacity is
3496 /// too small. In such cases, one can override the cache capacity to make
3497 /// it bigger. Alternatively, one might want to use less memory by setting
3498 /// a smaller cache capacity.
3499 ///
3500 /// ```
3501 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3502 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
3503 ///
3504 /// let pattern = r"\p{L}{1000}";
3505 ///
3506 /// // The default cache capacity is likely too small to deal with regexes
3507 /// // that are very large. Large repetitions of large Unicode character
3508 /// // classes are a common way to make very large regexes.
3509 /// let _ = DFA::new(pattern).unwrap_err();
3510 /// // Bump up the capacity to something bigger.
3511 /// let dfa = DFA::builder()
3512 /// .configure(DFA::config().cache_capacity(100 * (1<<20))) // 100 MB
3513 /// .build(pattern)?;
3514 /// let mut cache = dfa.create_cache();
3515 ///
3516 /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
3517 /// let expected = Some(HalfMatch::must(0, 2000));
3518 /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?;
3519 /// assert_eq!(expected, got);
3520 ///
3521 /// # Ok::<(), Box<dyn std::error::Error>>(())
3522 /// ```
3523 pub fn cache_capacity(mut self, bytes: usize) -> Config {
3524 self.cache_capacity = Some(bytes);
3525 self
3526 }
3527
3528 /// Configures construction of a lazy DFA to use the minimum cache capacity
3529 /// if the configured capacity is otherwise too small for the provided NFA.
3530 ///
3531 /// This is useful if you never want lazy DFA construction to fail because
3532 /// of a capacity that is too small.
3533 ///
3534 /// In general, this option is typically not a good idea. In particular,
3535 /// while a minimum cache capacity does permit the lazy DFA to function
3536 /// where it otherwise couldn't, it's plausible that it may not function
3537 /// well if it's constantly running out of room. In that case, the speed
3538 /// advantages of the lazy DFA may be negated. On the other hand, the
3539 /// "minimum" cache capacity computed may not be completely accurate and
3540 /// could actually be bigger than what is really necessary. Therefore, it
3541 /// is plausible that using the minimum cache capacity could still result
3542 /// in very good performance.
3543 ///
3544 /// This is disabled by default.
3545 ///
3546 /// # Example
3547 ///
3548 /// This example shows what happens if the configured cache capacity is
3549 /// too small. In such cases, one could override the capacity explicitly.
3550 /// An alternative, demonstrated here, lets us force construction to use
3551 /// the minimum cache capacity if the configured capacity is otherwise
3552 /// too small.
3553 ///
3554 /// ```
3555 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3556 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
3557 ///
3558 /// let pattern = r"\p{L}{1000}";
3559 ///
3560 /// // The default cache capacity is likely too small to deal with regexes
3561 /// // that are very large. Large repetitions of large Unicode character
3562 /// // classes are a common way to make very large regexes.
3563 /// let _ = DFA::new(pattern).unwrap_err();
3564 /// // Configure construction such that it automatically selects the minimum
3565 /// // cache capacity if it would otherwise be too small.
3566 /// let dfa = DFA::builder()
3567 /// .configure(DFA::config().skip_cache_capacity_check(true))
3568 /// .build(pattern)?;
3569 /// let mut cache = dfa.create_cache();
3570 ///
3571 /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
3572 /// let expected = Some(HalfMatch::must(0, 2000));
3573 /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?;
3574 /// assert_eq!(expected, got);
3575 ///
3576 /// # Ok::<(), Box<dyn std::error::Error>>(())
3577 /// ```
3578 pub fn skip_cache_capacity_check(mut self, yes: bool) -> Config {
3579 self.skip_cache_capacity_check = Some(yes);
3580 self
3581 }
3582
3583 /// Configure a lazy DFA search to quit after a certain number of cache
3584 /// clearings.
3585 ///
3586 /// When a minimum is set, then a lazy DFA search will *possibly* "give
3587 /// up" after the minimum number of cache clearings has occurred. This is
3588 /// typically useful in scenarios where callers want to detect whether the
3589 /// lazy DFA search is "efficient" or not. If the cache is cleared too many
3590 /// times, this is a good indicator that it is not efficient, and thus, the
3591 /// caller may wish to use some other regex engine.
3592 ///
3593 /// Note that the number of times a cache is cleared is a property of
3594 /// the cache itself. Thus, if a cache is used in a subsequent search
3595 /// with a similarly configured lazy DFA, then it could cause the
3596 /// search to "give up" if the cache needed to be cleared, depending
3597 /// on its internal count and configured minimum. The cache clear
3598 /// count can only be reset to `0` via [`DFA::reset_cache`] (or
3599 /// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if
3600 /// you're using the `Regex` API).
3601 ///
3602 /// By default, no minimum is configured. Thus, a lazy DFA search will
3603 /// never give up due to cache clearings. If you do set this option, you
3604 /// might consider also setting [`Config::minimum_bytes_per_state`] in
3605 /// order for the lazy DFA to take efficiency into account before giving
3606 /// up.
3607 ///
3608 /// # Example
3609 ///
3610 /// This example uses a somewhat pathological configuration to demonstrate
3611 /// the _possible_ behavior of cache clearing and how it might result
3612 /// in a search that returns an error.
3613 ///
3614 /// It is important to note that the precise mechanics of how and when
3615 /// a cache gets cleared is an implementation detail.
3616 ///
3617 /// ```
3618 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3619 /// use regex_automata::{hybrid::dfa::DFA, Input, MatchError, MatchErrorKind};
3620 ///
3621 /// // This is a carefully chosen regex. The idea is to pick one
3622 /// // that requires some decent number of states (hence the bounded
3623 /// // repetition). But we specifically choose to create a class with an
3624 /// // ASCII letter and a non-ASCII letter so that we can check that no new
3625 /// // states are created once the cache is full. Namely, if we fill up the
3626 /// // cache on a haystack of 'a's, then in order to match one 'β', a new
3627 /// // state will need to be created since a 'β' is encoded with multiple
3628 /// // bytes. Since there's no room for this state, the search should quit
3629 /// // at the very first position.
3630 /// let pattern = r"[aβ]{100}";
3631 /// let dfa = DFA::builder()
3632 /// .configure(
3633 /// // Configure it so that we have the minimum cache capacity
3634 /// // possible. And that if any clearings occur, the search quits.
3635 /// DFA::config()
3636 /// .skip_cache_capacity_check(true)
3637 /// .cache_capacity(0)
3638 /// .minimum_cache_clear_count(Some(0)),
3639 /// )
3640 /// .build(pattern)?;
3641 /// let mut cache = dfa.create_cache();
3642 ///
3643 /// // Our search will give up before reaching the end!
3644 /// let haystack = "a".repeat(101).into_bytes();
3645 /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3646 /// assert!(matches!(
3647 /// *result.unwrap_err().kind(),
3648 /// MatchErrorKind::GaveUp { .. },
3649 /// ));
3650 ///
3651 /// // Now that we know the cache is full, if we search a haystack that we
3652 /// // know will require creating at least one new state, it should not
3653 /// // be able to make much progress.
3654 /// let haystack = "β".repeat(101).into_bytes();
3655 /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3656 /// assert!(matches!(
3657 /// *result.unwrap_err().kind(),
3658 /// MatchErrorKind::GaveUp { .. },
3659 /// ));
3660 ///
3661 /// // If we reset the cache, then we should be able to create more states
3662 /// // and make more progress with searching for betas.
3663 /// cache.reset(&dfa);
3664 /// let haystack = "β".repeat(101).into_bytes();
3665 /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3666 /// assert!(matches!(
3667 /// *result.unwrap_err().kind(),
3668 /// MatchErrorKind::GaveUp { .. },
3669 /// ));
3670 ///
3671 /// // ... switching back to ASCII still makes progress since it just needs
3672 /// // to set transitions on existing states!
3673 /// let haystack = "a".repeat(101).into_bytes();
3674 /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3675 /// assert!(matches!(
3676 /// *result.unwrap_err().kind(),
3677 /// MatchErrorKind::GaveUp { .. },
3678 /// ));
3679 ///
3680 /// # Ok::<(), Box<dyn std::error::Error>>(())
3681 /// ```
3682 pub fn minimum_cache_clear_count(mut self, min: Option<usize>) -> Config {
3683 self.minimum_cache_clear_count = Some(min);
3684 self
3685 }
3686
3687 /// Configure a lazy DFA search to quit only when its efficiency drops
3688 /// below the given minimum.
3689 ///
3690 /// The efficiency of the cache is determined by the number of DFA states
3691 /// compiled per byte of haystack searched. For example, if the efficiency
3692 /// is 2, then it means the lazy DFA is creating a new DFA state after
3693 /// searching approximately 2 bytes in a haystack. Generally speaking, 2
3694 /// is quite bad and it's likely that even a slower regex engine like the
3695 /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) would be faster.
3696 ///
3697 /// This has no effect if [`Config::minimum_cache_clear_count`] is not set.
3698 /// Namely, this option only kicks in when the cache has been cleared more
3699 /// than the minimum number. If no minimum is set, then the cache is simply
3700 /// cleared whenever it fills up and it is impossible for the lazy DFA to
3701 /// quit due to ineffective use of the cache.
3702 ///
3703 /// In general, if one is setting [`Config::minimum_cache_clear_count`],
3704 /// then one should probably also set this knob as well. The reason is
3705 /// that the absolute number of times the cache is cleared is generally
3706 /// not a great predictor of efficiency. For example, if a new DFA state
3707 /// is created for every 1,000 bytes searched, then it wouldn't be hard
3708 /// for the cache to get cleared more than `N` times and then cause the
3709 /// lazy DFA to quit. But a new DFA state every 1,000 bytes is likely quite
3710 /// good from a performance perspective, and it's likely that the lazy
3711 /// DFA should continue searching, even if it requires clearing the cache
3712 /// occasionally.
3713 ///
3714 /// Finally, note that if you're implementing your own lazy DFA search
3715 /// routine and also want this efficiency check to work correctly, then
3716 /// you'll need to use the following routines to record search progress:
3717 ///
3718 /// * Call [`Cache::search_start`] at the beginning of every search.
3719 /// * Call [`Cache::search_update`] whenever [`DFA::next_state`] is
3720 /// called.
3721 /// * Call [`Cache::search_finish`] before completing a search. (It is
3722 /// not strictly necessary to call this when an error is returned, as
3723 /// `Cache::search_start` will automatically finish the previous search
3724 /// for you. But calling it where possible before returning helps improve
3725 /// the accuracy of how many bytes have actually been searched.)
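///
/// # Example
///
/// A sketch of combining this knob with
/// [`Config::minimum_cache_clear_count`]. Roughly speaking, with the
/// settings below a search only gives up once the cache has been cleared
/// more than 5 times *and* fewer than 10 bytes of haystack have been
/// processed per DFA state created.
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::builder()
///     .configure(
///         DFA::config()
///             .minimum_cache_clear_count(Some(5))
///             .minimum_bytes_per_state(Some(10)),
///     )
///     .build(r"\w+")?;
/// let mut cache = dfa.create_cache();
///
/// // A search that fits comfortably in the cache succeeds as usual.
/// let expected = Some(HalfMatch::must(0, 3));
/// let got = dfa.try_search_fwd(&mut cache, &Input::new("abc"))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```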
3726 pub fn minimum_bytes_per_state(mut self, min: Option<usize>) -> Config {
3727 self.minimum_bytes_per_state = Some(min);
3728 self
3729 }
3730
3731 /// Returns the match semantics set in this configuration.
3732 pub fn get_match_kind(&self) -> MatchKind {
3733 self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
3734 }
3735
3736 /// Returns the prefilter set in this configuration, if one at all.
3737 pub fn get_prefilter(&self) -> Option<&Prefilter> {
3738 self.pre.as_ref().unwrap_or(&None).as_ref()
3739 }
3740
3741 /// Returns whether this configuration has enabled anchored starting states
3742 /// for every pattern in the DFA.
3743 pub fn get_starts_for_each_pattern(&self) -> bool {
3744 self.starts_for_each_pattern.unwrap_or(false)
3745 }
3746
3747 /// Returns whether this configuration has enabled byte classes or not.
3748 /// This is typically a debugging-oriented option, as disabling it confers
3749 /// no speed benefit.
3750 pub fn get_byte_classes(&self) -> bool {
3751 self.byte_classes.unwrap_or(true)
3752 }
3753
3754 /// Returns whether this configuration has enabled heuristic Unicode word
3755 /// boundary support. When enabled, it is possible for a search to return
3756 /// an error.
3757 pub fn get_unicode_word_boundary(&self) -> bool {
3758 self.unicode_word_boundary.unwrap_or(false)
3759 }
3760
3761 /// Returns whether this configuration will instruct the lazy DFA to enter
3762 /// a quit state whenever the given byte is seen during a search. When at
3763 /// least one byte has this enabled, it is possible for a search to return
3764 /// an error.
3765 pub fn get_quit(&self, byte: u8) -> bool {
3766 self.quitset.map_or(false, |q| q.contains(byte))
3767 }
3768
3769 /// Returns whether this configuration will instruct the lazy DFA to
3770 /// "specialize" start states. When enabled, the lazy DFA will tag start
3771 /// states so that search routines using the lazy DFA can detect when
3772 /// it's in a start state and do some kind of optimization (like run a
3773 /// prefilter).
3774 pub fn get_specialize_start_states(&self) -> bool {
3775 self.specialize_start_states.unwrap_or(false)
3776 }
3777
3778 /// Returns the cache capacity set on this configuration.
3779 pub fn get_cache_capacity(&self) -> usize {
3780 self.cache_capacity.unwrap_or(2 * (1 << 20))
3781 }
3782
3783 /// Returns whether the cache capacity check should be skipped.
3784 pub fn get_skip_cache_capacity_check(&self) -> bool {
3785 self.skip_cache_capacity_check.unwrap_or(false)
3786 }
3787
3788 /// Returns, if set, the minimum number of times the cache must be cleared
3789 /// before a lazy DFA search can give up. When no minimum is set, then a
3790 /// search will never quit and will always clear the cache whenever it
3791 /// fills up.
3792 pub fn get_minimum_cache_clear_count(&self) -> Option<usize> {
3793 self.minimum_cache_clear_count.unwrap_or(None)
3794 }
3795
3796 /// Returns, if set, the minimum number of bytes per state that need to be
3797 /// processed in order for the lazy DFA to keep going. If the number of bytes
3798 /// per state falls below this minimum (and the cache has been cleared the
3799 /// minimum number of times), then the lazy DFA will return a "gave up" error.
3800 pub fn get_minimum_bytes_per_state(&self) -> Option<usize> {
3801 self.minimum_bytes_per_state.unwrap_or(None)
3802 }
3803
3804 /// Returns the minimum lazy DFA cache capacity required for the given NFA.
3805 ///
3806 /// The cache capacity required for a particular NFA may change without
3807 /// notice. Callers should not rely on it being stable.
3808 ///
3809 /// This is useful for informational purposes, but can also be useful for
3810 /// other reasons. For example, if one wants to check the minimum cache
3811 /// capacity themselves or if one wants to set the capacity based on the
3812 /// minimum.
3813 ///
3814 /// This may return an error if this configuration does not support all of
3815 /// the instructions used in the given NFA. For example, if the NFA has a
3816 /// Unicode word boundary but this configuration does not enable heuristic
3817 /// support for Unicode word boundaries.
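///
/// # Example
///
/// A sketch of using the minimum capacity as a baseline for choosing the
/// actual capacity, here by simply doubling it.
///
/// ```
/// use regex_automata::{
///     hybrid::dfa::DFA, nfa::thompson::NFA, HalfMatch, Input,
/// };
///
/// let nfa = NFA::new(r"[a-z]{10}")?;
/// let config = DFA::config();
/// // Give the lazy DFA twice the minimum required capacity.
/// let min = config.get_minimum_cache_capacity(&nfa)?;
/// let dfa = DFA::builder()
///     .configure(config.cache_capacity(2 * min))
///     .build_from_nfa(nfa)?;
/// let mut cache = dfa.create_cache();
///
/// let expected = Some(HalfMatch::must(0, 10));
/// let got = dfa.try_search_fwd(&mut cache, &Input::new("abcdefghij"))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```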
3818 pub fn get_minimum_cache_capacity(
3819 &self,
3820 nfa: &thompson::NFA,
3821 ) -> Result<usize, BuildError> {
3822 let quitset = self.quit_set_from_nfa(nfa)?;
3823 let classes = self.byte_classes_from_nfa(nfa, &quitset);
3824 let starts = self.get_starts_for_each_pattern();
3825 Ok(minimum_cache_capacity(nfa, &classes, starts))
3826 }
3827
3828 /// Returns the byte class map used during search from the given NFA.
3829 ///
3830 /// If byte classes are disabled on this configuration, then a map is
3831 /// returned that puts each byte in its own equivalence class.
3832 fn byte_classes_from_nfa(
3833 &self,
3834 nfa: &thompson::NFA,
3835 quit: &ByteSet,
3836 ) -> ByteClasses {
3837 if !self.get_byte_classes() {
3838 // The lazy DFA will always use the equivalence class map, but
3839 // enabling this option is useful for debugging. Namely, this will
3840 // cause all transitions to be defined over their actual bytes
3841 // instead of an opaque equivalence class identifier. The former is
3842 // much easier to grok as a human.
3843 ByteClasses::singletons()
3844 } else {
3845 let mut set = nfa.byte_class_set().clone();
3846 // It is important to distinguish any "quit" bytes from all other
3847 // bytes. Otherwise, a non-quit byte may end up in the same class
3848 // as a quit byte, and thus cause the DFA to stop when it shouldn't.
3849 //
3850 // Test case:
3851 //
3852 // regex-cli find match hybrid --unicode-word-boundary \
3853 // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log
3854 if !quit.is_empty() {
3855 set.add_set(&quit);
3856 }
3857 set.byte_classes()
3858 }
3859 }
3860
3861 /// Return the quit set for this configuration and the given NFA.
3862 ///
3863 /// This may return an error if the NFA is incompatible with this
3864 /// configuration's quit set. For example, if the NFA has a Unicode word
3865 /// boundary and the quit set doesn't include non-ASCII bytes.
3866 fn quit_set_from_nfa(
3867 &self,
3868 nfa: &thompson::NFA,
3869 ) -> Result<ByteSet, BuildError> {
3870 let mut quit = self.quitset.unwrap_or(ByteSet::empty());
3871 if nfa.look_set_any().contains_word_unicode() {
3872 if self.get_unicode_word_boundary() {
3873 for b in 0x80..=0xFF {
3874 quit.add(b);
3875 }
3876 } else {
3877 // If heuristic support for Unicode word boundaries wasn't
3878 // enabled, then we can still check if our quit set is correct.
3879 // If the caller set their quit bytes in a way that causes the
3880 // DFA to quit on at least all non-ASCII bytes, then that's all
3881 // we need for heuristic support to work.
3882 if !quit.contains_range(0x80, 0xFF) {
3883 return Err(
3884 BuildError::unsupported_dfa_word_boundary_unicode(),
3885 );
3886 }
3887 }
3888 }
3889 Ok(quit)
3890 }
3891
3892 /// Overwrite the default configuration such that the options in `o` are
3893 /// always used. If an option in `o` is not set, then the corresponding
3894 /// option in `self` is used. If it's not set in `self` either, then it
3895 /// remains not set.
3896 fn overwrite(&self, o: Config) -> Config {
3897 Config {
3898 match_kind: o.match_kind.or(self.match_kind),
3899 pre: o.pre.or_else(|| self.pre.clone()),
3900 starts_for_each_pattern: o
3901 .starts_for_each_pattern
3902 .or(self.starts_for_each_pattern),
3903 byte_classes: o.byte_classes.or(self.byte_classes),
3904 unicode_word_boundary: o
3905 .unicode_word_boundary
3906 .or(self.unicode_word_boundary),
3907 quitset: o.quitset.or(self.quitset),
3908 specialize_start_states: o
3909 .specialize_start_states
3910 .or(self.specialize_start_states),
3911 cache_capacity: o.cache_capacity.or(self.cache_capacity),
3912 skip_cache_capacity_check: o
3913 .skip_cache_capacity_check
3914 .or(self.skip_cache_capacity_check),
3915 minimum_cache_clear_count: o
3916 .minimum_cache_clear_count
3917 .or(self.minimum_cache_clear_count),
3918 minimum_bytes_per_state: o
3919 .minimum_bytes_per_state
3920 .or(self.minimum_bytes_per_state),
3921 }
3922 }
3923}
3924
3925/// A builder for constructing a lazy deterministic finite automaton from
3926/// regular expressions.
3927///
3928/// As a convenience, [`DFA::builder`] is an alias for [`Builder::new`]. The
3929/// advantage of the former is that it often lets you avoid importing the
3930/// `Builder` type directly.
3931///
3932/// This builder provides two main things:
3933///
3934/// 1. It provides a few different `build` routines for actually constructing
3935/// a DFA from different kinds of inputs. The most convenient is
3936/// [`Builder::build`], which builds a DFA directly from a pattern string. The
3937/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
3938/// from an NFA.
3939/// 2. The builder permits configuring a number of things.
3940/// [`Builder::configure`] is used with [`Config`] to configure aspects of
3941/// the DFA and the construction process itself. [`Builder::syntax`] and
3942/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
3943/// construction, respectively. The syntax and thompson configurations only
3944/// apply when building from a pattern string.
3945///
3946/// This builder always constructs a *single* lazy DFA. As such, this builder
3947/// can only be used to construct regexes that either detect the presence
3948/// of a match or find the end location of a match. A single DFA cannot
3949/// produce both the start and end of a match. For that information, use a
3950/// [`Regex`](crate::hybrid::regex::Regex), which can be similarly configured
3951/// using [`regex::Builder`](crate::hybrid::regex::Builder). The main reason
3952/// to use a DFA directly is if the end location of a match is enough for your
3953/// use case. Namely, a `Regex` will construct two lazy DFAs instead of one,
3954/// since a second reverse DFA is needed to find the start of a match.
3955///
3956/// # Example
3957///
3958/// This example shows how to build a lazy DFA that uses a tiny cache capacity
3959/// and completely disables Unicode. That is:
3960///
3961/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
3962/// and `\b` are ASCII-only while `.` matches any byte except for `\n`
3963/// (instead of any UTF-8 encoding of a Unicode scalar value except for
3964/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
3965/// * The pattern itself is permitted to match invalid UTF-8. For example,
3966/// things like `[^a]` that match any byte except for `a` are permitted.
3967///
3968/// ```
3969/// use regex_automata::{
3970/// hybrid::dfa::DFA,
3971/// nfa::thompson,
3972/// util::syntax,
3973/// HalfMatch, Input,
3974/// };
3975///
3976/// let dfa = DFA::builder()
3977/// .configure(DFA::config().cache_capacity(5_000))
3978/// .thompson(thompson::Config::new().utf8(false))
3979/// .syntax(syntax::Config::new().unicode(false).utf8(false))
3980/// .build(r"foo[^b]ar.*")?;
3981/// let mut cache = dfa.create_cache();
3982///
3983/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
3984/// let expected = Some(HalfMatch::must(0, 10));
3985/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
3986/// assert_eq!(expected, got);
3987///
3988/// # Ok::<(), Box<dyn std::error::Error>>(())
3989/// ```
3990#[derive(Clone, Debug)]
3991pub struct Builder {
3992 config: Config,
3993 #[cfg(feature = "syntax")]
3994 thompson: thompson::Compiler,
3995}
3996
3997impl Builder {
3998 /// Create a new lazy DFA builder with the default configuration.
3999 pub fn new() -> Builder {
4000 Builder {
4001 config: Config::default(),
4002 #[cfg(feature = "syntax")]
4003 thompson: thompson::Compiler::new(),
4004 }
4005 }
4006
4007 /// Build a lazy DFA from the given pattern.
4008 ///
4009 /// If there was a problem parsing or compiling the pattern, then an error
4010 /// is returned.
4011 #[cfg(feature = "syntax")]
4012 pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> {
4013 self.build_many(&[pattern])
4014 }
4015
4016 /// Build a lazy DFA from the given patterns.
4017 ///
4018 /// When matches are returned, the pattern ID corresponds to the index of
4019 /// the pattern in the slice given.
4020 #[cfg(feature = "syntax")]
4021 pub fn build_many<P: AsRef<str>>(
4022 &self,
4023 patterns: &[P],
4024 ) -> Result<DFA, BuildError> {
4025 let nfa = self
4026 .thompson
4027 .clone()
4028 // We can always forcefully disable captures because DFAs do not
4029 // support them.
4030 .configure(
4031 thompson::Config::new()
4032 .which_captures(thompson::WhichCaptures::None),
4033 )
4034 .build_many(patterns)
4035 .map_err(BuildError::nfa)?;
4036 self.build_from_nfa(nfa)
4037 }
4038
4039 /// Build a DFA from the given NFA.
4040 ///
4041 /// Note that this requires owning a `thompson::NFA`. While this may force
4042 /// you to clone the NFA, such a clone is not a deep clone. Namely, NFAs
4043 /// are defined internally to support shared ownership such that cloning is
4044 /// very cheap.
4045 ///
4046 /// # Example
4047 ///
4048 /// This example shows how to build a lazy DFA if you already have an NFA
4049 /// in hand.
4050 ///
4051 /// ```
4052 /// use regex_automata::{
4053 /// hybrid::dfa::DFA,
4054 /// nfa::thompson,
4055 /// HalfMatch, Input,
4056 /// };
4057 ///
4058 /// let haystack = "foo123bar";
4059 ///
4060 /// // This shows how to set non-default options for building an NFA.
4061 /// let nfa = thompson::Compiler::new()
4062 /// .configure(thompson::Config::new().shrink(true))
4063 /// .build(r"[0-9]+")?;
4064 /// let dfa = DFA::builder().build_from_nfa(nfa)?;
4065 /// let mut cache = dfa.create_cache();
4066 /// let expected = Some(HalfMatch::must(0, 6));
4067 /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
4068 /// assert_eq!(expected, got);
4069 ///
4070 /// # Ok::<(), Box<dyn std::error::Error>>(())
4071 /// ```
4072 pub fn build_from_nfa(
4073 &self,
4074 nfa: thompson::NFA,
4075 ) -> Result<DFA, BuildError> {
4076 let quitset = self.config.quit_set_from_nfa(&nfa)?;
4077 let classes = self.config.byte_classes_from_nfa(&nfa, &quitset);
4078 // Check that we can fit at least a few states into our cache,
4079 // otherwise it's pretty senseless to use the lazy DFA. This does have
4080 // a possible failure mode though. This assumes the maximum size of a
4081 // state in powerset space (so, the total number of NFA states), which
4082 // may never actually materialize, and could be quite a bit larger
4083 // than the actual biggest state. If this turns out to be a problem,
4084 // we could expose a knob that disables this check. But if so, we have
4085 // to be careful not to panic in other areas of the code (the cache
4086 // clearing and init code) that tend to assume some minimum useful
4087 // cache capacity.
4088 let min_cache = minimum_cache_capacity(
4089 &nfa,
4090 &classes,
4091 self.config.get_starts_for_each_pattern(),
4092 );
4093 let mut cache_capacity = self.config.get_cache_capacity();
4094 if cache_capacity < min_cache {
4095 // When the caller has asked us to skip the cache capacity check,
4096 // then we simply force the cache capacity to its minimum amount
4097 // and mush on.
4098 if self.config.get_skip_cache_capacity_check() {
4099 debug!(
4100 "given capacity ({cache_capacity}) is too small, \
4101 since skip_cache_capacity_check is enabled, \
4102 setting cache capacity to minimum ({min_cache})",
4103 );
4104 cache_capacity = min_cache;
4105 } else {
4106 return Err(BuildError::insufficient_cache_capacity(
4107 min_cache,
4108 cache_capacity,
4109 ));
4110 }
4111 }
4112 // We also need to check that we can fit at least some small number
4113 // of states in our state ID space. This is unlikely to trigger in
4114 // >=32-bit systems, but 16-bit systems have a pretty small state ID
4115 // space since a number of bits are used up as sentinels.
4116 if let Err(err) = minimum_lazy_state_id(&classes) {
4117 return Err(BuildError::insufficient_state_id_capacity(err));
4118 }
4119 let stride2 = classes.stride2();
4120 let start_map = StartByteMap::new(nfa.look_matcher());
4121 Ok(DFA {
4122 config: self.config.clone(),
4123 nfa,
4124 stride2,
4125 start_map,
4126 classes,
4127 quitset,
4128 cache_capacity,
4129 })
4130 }
4131
4132 /// Apply the given lazy DFA configuration options to this builder.
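    ///
    /// # Example
    ///
    /// A small sketch: options given across multiple `configure` calls are
    /// merged, with later calls taking precedence for any option set in both.
    ///
    /// ```
    /// use regex_automata::hybrid::dfa::DFA;
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().cache_capacity(10 * (1 << 10)))
    ///     .configure(DFA::config().minimum_cache_clear_count(Some(2)))
    ///     .build(r"[a-z]+")?;
    /// assert_eq!(10 * (1 << 10), dfa.get_config().get_cache_capacity());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```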
4133 pub fn configure(&mut self, config: Config) -> &mut Builder {
4134 self.config = self.config.overwrite(config);
4135 self
4136 }
4137
4138 /// Set the syntax configuration for this builder using
4139 /// [`syntax::Config`](crate::util::syntax::Config).
4140 ///
    /// This permits setting things like case insensitivity, Unicode, and
    /// multi-line mode.
4143 ///
4144 /// These settings only apply when constructing a lazy DFA directly from a
4145 /// pattern.
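    ///
    /// # Example
    ///
    /// A brief sketch of enabling case insensitive matching via the syntax
    /// configuration:
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, util::syntax, HalfMatch, Input};
    ///
    /// let dfa = DFA::builder()
    ///     .syntax(syntax::Config::new().case_insensitive(true))
    ///     .build(r"foo")?;
    /// let mut cache = dfa.create_cache();
    ///
    /// let expected = Some(HalfMatch::must(0, 3));
    /// let got = dfa.try_search_fwd(&mut cache, &Input::new("FOO"))?;
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```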
4146 #[cfg(feature = "syntax")]
4147 pub fn syntax(
4148 &mut self,
4149 config: crate::util::syntax::Config,
4150 ) -> &mut Builder {
4151 self.thompson.syntax(config);
4152 self
4153 }
4154
4155 /// Set the Thompson NFA configuration for this builder using
4156 /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
4157 ///
4158 /// This permits setting things like whether the DFA should match the regex
4159 /// in reverse or if additional time should be spent shrinking the size of
4160 /// the NFA.
4161 ///
4162 /// These settings only apply when constructing a DFA directly from a
4163 /// pattern.
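    ///
    /// # Example
    ///
    /// A brief sketch of reverse matching: when the NFA is compiled in
    /// reverse, a reverse search reports the start offset of a match rather
    /// than the end.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch, Input};
    ///
    /// let dfa = DFA::builder()
    ///     .thompson(thompson::Config::new().reverse(true))
    ///     .build(r"[0-9]+")?;
    /// let mut cache = dfa.create_cache();
    ///
    /// // The digits span offsets 3..6, so a reverse search reports 3.
    /// let expected = Some(HalfMatch::must(0, 3));
    /// let got = dfa.try_search_rev(&mut cache, &Input::new("foo123bar"))?;
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```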
4164 #[cfg(feature = "syntax")]
4165 pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
4166 self.thompson.configure(config);
4167 self
4168 }
4169}
4170
4171/// Represents the current state of an overlapping search.
4172///
4173/// This is used for overlapping searches since they need to know something
4174/// about the previous search. For example, when multiple patterns match at the
4175/// same position, this state tracks the last reported pattern so that the next
4176/// search knows whether to report another matching pattern or continue with
4177/// the search at the next position. Additionally, it also tracks which state
4178/// the last search call terminated in.
4179///
/// This type provides few introspection capabilities. The only thing a
4181/// caller can do is construct it and pass it around to permit search routines
4182/// to use it to track state, and also ask whether a match has been found.
4183///
4184/// Callers should always provide a fresh state constructed via
4185/// [`OverlappingState::start`] when starting a new search. Reusing state from
4186/// a previous search may result in incorrect results.
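///
/// # Example
///
/// A rough sketch of how an overlapping search is typically driven to
/// completion with this state:
///
/// ```
/// use regex_automata::{
///     hybrid::dfa::{DFA, OverlappingState},
///     HalfMatch, Input, MatchKind,
/// };
///
/// let dfa = DFA::builder()
///     .configure(DFA::config().match_kind(MatchKind::All))
///     .build_many(&[r"sam", r"samwise"])?;
/// let mut cache = dfa.create_cache();
/// let input = Input::new("samwise");
///
/// let mut state = OverlappingState::start();
/// let mut got = vec![];
/// loop {
///     dfa.try_search_overlapping_fwd(&mut cache, &input, &mut state)?;
///     match state.get_match() {
///         None => break,
///         Some(hm) => got.push(hm),
///     }
/// }
/// assert_eq!(vec![HalfMatch::must(0, 3), HalfMatch::must(1, 7)], got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```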
4187#[derive(Clone, Debug, Eq, PartialEq)]
4188pub struct OverlappingState {
4189 /// The match reported by the most recent overlapping search to use this
4190 /// state.
4191 ///
4192 /// If a search does not find any matches, then it is expected to clear
4193 /// this value.
4194 pub(crate) mat: Option<HalfMatch>,
    /// The state ID of the state that the search was in when the call
    /// terminated. When this is a match state, `mat` must be set to a
    /// non-None value.
4198 ///
4199 /// A `None` value indicates the start state of the corresponding
4200 /// automaton. We cannot use the actual ID, since any one automaton may
4201 /// have many start states, and which one is in use depends on several
4202 /// search-time factors.
4203 pub(crate) id: Option<LazyStateID>,
4204 /// The position of the search.
4205 ///
4206 /// When `id` is None (i.e., we are starting a search), this is set to
4207 /// the beginning of the search as given by the caller regardless of its
4208 /// current value. Subsequent calls to an overlapping search pick up at
4209 /// this offset.
4210 pub(crate) at: usize,
4211 /// The index into the matching patterns of the next match to report if the
4212 /// current state is a match state. Note that this may be 1 greater than
4213 /// the total number of matches to report for the current match state. (In
4214 /// which case, no more matches should be reported at the current position
4215 /// and the search should advance to the next position.)
4216 pub(crate) next_match_index: Option<usize>,
4217 /// This is set to true when a reverse overlapping search has entered its
4218 /// EOI transitions.
4219 ///
4220 /// This isn't used in a forward search because it knows to stop once the
4221 /// position exceeds the end of the search range. In a reverse search,
4222 /// since we use unsigned offsets, we don't "know" once we've gone past
4223 /// `0`. So the only way to detect it is with this extra flag. The reverse
4224 /// overlapping search knows to terminate specifically after it has
4225 /// reported all matches after following the EOI transition.
4226 pub(crate) rev_eoi: bool,
4227}
4228
4229impl OverlappingState {
4230 /// Create a new overlapping state that begins at the start state of any
4231 /// automaton.
4232 pub fn start() -> OverlappingState {
4233 OverlappingState {
4234 mat: None,
4235 id: None,
4236 at: 0,
4237 next_match_index: None,
4238 rev_eoi: false,
4239 }
4240 }
4241
4242 /// Return the match result of the most recent search to execute with this
4243 /// state.
4244 ///
    /// A search will clear this result automatically, such that if no
4246 /// match is found, this will correctly report `None`.
4247 pub fn get_match(&self) -> Option<HalfMatch> {
4248 self.mat
4249 }
4250}
4251
4252/// Runs the given overlapping `search` function (forwards or backwards) until
4253/// a match is found whose offset does not split a codepoint.
4254///
4255/// This is *not* always correct to call. It should only be called when the
4256/// underlying NFA has UTF-8 mode enabled *and* it can produce zero-width
4257/// matches. Calling this when both of those things aren't true might result
4258/// in legitimate matches getting skipped.
4259#[cold]
4260#[inline(never)]
4261fn skip_empty_utf8_splits_overlapping<F>(
4262 input: &Input<'_>,
4263 state: &mut OverlappingState,
4264 mut search: F,
4265) -> Result<(), MatchError>
4266where
4267 F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>,
4268{
4269 // Note that this routine works for forwards and reverse searches
4270 // even though there's no code here to handle those cases. That's
4271 // because overlapping searches drive themselves to completion via
4272 // `OverlappingState`. So all we have to do is push it until no matches are
4273 // found.
4274
4275 let mut hm = match state.get_match() {
4276 None => return Ok(()),
4277 Some(hm) => hm,
4278 };
4279 if input.get_anchored().is_anchored() {
4280 if !input.is_char_boundary(hm.offset()) {
4281 state.mat = None;
4282 }
4283 return Ok(());
4284 }
4285 while !input.is_char_boundary(hm.offset()) {
4286 search(input, state)?;
4287 hm = match state.get_match() {
4288 None => return Ok(()),
4289 Some(hm) => hm,
4290 };
4291 }
4292 Ok(())
4293}
4294
4295/// Based on the minimum number of states required for a useful lazy DFA cache,
4296/// this returns the minimum lazy state ID that must be representable.
4297///
/// It's not likely for this to have any impact on 32-bit systems (or higher),
/// but on 16-bit systems, the lazy state ID space is quite constrained and thus
4300/// may be insufficient if our MIN_STATES value is (for some reason) too high.
4301fn minimum_lazy_state_id(
4302 classes: &ByteClasses,
4303) -> Result<LazyStateID, LazyStateIDError> {
4304 let stride = 1 << classes.stride2();
4305 let min_state_index = MIN_STATES.checked_sub(1).unwrap();
4306 LazyStateID::new(min_state_index * stride)
4307}
4308
4309/// Based on the minimum number of states required for a useful lazy DFA cache,
4310/// this returns a heuristic minimum number of bytes of heap space required.
4311///
4312/// This is a "heuristic" because the minimum it returns is likely bigger than
4313/// the true minimum. Namely, it assumes that each powerset NFA/DFA state uses
4314/// the maximum number of NFA states (all of them). This is likely bigger
4315/// than what is required in practice. Computing the true minimum effectively
4316/// requires determinization, which is probably too much work to do for a
4317/// simple check like this.
4318///
4319/// One of the issues with this approach IMO is that it requires that this
4320/// be in sync with the calculation above for computing how much heap memory
4321/// the DFA cache uses. If we get it wrong, it's possible for example for the
4322/// minimum to be smaller than the computed heap memory, and thus, it may be
4323/// the case that we can't add the required minimum number of states. That in
4324/// turn will make lazy DFA panic because we assume that we can add at least a
4325/// minimum number of states.
4326///
4327/// Another approach would be to always allow the minimum number of states to
4328/// be added to the lazy DFA cache, even if it exceeds the configured cache
4329/// limit. This does mean that the limit isn't really a limit in all cases,
4330/// which is unfortunate. But it does at least guarantee that the lazy DFA can
4331/// always make progress, even if it is slow. (This approach is very similar to
4332/// enabling the 'skip_cache_capacity_check' config knob, except it wouldn't
4333/// rely on cache size calculation. Instead, it would just always permit a
4334/// minimum number of states to be added.)
4335fn minimum_cache_capacity(
4336 nfa: &thompson::NFA,
4337 classes: &ByteClasses,
4338 starts_for_each_pattern: bool,
4339) -> usize {
4340 const ID_SIZE: usize = size_of::<LazyStateID>();
4341 const STATE_SIZE: usize = size_of::<State>();
4342
4343 let stride = 1 << classes.stride2();
4344 let states_len = nfa.states().len();
4345 let sparses = 2 * states_len * NFAStateID::SIZE;
4346 let trans = MIN_STATES * stride * ID_SIZE;
4347
4348 let mut starts = Start::len() * ID_SIZE;
4349 if starts_for_each_pattern {
4350 starts += (Start::len() * nfa.pattern_len()) * ID_SIZE;
4351 }
4352
    // The min number of states HAS to be at least 5: we have 3 sentinel states,
    // we need space for one more when we save a state after clearing the cache,
    // and we need space for yet one more, otherwise we get stuck in a loop
    // where we try to add a 5th state, which gets rejected, which clears the
    // cache, which adds back a saved state (4th total state) which then tries
    // to add the 5th state again.
4359 assert!(MIN_STATES >= 5, "minimum number of states has to be at least 5");
4360 // The minimum number of non-sentinel states. We consider this separately
4361 // because sentinel states are much smaller in that they contain no NFA
4362 // states. Given our aggressive calculation here, it's worth being more
4363 // precise with the number of states we need.
4364 let non_sentinel = MIN_STATES.checked_sub(SENTINEL_STATES).unwrap();
4365
4366 // Every `State` has 5 bytes for flags, 4 bytes (max) for the number of
4367 // patterns, followed by 32-bit encodings of patterns and then delta
4368 // varint encodings of NFA state IDs. We use the worst case (which isn't
4369 // technically possible) of 5 bytes for each NFA state ID.
4370 //
4371 // HOWEVER, three of the states needed by a lazy DFA are just the sentinel
4372 // unknown, dead and quit states. Those states have a known size and it is
4373 // small.
4374 let dead_state_size = State::dead().memory_usage();
4375 let max_state_size = 5 + 4 + (nfa.pattern_len() * 4) + (states_len * 5);
4376 let states = (SENTINEL_STATES * (STATE_SIZE + dead_state_size))
4377 + (non_sentinel * (STATE_SIZE + max_state_size));
4378 // NOTE: We don't double count heap memory used by State for this map since
4379 // we use reference counting to avoid doubling memory usage. (This tends to
4380 // be where most memory is allocated in the cache.)
4381 let states_to_sid = (MIN_STATES * STATE_SIZE) + (MIN_STATES * ID_SIZE);
4382 let stack = states_len * NFAStateID::SIZE;
4383 let scratch_state_builder = max_state_size;
4384
4385 trans
4386 + starts
4387 + states
4388 + states_to_sid
4389 + sparses
4390 + stack
4391 + scratch_state_builder
4392}
4393
4394#[cfg(all(test, feature = "syntax"))]
4395mod tests {
4396 use super::*;
4397
4398 // Tests that we handle heuristic Unicode word boundary support in reverse
4399 // DFAs in the specific case of contextual searches.
4400 //
4401 // I wrote this test when I discovered a bug in how heuristic word
4402 // boundaries were handled. Namely, that the starting state selection
4403 // didn't consider the DFA's quit byte set when looking at the byte
4404 // immediately before the start of the search (or immediately after the
4405 // end of the search in the case of a reverse search). As a result, it was
    // possible for a pattern like '\b[0-9]+\b' to match within 'β123' because
    // the trailing \xB2 byte in the 'β' codepoint would be treated as a
    // non-word character. But of
4408 // course, this search should trigger the DFA to quit, since there is a
4409 // non-ASCII byte in consideration.
4410 //
4411 // Thus, I fixed 'start_state_{forward,reverse}' to check the quit byte set
4412 // if it wasn't empty. The forward case is tested in the doc test for the
4413 // Config::unicode_word_boundary API. We test the reverse case here, which
4414 // is sufficiently niche that it doesn't really belong in a doc test.
4415 #[test]
4416 fn heuristic_unicode_reverse() {
4417 let dfa = DFA::builder()
4418 .configure(DFA::config().unicode_word_boundary(true))
4419 .thompson(thompson::Config::new().reverse(true))
4420 .build(r"\b[0-9]+\b")
4421 .unwrap();
4422 let mut cache = dfa.create_cache();
4423
4424 let input = Input::new("β123").range(2..);
4425 let expected = MatchError::quit(0xB2, 1);
4426 let got = dfa.try_search_rev(&mut cache, &input);
4427 assert_eq!(Err(expected), got);
4428
4429 let input = Input::new("123β").range(..3);
4430 let expected = MatchError::quit(0xCE, 3);
4431 let got = dfa.try_search_rev(&mut cache, &input);
4432 assert_eq!(Err(expected), got);
4433 }
4434}