// regex_automata/nfa/thompson/compiler.rs
1use core::{borrow::Borrow, cell::RefCell};
2
3use alloc::{sync::Arc, vec, vec::Vec};
4
5use regex_syntax::{
6 hir::{self, Hir},
7 utf8::{Utf8Range, Utf8Sequences},
8 ParserBuilder,
9};
10
11use crate::{
12 nfa::thompson::{
13 builder::Builder,
14 error::BuildError,
15 literal_trie::LiteralTrie,
16 map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
17 nfa::{Transition, NFA},
18 range_trie::RangeTrie,
19 },
20 util::{
21 look::{Look, LookMatcher},
22 primitives::{PatternID, StateID},
23 },
24};
25
/// The configuration used for a Thompson NFA compiler.
#[derive(Clone, Debug, Default)]
pub struct Config {
    // Every field is `None` until explicitly set, so that `overwrite` below
    // can distinguish "unset" from "set to the default value". The effective
    // defaults live in the corresponding `get_*` methods.
    //
    // Whether match spans must correspond to valid UTF-8 (default: true).
    utf8: Option<bool>,
    // Whether to compile the NFA in reverse (default: false).
    reverse: Option<bool>,
    // Approximate heap limit, in bytes, for the compiled NFA. The outer
    // `Option` means "unset"; the inner `None` means "no limit" (default).
    nfa_size_limit: Option<Option<usize>>,
    // Whether to spend extra effort shrinking the NFA (default: false).
    shrink: Option<bool>,
    // Which capture states to compile in (default: `WhichCaptures::All`).
    which_captures: Option<WhichCaptures>,
    // How look-around assertions are matched (default: `LookMatcher::default()`).
    look_matcher: Option<LookMatcher>,
    // Test-only: whether to compile the implicit unanchored prefix
    // (default: true).
    #[cfg(test)]
    unanchored_prefix: Option<bool>,
}
38
39impl Config {
40 /// Return a new default Thompson NFA compiler configuration.
41 pub fn new() -> Config {
42 Config::default()
43 }
44
45 /// Whether to enable UTF-8 mode during search or not.
46 ///
47 /// A regex engine is said to be in UTF-8 mode when it guarantees that
48 /// all matches returned by it have spans consisting of only valid UTF-8.
49 /// That is, it is impossible for a match span to be returned that
50 /// contains any invalid UTF-8.
51 ///
52 /// UTF-8 mode generally consists of two things:
53 ///
54 /// 1. Whether the NFA's states are constructed such that all paths to a
55 /// match state that consume at least one byte always correspond to valid
56 /// UTF-8.
57 /// 2. Whether all paths to a match state that do _not_ consume any bytes
58 /// should always correspond to valid UTF-8 boundaries.
59 ///
60 /// (1) is a guarantee made by whoever constructs the NFA.
61 /// If you're parsing a regex from its concrete syntax, then
62 /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make
63 /// this guarantee for you. It does it by returning an error if the regex
64 /// pattern could every report a non-empty match span that contains invalid
65 /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex
66 /// successfully parses, then you're guaranteed that the corresponding NFA
67 /// will only ever report non-empty match spans containing valid UTF-8.
68 ///
69 /// (2) is a trickier guarantee because it cannot be enforced by the NFA
70 /// state graph itself. Consider, for example, the regex `a*`. It matches
71 /// the empty strings in `ā` at positions `0`, `1`, `2` and `3`, where
72 /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint,
73 /// and thus correspond to invalid UTF-8 boundaries. Therefore, this
74 /// guarantee must be made at a higher level than the NFA state graph
75 /// itself. This crate deals with this case in each regex engine. Namely,
76 /// when a zero-width match that splits a codepoint is found and UTF-8
77 /// mode enabled, then it is ignored and the engine moves on looking for
78 /// the next match.
79 ///
80 /// Thus, UTF-8 mode is both a promise that the NFA built only reports
81 /// non-empty matches that are valid UTF-8, and an *instruction* to regex
82 /// engines that empty matches that split codepoints should be banned.
83 ///
84 /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans,
85 /// it only makes sense to enable this option when you *know* your haystack
86 /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and
87 /// searching a haystack that contains invalid UTF-8 leads to **unspecified
88 /// behavior**.
89 ///
90 /// Therefore, it may make sense to enable `syntax::Config::utf8` while
91 /// simultaneously *disabling* this option. That would ensure all non-empty
92 /// match spans are valid UTF-8, but that empty match spans may still split
93 /// a codepoint or match at other places that aren't valid UTF-8.
94 ///
95 /// In general, this mode is only relevant if your regex can match the
96 /// empty string. Most regexes don't.
97 ///
98 /// This is enabled by default.
99 ///
100 /// # Example
101 ///
102 /// This example shows how UTF-8 mode can impact the match spans that may
103 /// be reported in certain cases.
104 ///
105 /// ```
106 /// use regex_automata::{
107 /// nfa::thompson::{self, pikevm::PikeVM},
108 /// Match, Input,
109 /// };
110 ///
111 /// let re = PikeVM::new("")?;
112 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
113 ///
114 /// // UTF-8 mode is enabled by default.
115 /// let mut input = Input::new("ā");
116 /// re.search(&mut cache, &input, &mut caps);
117 /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
118 ///
119 /// // Even though an empty regex matches at 1..1, our next match is
120 /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
121 /// // three bytes long).
122 /// input.set_start(1);
123 /// re.search(&mut cache, &input, &mut caps);
124 /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
125 ///
126 /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
127 /// let re = PikeVM::builder()
128 /// .thompson(thompson::Config::new().utf8(false))
129 /// .build("")?;
130 /// re.search(&mut cache, &input, &mut caps);
131 /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
132 ///
133 /// input.set_start(2);
134 /// re.search(&mut cache, &input, &mut caps);
135 /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
136 ///
137 /// input.set_start(3);
138 /// re.search(&mut cache, &input, &mut caps);
139 /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
140 ///
141 /// input.set_start(4);
142 /// re.search(&mut cache, &input, &mut caps);
143 /// assert_eq!(None, caps.get_match());
144 ///
145 /// # Ok::<(), Box<dyn std::error::Error>>(())
146 /// ```
147 pub fn utf8(mut self, yes: bool) -> Config {
148 self.utf8 = Some(yes);
149 self
150 }
151
152 /// Reverse the NFA.
153 ///
154 /// A NFA reversal is performed by reversing all of the concatenated
155 /// sub-expressions in the original pattern, recursively. (Look around
156 /// operators are also inverted.) The resulting NFA can be used to match
157 /// the pattern starting from the end of a string instead of the beginning
158 /// of a string.
159 ///
160 /// Reversing the NFA is useful for building a reverse DFA, which is most
161 /// useful for finding the start of a match after its ending position has
162 /// been found. NFA execution engines typically do not work on reverse
163 /// NFAs. For example, currently, the Pike VM reports the starting location
164 /// of matches without a reverse NFA.
165 ///
166 /// Currently, enabling this setting requires disabling the
167 /// [`captures`](Config::captures) setting. If both are enabled, then the
168 /// compiler will return an error. It is expected that this limitation will
169 /// be lifted in the future.
170 ///
171 /// This is disabled by default.
172 ///
173 /// # Example
174 ///
175 /// This example shows how to build a DFA from a reverse NFA, and then use
176 /// the DFA to search backwards.
177 ///
178 /// ```
179 /// use regex_automata::{
180 /// dfa::{self, Automaton},
181 /// nfa::thompson::{NFA, WhichCaptures},
182 /// HalfMatch, Input,
183 /// };
184 ///
185 /// let dfa = dfa::dense::Builder::new()
186 /// .thompson(NFA::config()
187 /// .which_captures(WhichCaptures::None)
188 /// .reverse(true)
189 /// )
190 /// .build("baz[0-9]+")?;
191 /// let expected = Some(HalfMatch::must(0, 3));
192 /// assert_eq!(
193 /// expected,
194 /// dfa.try_search_rev(&Input::new("foobaz12345bar"))?,
195 /// );
196 ///
197 /// # Ok::<(), Box<dyn std::error::Error>>(())
198 /// ```
199 pub fn reverse(mut self, yes: bool) -> Config {
200 self.reverse = Some(yes);
201 self
202 }
203
204 /// Sets an approximate size limit on the total heap used by the NFA being
205 /// compiled.
206 ///
207 /// This permits imposing constraints on the size of a compiled NFA. This
208 /// may be useful in contexts where the regex pattern is untrusted and one
209 /// wants to avoid using too much memory.
210 ///
211 /// This size limit does not apply to auxiliary heap used during
212 /// compilation that is not part of the built NFA.
213 ///
214 /// Note that this size limit is applied during compilation in order for
215 /// the limit to prevent too much heap from being used. However, the
216 /// implementation may use an intermediate NFA representation that is
217 /// otherwise slightly bigger than the final public form. Since the size
218 /// limit may be applied to an intermediate representation, there is not
219 /// necessarily a precise correspondence between the configured size limit
220 /// and the heap usage of the final NFA.
221 ///
222 /// There is no size limit by default.
223 ///
224 /// # Example
225 ///
226 /// This example demonstrates how Unicode mode can greatly increase the
227 /// size of the NFA.
228 ///
229 /// ```
230 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
231 /// use regex_automata::nfa::thompson::NFA;
232 ///
233 /// // 300KB isn't enough!
234 /// NFA::compiler()
235 /// .configure(NFA::config().nfa_size_limit(Some(300_000)))
236 /// .build(r"\w{20}")
237 /// .unwrap_err();
238 ///
239 /// // ... but 500KB probably is.
240 /// let nfa = NFA::compiler()
241 /// .configure(NFA::config().nfa_size_limit(Some(500_000)))
242 /// .build(r"\w{20}")?;
243 ///
244 /// assert_eq!(nfa.pattern_len(), 1);
245 ///
246 /// # Ok::<(), Box<dyn std::error::Error>>(())
247 /// ```
248 pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
249 self.nfa_size_limit = Some(bytes);
250 self
251 }
252
253 /// Apply best effort heuristics to shrink the NFA at the expense of more
254 /// time/memory.
255 ///
256 /// Generally speaking, if one is using an NFA to compile a DFA, then the
257 /// extra time used to shrink the NFA will be more than made up for during
258 /// DFA construction (potentially by a lot). In other words, enabling this
259 /// can substantially decrease the overall amount of time it takes to build
260 /// a DFA.
261 ///
262 /// A reason to keep this disabled is if you want to compile an NFA and
263 /// start using it as quickly as possible without needing to build a DFA,
264 /// and you don't mind using a bit of extra memory for the NFA. e.g., for
265 /// an NFA simulation or for a lazy DFA.
266 ///
267 /// NFA shrinking is currently most useful when compiling a reverse
268 /// NFA with large Unicode character classes. In particular, it trades
269 /// additional CPU time during NFA compilation in favor of generating fewer
270 /// NFA states.
271 ///
272 /// This is disabled by default because it can increase compile times
273 /// quite a bit if you aren't building a full DFA.
274 ///
275 /// # Example
276 ///
277 /// This example shows that NFA shrinking can lead to substantial space
278 /// savings in some cases. Notice that, as noted above, we build a reverse
279 /// DFA and use a pattern with a large Unicode character class.
280 ///
281 /// ```
282 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
283 /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
284 ///
285 /// // Currently we have to disable captures when enabling reverse NFA.
286 /// let config = NFA::config()
287 /// .which_captures(WhichCaptures::None)
288 /// .reverse(true);
289 /// let not_shrunk = NFA::compiler()
290 /// .configure(config.clone().shrink(false))
291 /// .build(r"\w")?;
292 /// let shrunk = NFA::compiler()
293 /// .configure(config.clone().shrink(true))
294 /// .build(r"\w")?;
295 ///
296 /// // While a specific shrink factor is not guaranteed, the savings can be
297 /// // considerable in some cases.
298 /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len());
299 ///
300 /// # Ok::<(), Box<dyn std::error::Error>>(())
301 /// ```
302 pub fn shrink(mut self, yes: bool) -> Config {
303 self.shrink = Some(yes);
304 self
305 }
306
307 /// Whether to include 'Capture' states in the NFA.
308 ///
309 /// Currently, enabling this setting requires disabling the
310 /// [`reverse`](Config::reverse) setting. If both are enabled, then the
311 /// compiler will return an error. It is expected that this limitation will
312 /// be lifted in the future.
313 ///
314 /// This is enabled by default.
315 ///
316 /// # Example
317 ///
318 /// This example demonstrates that some regex engines, like the Pike VM,
319 /// require capturing states to be present in the NFA to report match
320 /// offsets.
321 ///
322 /// (Note that since this method is deprecated, the example below uses
323 /// [`Config::which_captures`] to disable capture states.)
324 ///
325 /// ```
326 /// use regex_automata::nfa::thompson::{
327 /// pikevm::PikeVM,
328 /// NFA,
329 /// WhichCaptures,
330 /// };
331 ///
332 /// let re = PikeVM::builder()
333 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
334 /// .build(r"[a-z]+")?;
335 /// let mut cache = re.create_cache();
336 ///
337 /// assert!(re.is_match(&mut cache, "abc"));
338 /// assert_eq!(None, re.find(&mut cache, "abc"));
339 ///
340 /// # Ok::<(), Box<dyn std::error::Error>>(())
341 /// ```
342 #[deprecated(since = "0.3.5", note = "use which_captures instead")]
343 pub fn captures(self, yes: bool) -> Config {
344 self.which_captures(if yes {
345 WhichCaptures::All
346 } else {
347 WhichCaptures::None
348 })
349 }
350
351 /// Configures what kinds of capture groups are compiled into
352 /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
353 /// Thompson NFA.
354 ///
355 /// Currently, using any option except for [`WhichCaptures::None`] requires
356 /// disabling the [`reverse`](Config::reverse) setting. If both are
357 /// enabled, then the compiler will return an error. It is expected that
358 /// this limitation will be lifted in the future.
359 ///
360 /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
361 /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the
362 /// overhead of capture states for explicit groups. Usually this occurs
363 /// when one wants to use the `PikeVM` only for determining the overall
364 /// match. Otherwise, the `PikeVM` could use much more memory than is
365 /// necessary.
366 ///
367 /// # Example
368 ///
369 /// This example demonstrates that some regex engines, like the Pike VM,
370 /// require capturing states to be present in the NFA to report match
371 /// offsets.
372 ///
373 /// ```
374 /// use regex_automata::nfa::thompson::{
375 /// pikevm::PikeVM,
376 /// NFA,
377 /// WhichCaptures,
378 /// };
379 ///
380 /// let re = PikeVM::builder()
381 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
382 /// .build(r"[a-z]+")?;
383 /// let mut cache = re.create_cache();
384 ///
385 /// assert!(re.is_match(&mut cache, "abc"));
386 /// assert_eq!(None, re.find(&mut cache, "abc"));
387 ///
388 /// # Ok::<(), Box<dyn std::error::Error>>(())
389 /// ```
390 ///
391 /// The same applies to the bounded backtracker:
392 ///
393 /// ```
394 /// use regex_automata::nfa::thompson::{
395 /// backtrack::BoundedBacktracker,
396 /// NFA,
397 /// WhichCaptures,
398 /// };
399 ///
400 /// let re = BoundedBacktracker::builder()
401 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
402 /// .build(r"[a-z]+")?;
403 /// let mut cache = re.create_cache();
404 ///
405 /// assert!(re.try_is_match(&mut cache, "abc")?);
406 /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
407 ///
408 /// # Ok::<(), Box<dyn std::error::Error>>(())
409 /// ```
410 pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
411 self.which_captures = Some(which_captures);
412 self
413 }
414
415 /// Sets the look-around matcher that should be used with this NFA.
416 ///
417 /// A look-around matcher determines how to match look-around assertions.
418 /// In particular, some assertions are configurable. For example, the
419 /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
420 /// from the default of `\n` to any other byte.
421 ///
422 /// # Example
423 ///
424 /// This shows how to change the line terminator for multi-line assertions.
425 ///
426 /// ```
427 /// use regex_automata::{
428 /// nfa::thompson::{self, pikevm::PikeVM},
429 /// util::look::LookMatcher,
430 /// Match, Input,
431 /// };
432 ///
433 /// let mut lookm = LookMatcher::new();
434 /// lookm.set_line_terminator(b'\x00');
435 ///
436 /// let re = PikeVM::builder()
437 /// .thompson(thompson::Config::new().look_matcher(lookm))
438 /// .build(r"(?m)^[a-z]+$")?;
439 /// let mut cache = re.create_cache();
440 ///
441 /// // Multi-line assertions now use NUL as a terminator.
442 /// assert_eq!(
443 /// Some(Match::must(0, 1..4)),
444 /// re.find(&mut cache, b"\x00abc\x00"),
445 /// );
446 /// // ... and \n is no longer recognized as a terminator.
447 /// assert_eq!(
448 /// None,
449 /// re.find(&mut cache, b"\nabc\n"),
450 /// );
451 ///
452 /// # Ok::<(), Box<dyn std::error::Error>>(())
453 /// ```
454 pub fn look_matcher(mut self, m: LookMatcher) -> Config {
455 self.look_matcher = Some(m);
456 self
457 }
458
459 /// Whether to compile an unanchored prefix into this NFA.
460 ///
461 /// This is enabled by default. It is made available for tests only to make
462 /// it easier to unit test the output of the compiler.
463 #[cfg(test)]
464 fn unanchored_prefix(mut self, yes: bool) -> Config {
465 self.unanchored_prefix = Some(yes);
466 self
467 }
468
469 /// Returns whether this configuration has enabled UTF-8 mode.
470 pub fn get_utf8(&self) -> bool {
471 self.utf8.unwrap_or(true)
472 }
473
474 /// Returns whether this configuration has enabled reverse NFA compilation.
475 pub fn get_reverse(&self) -> bool {
476 self.reverse.unwrap_or(false)
477 }
478
479 /// Return the configured NFA size limit, if it exists, in the number of
480 /// bytes of heap used.
481 pub fn get_nfa_size_limit(&self) -> Option<usize> {
482 self.nfa_size_limit.unwrap_or(None)
483 }
484
485 /// Return whether NFA shrinking is enabled.
486 pub fn get_shrink(&self) -> bool {
487 self.shrink.unwrap_or(false)
488 }
489
490 /// Return whether NFA compilation is configured to produce capture states.
491 #[deprecated(since = "0.3.5", note = "use get_which_captures instead")]
492 pub fn get_captures(&self) -> bool {
493 self.get_which_captures().is_any()
494 }
495
496 /// Return what kinds of capture states will be compiled into an NFA.
497 pub fn get_which_captures(&self) -> WhichCaptures {
498 self.which_captures.unwrap_or(WhichCaptures::All)
499 }
500
501 /// Return the look-around matcher for this NFA.
502 pub fn get_look_matcher(&self) -> LookMatcher {
503 self.look_matcher.clone().unwrap_or(LookMatcher::default())
504 }
505
506 /// Return whether NFA compilation is configured to include an unanchored
507 /// prefix.
508 ///
509 /// This is always false when not in test mode.
510 fn get_unanchored_prefix(&self) -> bool {
511 #[cfg(test)]
512 {
513 self.unanchored_prefix.unwrap_or(true)
514 }
515 #[cfg(not(test))]
516 {
517 true
518 }
519 }
520
521 /// Overwrite the default configuration such that the options in `o` are
522 /// always used. If an option in `o` is not set, then the corresponding
523 /// option in `self` is used. If it's not set in `self` either, then it
524 /// remains not set.
525 pub(crate) fn overwrite(&self, o: Config) -> Config {
526 Config {
527 utf8: o.utf8.or(self.utf8),
528 reverse: o.reverse.or(self.reverse),
529 nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
530 shrink: o.shrink.or(self.shrink),
531 which_captures: o.which_captures.or(self.which_captures),
532 look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()),
533 #[cfg(test)]
534 unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
535 }
536 }
537}
538
/// A configuration indicating which kinds of
/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include.
///
/// This configuration can be used with [`Config::which_captures`] to control
/// which capture states are compiled into a Thompson NFA.
///
/// The default configuration is [`WhichCaptures::All`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum WhichCaptures {
    /// All capture states, including those corresponding to both implicit and
    /// explicit capture groups, are included in the Thompson NFA.
    All,
    /// Only capture states corresponding to implicit capture groups are
    /// included. Implicit capture groups appear in every pattern implicitly
    /// and correspond to the overall match of a pattern.
    ///
    /// This is useful when one only cares about the overall match of a
    /// pattern. By excluding capture states from explicit capture groups,
    /// one might be able to reduce the memory usage of a multi-pattern regex
    /// substantially if it was otherwise written to have many explicit capture
    /// groups.
    Implicit,
    /// No capture states are compiled into the Thompson NFA.
    ///
    /// This is useful when capture states are either not needed (for example,
    /// if one is only trying to build a DFA) or if they aren't supported (for
    /// example, a reverse NFA).
    ///
    /// # Warning
    ///
    /// Callers must be exceedingly careful when using this
    /// option. In particular, not all regex engines support
    /// reporting match spans when using this option (for example,
    /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) or
    /// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)).
    ///
    /// Perhaps more confusingly, using this option with such an
    /// engine means that an `is_match` routine could report `true`
    /// when `find` reports `None`. This is generally not something
    /// that _should_ happen, but the low level control provided by
    /// this crate makes it possible.
    ///
    /// Similarly, any regex engines (like [`meta::Regex`](crate::meta::Regex))
    /// should always return `None` from `find` routines when this option is
    /// used, even if _some_ of its internal engines could find the match
    /// boundaries. This is because inputs from user data could influence
    /// engine selection, and thus influence whether a match is found or not.
    /// Indeed, `meta::Regex::find` will always return `None` when configured
    /// with this option.
    None,
}

impl Default for WhichCaptures {
    /// The default includes all capture states ([`WhichCaptures::All`]).
    fn default() -> WhichCaptures {
        WhichCaptures::All
    }
}

impl WhichCaptures {
    /// Returns true if this configuration indicates that no capture states
    /// should be produced in an NFA.
    pub fn is_none(&self) -> bool {
        matches!(*self, WhichCaptures::None)
    }

    /// Returns true if this configuration indicates that some capture states
    /// should be added to an NFA. Note that this might only include capture
    /// states for implicit capture groups.
    pub fn is_any(&self) -> bool {
        !self.is_none()
    }
}
611
612/*
613This compiler below uses Thompson's construction algorithm. The compiler takes
614a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph
615is structured in a way that permits it to be executed by a virtual machine and
616also used to efficiently build a DFA.
617
The compiler deals with a slightly expanded set of NFA states compared to what
is in a final NFA (as exhibited by builder::State and nfa::State). Notably a
620compiler state includes an empty node that has exactly one unconditional
621epsilon transition to the next state. In other words, it's a "goto" instruction
622if one views Thompson's NFA as a set of bytecode instructions. These goto
623instructions are removed in a subsequent phase before returning the NFA to the
624caller. The purpose of these empty nodes is that they make the construction
625algorithm substantially simpler to implement. We remove them before returning
626to the caller because they can represent substantial overhead when traversing
627the NFA graph (either while searching using the NFA directly or while building
628a DFA).
629
630In the future, it would be nice to provide a Glushkov compiler as well, as it
631would work well as a bit-parallel NFA for smaller regexes. But the Thompson
632construction is one I'm more familiar with and seems more straight-forward to
633deal with when it comes to large Unicode character classes.
634
635Internally, the compiler uses interior mutability to improve composition in the
636face of the borrow checker. In particular, we'd really like to be able to write
637things like this:
638
639 self.c_concat(exprs.iter().map(|e| self.c(e)))
640
641Which elegantly uses iterators to build up a sequence of compiled regex
642sub-expressions and then hands it off to the concatenating compiler routine.
643Without interior mutability, the borrow checker won't let us borrow `self`
644mutably both inside and outside the closure at the same time.
645*/
646
647/// A builder for compiling an NFA from a regex's high-level intermediate
648/// representation (HIR).
649///
650/// This compiler provides a way to translate a parsed regex pattern into an
651/// NFA state graph. The NFA state graph can either be used directly to execute
652/// a search (e.g., with a Pike VM), or it can be further used to build a DFA.
653///
654/// This compiler provides APIs both for compiling regex patterns directly from
655/// their concrete syntax, or via a [`regex_syntax::hir::Hir`].
656///
657/// This compiler has various options that may be configured via
658/// [`thompson::Config`](Config).
659///
660/// Note that a compiler is not the same as a [`thompson::Builder`](Builder).
661/// A `Builder` provides a lower level API that is uncoupled from a regex
662/// pattern's concrete syntax or even its HIR. Instead, it permits stitching
663/// together an NFA by hand. See its docs for examples.
664///
665/// # Example: compilation from concrete syntax
666///
667/// This shows how to compile an NFA from a pattern string while setting a size
668/// limit on how big the NFA is allowed to be (in terms of bytes of heap used).
669///
670/// ```
671/// use regex_automata::{
672/// nfa::thompson::{NFA, pikevm::PikeVM},
673/// Match,
674/// };
675///
676/// let config = NFA::config().nfa_size_limit(Some(1_000));
677/// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
678///
679/// let re = PikeVM::new_from_nfa(nfa)?;
680/// let mut cache = re.create_cache();
681/// let mut caps = re.create_captures();
682/// let expected = Some(Match::must(0, 3..4));
683/// re.captures(&mut cache, "!@#A#@!", &mut caps);
684/// assert_eq!(expected, caps.get_match());
685///
686/// # Ok::<(), Box<dyn std::error::Error>>(())
687/// ```
688///
689/// # Example: compilation from HIR
690///
691/// This shows how to hand assemble a regular expression via its HIR, and then
692/// compile an NFA directly from it.
693///
694/// ```
695/// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
696/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
697///
698/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
699/// ClassBytesRange::new(b'0', b'9'),
700/// ClassBytesRange::new(b'A', b'Z'),
701/// ClassBytesRange::new(b'_', b'_'),
702/// ClassBytesRange::new(b'a', b'z'),
703/// ])));
704///
705/// let config = NFA::config().nfa_size_limit(Some(1_000));
706/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
707///
708/// let re = PikeVM::new_from_nfa(nfa)?;
709/// let mut cache = re.create_cache();
710/// let mut caps = re.create_captures();
711/// let expected = Some(Match::must(0, 3..4));
712/// re.captures(&mut cache, "!@#A#@!", &mut caps);
713/// assert_eq!(expected, caps.get_match());
714///
715/// # Ok::<(), Box<dyn std::error::Error>>(())
716/// ```
#[derive(Clone, Debug)]
pub struct Compiler {
    /// A regex parser, used when compiling an NFA directly from a pattern
    /// string.
    parser: ParserBuilder,
    /// The compiler configuration.
    config: Config,
    /// The builder for actually constructing an NFA. This provides a
    /// convenient abstraction for writing a compiler.
    ///
    /// This (and the scratch fields below) are wrapped in `RefCell` so that
    /// compilation routines can take `&self` and still mutate shared state;
    /// see the module comments above on interior mutability.
    builder: RefCell<Builder>,
    /// State used for compiling character classes to UTF-8 byte automata.
    /// State is not retained between character class compilations. This just
    /// serves to amortize allocation to the extent possible.
    utf8_state: RefCell<Utf8State>,
    /// State used for arranging character classes in reverse into a trie.
    trie_state: RefCell<RangeTrie>,
    /// State used for caching common suffixes when compiling reverse UTF-8
    /// automata (for Unicode character classes).
    utf8_suffix: RefCell<Utf8SuffixMap>,
}
737
738impl Compiler {
739 /// Create a new NFA builder with its default configuration.
740 pub fn new() -> Compiler {
741 Compiler {
742 parser: ParserBuilder::new(),
743 config: Config::default(),
744 builder: RefCell::new(Builder::new()),
745 utf8_state: RefCell::new(Utf8State::new()),
746 trie_state: RefCell::new(RangeTrie::new()),
747 utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
748 }
749 }
750
751 /// Compile the given regular expression pattern into an NFA.
752 ///
753 /// If there was a problem parsing the regex, then that error is returned.
754 ///
755 /// Otherwise, if there was a problem building the NFA, then an error is
756 /// returned. The only error that can occur is if the compiled regex would
757 /// exceed the size limits configured on this builder, or if any part of
758 /// the NFA would exceed the integer representations used. (For example,
759 /// too many states might plausibly occur on a 16-bit target.)
760 ///
761 /// # Example
762 ///
763 /// ```
764 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
765 ///
766 /// let config = NFA::config().nfa_size_limit(Some(1_000));
767 /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
768 ///
769 /// let re = PikeVM::new_from_nfa(nfa)?;
770 /// let mut cache = re.create_cache();
771 /// let mut caps = re.create_captures();
772 /// let expected = Some(Match::must(0, 3..4));
773 /// re.captures(&mut cache, "!@#A#@!", &mut caps);
774 /// assert_eq!(expected, caps.get_match());
775 ///
776 /// # Ok::<(), Box<dyn std::error::Error>>(())
777 /// ```
778 pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> {
779 self.build_many(&[pattern])
780 }
781
782 /// Compile the given regular expression patterns into a single NFA.
783 ///
784 /// When matches are returned, the pattern ID corresponds to the index of
785 /// the pattern in the slice given.
786 ///
787 /// # Example
788 ///
789 /// ```
790 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
791 ///
792 /// let config = NFA::config().nfa_size_limit(Some(1_000));
793 /// let nfa = NFA::compiler().configure(config).build_many(&[
794 /// r"(?-u)\s",
795 /// r"(?-u)\w",
796 /// ])?;
797 ///
798 /// let re = PikeVM::new_from_nfa(nfa)?;
799 /// let mut cache = re.create_cache();
800 /// let mut caps = re.create_captures();
801 /// let expected = Some(Match::must(1, 1..2));
802 /// re.captures(&mut cache, "!A! !A!", &mut caps);
803 /// assert_eq!(expected, caps.get_match());
804 ///
805 /// # Ok::<(), Box<dyn std::error::Error>>(())
806 /// ```
807 pub fn build_many<P: AsRef<str>>(
808 &self,
809 patterns: &[P],
810 ) -> Result<NFA, BuildError> {
811 let mut hirs = vec![];
812 for p in patterns {
813 hirs.push(
814 self.parser
815 .build()
816 .parse(p.as_ref())
817 .map_err(BuildError::syntax)?,
818 );
819 debug!("parsed: {:?}", p.as_ref());
820 }
821 self.build_many_from_hir(&hirs)
822 }
823
824 /// Compile the given high level intermediate representation of a regular
825 /// expression into an NFA.
826 ///
827 /// If there was a problem building the NFA, then an error is returned. The
828 /// only error that can occur is if the compiled regex would exceed the
829 /// size limits configured on this builder, or if any part of the NFA would
830 /// exceed the integer representations used. (For example, too many states
831 /// might plausibly occur on a 16-bit target.)
832 ///
833 /// # Example
834 ///
835 /// ```
836 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
837 /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
838 ///
839 /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
840 /// ClassBytesRange::new(b'0', b'9'),
841 /// ClassBytesRange::new(b'A', b'Z'),
842 /// ClassBytesRange::new(b'_', b'_'),
843 /// ClassBytesRange::new(b'a', b'z'),
844 /// ])));
845 ///
846 /// let config = NFA::config().nfa_size_limit(Some(1_000));
847 /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
848 ///
849 /// let re = PikeVM::new_from_nfa(nfa)?;
850 /// let mut cache = re.create_cache();
851 /// let mut caps = re.create_captures();
852 /// let expected = Some(Match::must(0, 3..4));
853 /// re.captures(&mut cache, "!@#A#@!", &mut caps);
854 /// assert_eq!(expected, caps.get_match());
855 ///
856 /// # Ok::<(), Box<dyn std::error::Error>>(())
857 /// ```
858 pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> {
859 self.build_many_from_hir(&[expr])
860 }
861
862 /// Compile the given high level intermediate representations of regular
863 /// expressions into a single NFA.
864 ///
865 /// When matches are returned, the pattern ID corresponds to the index of
866 /// the pattern in the slice given.
867 ///
868 /// # Example
869 ///
870 /// ```
871 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
872 /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
873 ///
874 /// let hirs = &[
875 /// Hir::class(Class::Bytes(ClassBytes::new(vec![
876 /// ClassBytesRange::new(b'\t', b'\r'),
877 /// ClassBytesRange::new(b' ', b' '),
878 /// ]))),
879 /// Hir::class(Class::Bytes(ClassBytes::new(vec![
880 /// ClassBytesRange::new(b'0', b'9'),
881 /// ClassBytesRange::new(b'A', b'Z'),
882 /// ClassBytesRange::new(b'_', b'_'),
883 /// ClassBytesRange::new(b'a', b'z'),
884 /// ]))),
885 /// ];
886 ///
887 /// let config = NFA::config().nfa_size_limit(Some(1_000));
888 /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?;
889 ///
890 /// let re = PikeVM::new_from_nfa(nfa)?;
891 /// let mut cache = re.create_cache();
892 /// let mut caps = re.create_captures();
893 /// let expected = Some(Match::must(1, 1..2));
894 /// re.captures(&mut cache, "!A! !A!", &mut caps);
895 /// assert_eq!(expected, caps.get_match());
896 ///
897 /// # Ok::<(), Box<dyn std::error::Error>>(())
898 /// ```
899 pub fn build_many_from_hir<H: Borrow<Hir>>(
900 &self,
901 exprs: &[H],
902 ) -> Result<NFA, BuildError> {
903 self.compile(exprs)
904 }
905
906 /// Apply the given NFA configuration options to this builder.
907 ///
908 /// # Example
909 ///
910 /// ```
911 /// use regex_automata::nfa::thompson::NFA;
912 ///
913 /// let config = NFA::config().nfa_size_limit(Some(1_000));
914 /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
915 /// assert_eq!(nfa.pattern_len(), 1);
916 ///
917 /// # Ok::<(), Box<dyn std::error::Error>>(())
918 /// ```
919 pub fn configure(&mut self, config: Config) -> &mut Compiler {
920 self.config = self.config.overwrite(config);
921 self
922 }
923
924 /// Set the syntax configuration for this builder using
925 /// [`syntax::Config`](crate::util::syntax::Config).
926 ///
927 /// This permits setting things like case insensitivity, Unicode and multi
928 /// line mode.
929 ///
930 /// This syntax configuration only applies when an NFA is built directly
931 /// from a pattern string. If an NFA is built from an HIR, then all syntax
932 /// settings are ignored.
933 ///
934 /// # Example
935 ///
936 /// ```
937 /// use regex_automata::{nfa::thompson::NFA, util::syntax};
938 ///
939 /// let syntax_config = syntax::Config::new().unicode(false);
940 /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?;
941 /// // If Unicode were enabled, the number of states would be much bigger.
942 /// assert!(nfa.states().len() < 15);
943 ///
944 /// # Ok::<(), Box<dyn std::error::Error>>(())
945 /// ```
946 pub fn syntax(
947 &mut self,
948 config: crate::util::syntax::Config,
949 ) -> &mut Compiler {
950 config.apply(&mut self.parser);
951 self
952 }
953}
954
955impl Compiler {
    /// Compile the sequence of HIR expressions given. Pattern IDs are
    /// allocated starting from 0, in correspondence with the slice given.
    ///
    /// It is legal to provide an empty slice. In that case, the NFA returned
    /// has no patterns and will never match anything.
    fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> {
        if exprs.len() > PatternID::LIMIT {
            return Err(BuildError::too_many_patterns(exprs.len()));
        }
        // Capture states are unsupported in reverse NFAs, so reject the
        // combination up front.
        if self.config.get_reverse()
            && self.config.get_which_captures().is_any()
        {
            return Err(BuildError::unsupported_captures());
        }

        // Reset the shared builder and forward the relevant configuration
        // options before any states are added.
        self.builder.borrow_mut().clear();
        self.builder.borrow_mut().set_utf8(self.config.get_utf8());
        self.builder.borrow_mut().set_reverse(self.config.get_reverse());
        self.builder
            .borrow_mut()
            .set_look_matcher(self.config.get_look_matcher());
        self.builder
            .borrow_mut()
            .set_size_limit(self.config.get_nfa_size_limit())?;

        // We always add an unanchored prefix unless we were specifically told
        // not to (for tests only), or if we know that the regex is anchored
        // for all matches. When an unanchored prefix is not added, then the
        // NFA's anchored and unanchored start states are equivalent.
        let all_anchored = exprs.iter().all(|e| {
            let props = e.borrow().properties();
            if self.config.get_reverse() {
                // A reverse NFA scans from the end of the haystack, so
                // "anchored" means anchored at the end, i.e., '$'.
                props.look_set_suffix().contains(hir::Look::End)
            } else {
                props.look_set_prefix().contains(hir::Look::Start)
            }
        });
        let anchored = !self.config.get_unanchored_prefix() || all_anchored;
        let unanchored_prefix = if anchored {
            self.c_empty()?
        } else {
            // A non-greedy "any byte" loop, i.e., `(?s-u:.)*?`.
            self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
        };

        // Compile each pattern wrapped in the implicit capture group 0 and
        // terminated by its own match state, then join all patterns into a
        // single alternation.
        let compiled = self.c_alt_iter(exprs.iter().map(|e| {
            let _ = self.start_pattern()?;
            let one = self.c_cap(0, None, e.borrow())?;
            let match_state_id = self.add_match()?;
            self.patch(one.end, match_state_id)?;
            let _ = self.finish_pattern(one.start)?;
            Ok(ThompsonRef { start: one.start, end: match_state_id })
        }))?;
        // Hang the alternation of all patterns off the (possibly empty)
        // unanchored prefix.
        self.patch(unanchored_prefix.end, compiled.start)?;
        let nfa = self
            .builder
            .borrow_mut()
            .build(compiled.start, unanchored_prefix.start)?;

        debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
        Ok(nfa)
    }
1017
1018 /// Compile an arbitrary HIR expression.
1019 fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> {
1020 use regex_syntax::hir::{Class, HirKind::*};
1021
1022 match *expr.kind() {
1023 Empty => self.c_empty(),
1024 Literal(hir::Literal(ref bytes)) => self.c_literal(bytes),
1025 Class(Class::Bytes(ref c)) => self.c_byte_class(c),
1026 Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
1027 Look(ref look) => self.c_look(look),
1028 Repetition(ref rep) => self.c_repetition(rep),
1029 Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub),
1030 Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))),
1031 Alternation(ref es) => self.c_alt_slice(es),
1032 }
1033 }
1034
1035 /// Compile a concatenation of the sub-expressions yielded by the given
1036 /// iterator. If the iterator yields no elements, then this compiles down
1037 /// to an "empty" state that always matches.
1038 ///
1039 /// If the compiler is in reverse mode, then the expressions given are
1040 /// automatically compiled in reverse.
1041 fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1042 where
1043 I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
1044 {
1045 let first = if self.is_reverse() { it.next_back() } else { it.next() };
1046 let ThompsonRef { start, mut end } = match first {
1047 Some(result) => result?,
1048 None => return self.c_empty(),
1049 };
1050 loop {
1051 let next =
1052 if self.is_reverse() { it.next_back() } else { it.next() };
1053 let compiled = match next {
1054 Some(result) => result?,
1055 None => break,
1056 };
1057 self.patch(end, compiled.start)?;
1058 end = compiled.end;
1059 }
1060 Ok(ThompsonRef { start, end })
1061 }
1062
1063 /// Compile an alternation of the given HIR values.
1064 ///
1065 /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead
1066 /// of an iterator of compiled NFA sub-graphs. The point of accepting a
1067 /// slice here is that it opens up some optimization opportunities. For
1068 /// example, if all of the HIR values are literals, then this routine might
1069 /// re-shuffle them to make NFA epsilon closures substantially faster.
1070 fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> {
1071 // self.c_alt_iter(exprs.iter().map(|e| self.c(e)))
1072 let literal_count = exprs
1073 .iter()
1074 .filter(|e| {
1075 matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_)))
1076 })
1077 .count();
1078 if literal_count <= 1 || literal_count < exprs.len() {
1079 return self.c_alt_iter(exprs.iter().map(|e| self.c(e)));
1080 }
1081
1082 let mut trie = if self.is_reverse() {
1083 LiteralTrie::reverse()
1084 } else {
1085 LiteralTrie::forward()
1086 };
1087 for expr in exprs.iter() {
1088 let literal = match *expr.kind() {
1089 hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes,
1090 _ => unreachable!(),
1091 };
1092 trie.add(literal)?;
1093 }
1094 trie.compile(&mut self.builder.borrow_mut())
1095 }
1096
1097 /// Compile an alternation, where each element yielded by the given
1098 /// iterator represents an item in the alternation. If the iterator yields
1099 /// no elements, then this compiles down to a "fail" state.
1100 ///
1101 /// In an alternation, expressions appearing earlier are "preferred" at
1102 /// match time over expressions appearing later. At least, this is true
1103 /// when using "leftmost first" match semantics. (If "leftmost longest" are
1104 /// ever added in the future, then this preference order of priority would
1105 /// not apply in that mode.)
1106 fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1107 where
1108 I: Iterator<Item = Result<ThompsonRef, BuildError>>,
1109 {
1110 let first = match it.next() {
1111 None => return self.c_fail(),
1112 Some(result) => result?,
1113 };
1114 let second = match it.next() {
1115 None => return Ok(first),
1116 Some(result) => result?,
1117 };
1118
1119 let union = self.add_union()?;
1120 let end = self.add_empty()?;
1121 self.patch(union, first.start)?;
1122 self.patch(first.end, end)?;
1123 self.patch(union, second.start)?;
1124 self.patch(second.end, end)?;
1125 for result in it {
1126 let compiled = result?;
1127 self.patch(union, compiled.start)?;
1128 self.patch(compiled.end, end)?;
1129 }
1130 Ok(ThompsonRef { start: union, end })
1131 }
1132
1133 /// Compile the given capture sub-expression. `expr` should be the
1134 /// sub-expression contained inside the capture. If "capture" states are
1135 /// enabled, then they are added as appropriate.
1136 ///
1137 /// This accepts the pieces of a capture instead of a `hir::Capture` so
1138 /// that it's easy to manufacture a "fake" group when necessary, e.g., for
1139 /// adding the entire pattern as if it were a group in order to create
1140 /// appropriate "capture" states in the NFA.
1141 fn c_cap(
1142 &self,
1143 index: u32,
1144 name: Option<&str>,
1145 expr: &Hir,
1146 ) -> Result<ThompsonRef, BuildError> {
1147 match self.config.get_which_captures() {
1148 // No capture states means we always skip them.
1149 WhichCaptures::None => return self.c(expr),
1150 // Implicit captures states means we only add when index==0 since
1151 // index==0 implies the group is implicit.
1152 WhichCaptures::Implicit if index > 0 => return self.c(expr),
1153 _ => {}
1154 }
1155
1156 let start = self.add_capture_start(index, name)?;
1157 let inner = self.c(expr)?;
1158 let end = self.add_capture_end(index)?;
1159 self.patch(start, inner.start)?;
1160 self.patch(inner.end, end)?;
1161 Ok(ThompsonRef { start, end })
1162 }
1163
1164 /// Compile the given repetition expression. This handles all types of
1165 /// repetitions and greediness.
1166 fn c_repetition(
1167 &self,
1168 rep: &hir::Repetition,
1169 ) -> Result<ThompsonRef, BuildError> {
1170 match (rep.min, rep.max) {
1171 (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy),
1172 (min, None) => self.c_at_least(&rep.sub, rep.greedy, min),
1173 (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min),
1174 (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max),
1175 }
1176 }
1177
    /// Compile the given expression such that it matches at least `min` times,
    /// but no more than `max` times.
    ///
    /// When `greedy` is true, then the preference is for the expression to
    /// match as much as possible. Otherwise, it will match as little as
    /// possible.
    fn c_bounded(
        &self,
        expr: &Hir,
        greedy: bool,
        min: u32,
        max: u32,
    ) -> Result<ThompsonRef, BuildError> {
        // The first `min` copies are mandatory, so compile them as a plain
        // concatenation.
        let prefix = self.c_exactly(expr, min)?;
        if min == max {
            return Ok(prefix);
        }

        // It is tempting here to compile the rest here as a concatenation
        // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
        // were `aaa?a?a?`. The problem here is that it leads to this program:
        //
        //     >000000: 61 => 01
        //      000001: 61 => 02
        //      000002: union(03, 04)
        //      000003: 61 => 04
        //      000004: union(05, 06)
        //      000005: 61 => 06
        //      000006: union(07, 08)
        //      000007: 61 => 08
        //      000008: MATCH
        //
        // And effectively, once you hit state 2, the epsilon closure will
        // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
        // to instead compile it like so:
        //
        //     >000000: 61 => 01
        //      000001: 61 => 02
        //      000002: union(03, 08)
        //      000003: 61 => 04
        //      000004: union(05, 08)
        //      000005: 61 => 06
        //      000006: union(07, 08)
        //      000007: 61 => 08
        //      000008: MATCH
        //
        // So that the epsilon closure of state 2 is now just 3 and 8.
        let empty = self.add_empty()?;
        let mut prev_end = prefix.end;
        // Each of the `max - min` optional copies either continues into the
        // next copy or jumps straight to the shared exit state `empty`. The
        // order of the `patch` calls on each union determines preference
        // (greedy unions prefer the first added transition; reverse unions
        // prefer the last).
        for _ in min..max {
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            let compiled = self.c(expr)?;
            self.patch(prev_end, union)?;
            self.patch(union, compiled.start)?;
            self.patch(union, empty)?;
            prev_end = compiled.end;
        }
        self.patch(prev_end, empty)?;
        Ok(ThompsonRef { start: prefix.start, end: empty })
    }
1242
    /// Compile the given expression such that it may be matched `n` or more
    /// times, where `n` can be any integer. (Although a particularly large
    /// integer is likely to run afoul of any configured size limits.)
    ///
    /// When `greedy` is true, then the preference is for the expression to
    /// match as much as possible. Otherwise, it will match as little as
    /// possible.
    fn c_at_least(
        &self,
        expr: &Hir,
        greedy: bool,
        n: u32,
    ) -> Result<ThompsonRef, BuildError> {
        if n == 0 {
            // When the expression cannot match the empty string, then we
            // can get away with something much simpler: just one 'alt'
            // instruction that optionally repeats itself. But if the expr
            // can match the empty string... see below.
            if expr.properties().minimum_len().map_or(false, |len| len > 0) {
                let union = if greedy {
                    self.add_union()
                } else {
                    self.add_union_reverse()
                }?;
                let compiled = self.c(expr)?;
                self.patch(union, compiled.start)?;
                self.patch(compiled.end, union)?;
                // The loop state serves as both entry and exit.
                return Ok(ThompsonRef { start: union, end: union });
            }

            // What's going on here? Shouldn't x* be simpler than this? It
            // turns out that when implementing leftmost-first (Perl-like)
            // match semantics, x* results in an incorrect preference order
            // when computing the transitive closure of states if and only if
            // 'x' can match the empty string. So instead, we compile x* as
            // (x+)?, which preserves the correct preference order.
            //
            // See: https://github.com/rust-lang/regex/issues/779
            let compiled = self.c(expr)?;
            // `plus` loops back to re-enter the expression: this is the `x+`.
            let plus = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(compiled.end, plus)?;
            self.patch(plus, compiled.start)?;

            // `question` makes the whole `x+` optional: this is the `(..)?`.
            let question = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            let empty = self.add_empty()?;
            self.patch(question, compiled.start)?;
            self.patch(question, empty)?;
            self.patch(plus, empty)?;
            Ok(ThompsonRef { start: question, end: empty })
        } else if n == 1 {
            // `x+` is one copy of `x` followed by a union that loops back.
            let compiled = self.c(expr)?;
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(compiled.end, union)?;
            self.patch(union, compiled.start)?;
            Ok(ThompsonRef { start: compiled.start, end: union })
        } else {
            // `x{n,}` is `x{n-1}` followed by `x+`.
            let prefix = self.c_exactly(expr, n - 1)?;
            let last = self.c(expr)?;
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(prefix.end, last.start)?;
            self.patch(last.end, union)?;
            self.patch(union, last.start)?;
            Ok(ThompsonRef { start: prefix.start, end: union })
        }
    }
1324
1325 /// Compile the given expression such that it may be matched zero or one
1326 /// times.
1327 ///
1328 /// When `greedy` is true, then the preference is for the expression to
1329 /// match as much as possible. Otherwise, it will match as little as
1330 /// possible.
1331 fn c_zero_or_one(
1332 &self,
1333 expr: &Hir,
1334 greedy: bool,
1335 ) -> Result<ThompsonRef, BuildError> {
1336 let union =
1337 if greedy { self.add_union() } else { self.add_union_reverse() }?;
1338 let compiled = self.c(expr)?;
1339 let empty = self.add_empty()?;
1340 self.patch(union, compiled.start)?;
1341 self.patch(union, empty)?;
1342 self.patch(compiled.end, empty)?;
1343 Ok(ThompsonRef { start: union, end: empty })
1344 }
1345
1346 /// Compile the given HIR expression exactly `n` times.
1347 fn c_exactly(
1348 &self,
1349 expr: &Hir,
1350 n: u32,
1351 ) -> Result<ThompsonRef, BuildError> {
1352 let it = (0..n).map(|_| self.c(expr));
1353 self.c_concat(it)
1354 }
1355
1356 /// Compile the given byte oriented character class.
1357 ///
1358 /// This uses "sparse" states to represent an alternation between ranges in
1359 /// this character class. We can use "sparse" states instead of stitching
1360 /// together a "union" state because all ranges in a character class have
1361 /// equal priority *and* are non-overlapping (thus, only one can match, so
1362 /// there's never a question of priority in the first place). This saves a
1363 /// fair bit of overhead when traversing an NFA.
1364 ///
1365 /// This routine compiles an empty character class into a "fail" state.
1366 fn c_byte_class(
1367 &self,
1368 cls: &hir::ClassBytes,
1369 ) -> Result<ThompsonRef, BuildError> {
1370 let end = self.add_empty()?;
1371 let mut trans = Vec::with_capacity(cls.ranges().len());
1372 for r in cls.iter() {
1373 trans.push(Transition {
1374 start: r.start(),
1375 end: r.end(),
1376 next: end,
1377 });
1378 }
1379 Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
1380 }
1381
1382 /// Compile the given Unicode character class.
1383 ///
1384 /// This routine specifically tries to use various types of compression,
1385 /// since UTF-8 automata of large classes can get quite large. The specific
1386 /// type of compression used depends on forward vs reverse compilation, and
1387 /// whether NFA shrinking is enabled or not.
1388 ///
1389 /// Aside from repetitions causing lots of repeat group, this is like the
1390 /// single most expensive part of regex compilation. Therefore, a large part
1391 /// of the expense of compilation may be reduce by disabling Unicode in the
1392 /// pattern.
1393 ///
1394 /// This routine compiles an empty character class into a "fail" state.
1395 fn c_unicode_class(
1396 &self,
1397 cls: &hir::ClassUnicode,
1398 ) -> Result<ThompsonRef, BuildError> {
1399 // If all we have are ASCII ranges wrapped in a Unicode package, then
1400 // there is zero reason to bring out the big guns. We can fit all ASCII
1401 // ranges within a single sparse state.
1402 if cls.is_ascii() {
1403 let end = self.add_empty()?;
1404 let mut trans = Vec::with_capacity(cls.ranges().len());
1405 for r in cls.iter() {
1406 // The unwraps below are OK because we've verified that this
1407 // class only contains ASCII codepoints.
1408 trans.push(Transition {
1409 // FIXME(1.59): use the 'TryFrom<char> for u8' impl.
1410 start: u8::try_from(u32::from(r.start())).unwrap(),
1411 end: u8::try_from(u32::from(r.end())).unwrap(),
1412 next: end,
1413 });
1414 }
1415 Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
1416 } else if self.is_reverse() {
1417 if !self.config.get_shrink() {
1418 // When we don't want to spend the extra time shrinking, we
1419 // compile the UTF-8 automaton in reverse using something like
1420 // the "naive" approach, but will attempt to re-use common
1421 // suffixes.
1422 self.c_unicode_class_reverse_with_suffix(cls)
1423 } else {
1424 // When we want to shrink our NFA for reverse UTF-8 automata,
1425 // we cannot feed UTF-8 sequences directly to the UTF-8
1426 // compiler, since the UTF-8 compiler requires all sequences
1427 // to be lexicographically sorted. Instead, we organize our
1428 // sequences into a range trie, which can then output our
1429 // sequences in the correct order. Unfortunately, building the
1430 // range trie is fairly expensive (but not nearly as expensive
1431 // as building a DFA). Hence the reason why the 'shrink' option
1432 // exists, so that this path can be toggled off. For example,
1433 // we might want to turn this off if we know we won't be
1434 // compiling a DFA.
1435 let mut trie = self.trie_state.borrow_mut();
1436 trie.clear();
1437
1438 for rng in cls.iter() {
1439 for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
1440 seq.reverse();
1441 trie.insert(seq.as_slice());
1442 }
1443 }
1444 let mut builder = self.builder.borrow_mut();
1445 let mut utf8_state = self.utf8_state.borrow_mut();
1446 let mut utf8c =
1447 Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
1448 trie.iter(|seq| {
1449 utf8c.add(&seq)?;
1450 Ok(())
1451 })?;
1452 utf8c.finish()
1453 }
1454 } else {
1455 // In the forward direction, we always shrink our UTF-8 automata
1456 // because we can stream it right into the UTF-8 compiler. There
1457 // is almost no downside (in either memory or time) to using this
1458 // approach.
1459 let mut builder = self.builder.borrow_mut();
1460 let mut utf8_state = self.utf8_state.borrow_mut();
1461 let mut utf8c =
1462 Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
1463 for rng in cls.iter() {
1464 for seq in Utf8Sequences::new(rng.start(), rng.end()) {
1465 utf8c.add(seq.as_slice())?;
1466 }
1467 }
1468 utf8c.finish()
1469 }
1470
1471 // For reference, the code below is the "naive" version of compiling a
1472 // UTF-8 automaton. It is deliciously simple (and works for both the
1473 // forward and reverse cases), but will unfortunately produce very
1474 // large NFAs. When compiling a forward automaton, the size difference
1475 // can sometimes be an order of magnitude. For example, the '\w' regex
1476 // will generate about ~3000 NFA states using the naive approach below,
1477 // but only 283 states when using the approach above. This is because
1478 // the approach above actually compiles a *minimal* (or near minimal,
1479 // because of the bounded hashmap for reusing equivalent states) UTF-8
1480 // automaton.
1481 //
1482 // The code below is kept as a reference point in order to make it
1483 // easier to understand the higher level goal here. Although, it will
1484 // almost certainly bit-rot, so keep that in mind. Also, if you try to
1485 // use it, some of the tests in this module will fail because they look
1486 // for terser byte code produce by the more optimized handling above.
1487 // But the integration test suite should still pass.
1488 //
1489 // One good example of the substantial difference this can make is to
1490 // compare and contrast performance of the Pike VM when the code below
1491 // is active vs the code above. Here's an example to try:
1492 //
1493 // regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
1494 //
1495 // With Unicode classes generated below, this search takes about 45s on
1496 // my machine. But with the compressed version above, the search takes
1497 // only around 1.4s. The NFA is also 20% smaller. This is in part due
1498 // to the compression, but also because of the utilization of 'sparse'
1499 // NFA states. They lead to much less state shuffling during the NFA
1500 // search.
1501 /*
1502 let it = cls
1503 .iter()
1504 .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
1505 .map(|seq| {
1506 let it = seq
1507 .as_slice()
1508 .iter()
1509 .map(|rng| self.c_range(rng.start, rng.end));
1510 self.c_concat(it)
1511 });
1512 self.c_alt_iter(it)
1513 */
1514 }
1515
    /// Compile the given Unicode character class in reverse with suffix
    /// caching.
    ///
    /// This is a "quick" way to compile large Unicode classes into reverse
    /// UTF-8 automata while doing a small amount of compression on that
    /// automata by reusing common suffixes.
    ///
    /// A more comprehensive compression scheme can be accomplished by using
    /// a range trie to efficiently sort a reverse sequence of UTF-8 byte
    /// ranges, and then use Daciuk's algorithm via `Utf8Compiler`.
    ///
    /// This is the technique used when "NFA shrinking" is disabled.
    ///
    /// (This also tries to use "sparse" states where possible, just like
    /// `c_byte_class` does.)
    fn c_unicode_class_reverse_with_suffix(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, BuildError> {
        // N.B. It would likely be better to cache common *prefixes* in the
        // reverse direction, but it's not quite clear how to do that. The
        // advantage of caching suffixes is that it does give us a win, and
        // has a very small additional overhead.
        let mut cache = self.utf8_suffix.borrow_mut();
        cache.clear();

        let union = self.add_union()?;
        let alt_end = self.add_empty()?;
        for urng in cls.iter() {
            for seq in Utf8Sequences::new(urng.start(), urng.end()) {
                // Build this UTF-8 sequence from its tail toward its head:
                // `end` starts at the shared exit state and each new range
                // state is chained in front of it, which is what reverses
                // the byte order of the sequence.
                let mut end = alt_end;
                for brng in seq.as_slice() {
                    // The cache key includes the state we'd chain onto
                    // (`from`), so a hit means this exact suffix was already
                    // compiled and can be shared.
                    let key = Utf8SuffixKey {
                        from: end,
                        start: brng.start,
                        end: brng.end,
                    };
                    let hash = cache.hash(&key);
                    if let Some(id) = cache.get(&key, hash) {
                        end = id;
                        continue;
                    }

                    let compiled = self.c_range(brng.start, brng.end)?;
                    self.patch(compiled.end, end)?;
                    end = compiled.start;
                    cache.set(key, hash, end);
                }
                // `end` is now the head of this (reversed) sequence; hang it
                // off the alternation's union state.
                self.patch(union, end)?;
            }
        }
        Ok(ThompsonRef { start: union, end: alt_end })
    }
1569
    /// Compile the given HIR look-around assertion to an NFA look-around
    /// assertion.
    ///
    /// This is a 1:1 mapping from the HIR's look-around variants to this
    /// crate's. Note that when compiling in reverse mode, `add_look` flips
    /// the assertion (e.g., `Start` becomes `End`); no flipping happens here.
    fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> {
        let look = match *anchor {
            hir::Look::Start => Look::Start,
            hir::Look::End => Look::End,
            hir::Look::StartLF => Look::StartLF,
            hir::Look::EndLF => Look::EndLF,
            hir::Look::StartCRLF => Look::StartCRLF,
            hir::Look::EndCRLF => Look::EndCRLF,
            hir::Look::WordAscii => Look::WordAscii,
            hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
            hir::Look::WordUnicode => Look::WordUnicode,
            hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
            hir::Look::WordStartAscii => Look::WordStartAscii,
            hir::Look::WordEndAscii => Look::WordEndAscii,
            hir::Look::WordStartUnicode => Look::WordStartUnicode,
            hir::Look::WordEndUnicode => Look::WordEndUnicode,
            hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii,
            hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii,
            hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode,
            hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode,
        };
        // A look-around state is a single state: start and end coincide.
        let id = self.add_look(look)?;
        Ok(ThompsonRef { start: id, end: id })
    }
1596
1597 /// Compile the given byte string to a concatenation of bytes.
1598 fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> {
1599 self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b)))
1600 }
1601
1602 /// Compile a "range" state with one transition that may only be followed
1603 /// if the input byte is in the (inclusive) range given.
1604 ///
1605 /// Both the `start` and `end` locations point to the state created.
1606 /// Callers will likely want to keep the `start`, but patch the `end` to
1607 /// point to some other state.
1608 fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> {
1609 let id = self.add_range(start, end)?;
1610 Ok(ThompsonRef { start: id, end: id })
1611 }
1612
1613 /// Compile an "empty" state with one unconditional epsilon transition.
1614 ///
1615 /// Both the `start` and `end` locations point to the state created.
1616 /// Callers will likely want to keep the `start`, but patch the `end` to
1617 /// point to some other state.
1618 fn c_empty(&self) -> Result<ThompsonRef, BuildError> {
1619 let id = self.add_empty()?;
1620 Ok(ThompsonRef { start: id, end: id })
1621 }
1622
1623 /// Compile a "fail" state that can never have any outgoing transitions.
1624 fn c_fail(&self) -> Result<ThompsonRef, BuildError> {
1625 let id = self.add_fail()?;
1626 Ok(ThompsonRef { start: id, end: id })
1627 }
1628
1629 // The below helpers are meant to be simple wrappers around the
1630 // corresponding Builder methods. For the most part, they let us write
1631 // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where
1632 // the latter is a mouthful. Some of the methods do inject a little bit
1633 // of extra logic. e.g., Flipping look-around operators when compiling in
1634 // reverse mode.
1635
1636 fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> {
1637 self.builder.borrow_mut().patch(from, to)
1638 }
1639
1640 fn start_pattern(&self) -> Result<PatternID, BuildError> {
1641 self.builder.borrow_mut().start_pattern()
1642 }
1643
1644 fn finish_pattern(
1645 &self,
1646 start_id: StateID,
1647 ) -> Result<PatternID, BuildError> {
1648 self.builder.borrow_mut().finish_pattern(start_id)
1649 }
1650
1651 fn add_empty(&self) -> Result<StateID, BuildError> {
1652 self.builder.borrow_mut().add_empty()
1653 }
1654
1655 fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> {
1656 self.builder.borrow_mut().add_range(Transition {
1657 start,
1658 end,
1659 next: StateID::ZERO,
1660 })
1661 }
1662
1663 fn add_sparse(
1664 &self,
1665 ranges: Vec<Transition>,
1666 ) -> Result<StateID, BuildError> {
1667 self.builder.borrow_mut().add_sparse(ranges)
1668 }
1669
1670 fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> {
1671 if self.is_reverse() {
1672 look = look.reversed();
1673 }
1674 self.builder.borrow_mut().add_look(StateID::ZERO, look)
1675 }
1676
1677 fn add_union(&self) -> Result<StateID, BuildError> {
1678 self.builder.borrow_mut().add_union(vec![])
1679 }
1680
1681 fn add_union_reverse(&self) -> Result<StateID, BuildError> {
1682 self.builder.borrow_mut().add_union_reverse(vec![])
1683 }
1684
1685 fn add_capture_start(
1686 &self,
1687 capture_index: u32,
1688 name: Option<&str>,
1689 ) -> Result<StateID, BuildError> {
1690 let name = name.map(Arc::from);
1691 self.builder.borrow_mut().add_capture_start(
1692 StateID::ZERO,
1693 capture_index,
1694 name,
1695 )
1696 }
1697
1698 fn add_capture_end(
1699 &self,
1700 capture_index: u32,
1701 ) -> Result<StateID, BuildError> {
1702 self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index)
1703 }
1704
1705 fn add_fail(&self) -> Result<StateID, BuildError> {
1706 self.builder.borrow_mut().add_fail()
1707 }
1708
1709 fn add_match(&self) -> Result<StateID, BuildError> {
1710 self.builder.borrow_mut().add_match()
1711 }
1712
    /// Returns true if and only if this compiler is configured to build a
    /// reverse NFA. Used by `add_look` to flip look-around assertions.
    fn is_reverse(&self) -> bool {
        self.config.get_reverse()
    }
1716}
1717
1718/// A value that represents the result of compiling a sub-expression of a
1719/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
1720/// has an initial state at `start` and a final state at `end`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ThompsonRef {
    /// The initial (entry) state of this sub-graph.
    pub(crate) start: StateID,
    /// The final (exit) state of this sub-graph.
    pub(crate) end: StateID,
}
1726
1727/// A UTF-8 compiler based on Daciuk's algorithm for compiling minimal DFAs
1728/// from a lexicographically sorted sequence of strings in linear time.
1729///
1730/// The trick here is that any Unicode codepoint range can be converted to
1731/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
1732/// together via an alternation is trivial, and indeed, it works. However,
1733/// there is a lot of redundant structure in many UTF-8 automatons. Since our
1734/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
/// to build nearly minimal DFAs in linear time. (They are not guaranteed to
/// be minimal because we use a bounded cache of previously built DFA states.)
1737///
1738/// The drawback is that this sadly doesn't work for reverse automata, since
1739/// the ranges are no longer in lexicographic order. For that, we invented the
1740/// range trie (which gets its own module). Once a range trie is built, we then
1741/// use this same Utf8Compiler to build a reverse UTF-8 automaton.
1742///
1743/// The high level idea is described here:
1744/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures
1745///
1746/// There is also another implementation of this in the `fst` crate.
#[derive(Debug)]
struct Utf8Compiler<'a> {
    /// The NFA builder that compiled states are added to.
    builder: &'a mut Builder,
    /// Scratch space (the stack of uncompiled nodes and the bounded cache
    /// of compiled states) that is reused across compilations.
    state: &'a mut Utf8State,
    /// The final state that every path through the compiled UTF-8
    /// automaton leads to. Created in `Utf8Compiler::new`.
    target: StateID,
}
1753
/// Reusable scratch space for compiling UTF-8 automatons.
#[derive(Clone, Debug)]
struct Utf8State {
    /// A bounded cache mapping previously compiled transition sets to
    /// their state IDs, so that equivalent states are reused.
    compiled: Utf8BoundedMap,
    /// A stack of nodes that have not yet been compiled into NFA states.
    uncompiled: Vec<Utf8Node>,
}
1759
/// A single not-yet-compiled node in the UTF-8 automaton.
#[derive(Clone, Debug)]
struct Utf8Node {
    /// Transitions whose target states are already known.
    trans: Vec<Transition>,
    /// The most recently added byte range, whose target state is not yet
    /// known. It is completed by `Utf8Node::set_last_transition`.
    last: Option<Utf8LastTransition>,
}
1765
/// A pending byte-range transition whose target state is not yet known.
#[derive(Clone, Debug)]
struct Utf8LastTransition {
    /// The start of the pending byte range.
    start: u8,
    /// The end of the pending byte range.
    end: u8,
}
1771
1772impl Utf8State {
1773 fn new() -> Utf8State {
1774 Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
1775 }
1776
1777 fn clear(&mut self) {
1778 self.compiled.clear();
1779 self.uncompiled.clear();
1780 }
1781}
1782
impl<'a> Utf8Compiler<'a> {
    /// Create a new UTF-8 compiler that adds states to `builder`, reusing
    /// the scratch space in `state`. A fresh final state is created here;
    /// every path through the compiled automaton leads to it.
    fn new(
        builder: &'a mut Builder,
        state: &'a mut Utf8State,
    ) -> Result<Utf8Compiler<'a>, BuildError> {
        let target = builder.add_empty()?;
        // Wipe any scratch data left over from a previous compilation.
        state.clear();
        let mut utf8c = Utf8Compiler { builder, state, target };
        // Seed the uncompiled stack with the root node.
        utf8c.add_empty();
        Ok(utf8c)
    }

    /// Finish the automaton by compiling all remaining uncompiled nodes,
    /// returning a reference to its start and final states.
    fn finish(&mut self) -> Result<ThompsonRef, BuildError> {
        self.compile_from(0)?;
        let node = self.pop_root();
        let start = self.compile(node)?;
        Ok(ThompsonRef { start, end: self.target })
    }

    /// Add one UTF-8 sequence (one byte range per encoded byte position)
    /// to the automaton. Sequences must be added in lexicographic order.
    fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> {
        // Count how many leading ranges match the pending transitions of
        // the nodes currently on the uncompiled stack; that shared prefix
        // can be kept as-is.
        let prefix_len = ranges
            .iter()
            .zip(&self.state.uncompiled)
            .take_while(|&(range, node)| {
                node.last.as_ref().map_or(false, |t| {
                    (t.start, t.end) == (range.start, range.end)
                })
            })
            .count();
        assert!(prefix_len < ranges.len());
        // Everything beyond the shared prefix diverges from this sequence,
        // so those nodes can be compiled into NFA states now.
        self.compile_from(prefix_len)?;
        self.add_suffix(&ranges[prefix_len..]);
        Ok(())
    }

    /// Compile all uncompiled nodes above stack depth `from`, threading
    /// each newly compiled state into its parent's pending transition.
    fn compile_from(&mut self, from: usize) -> Result<(), BuildError> {
        // The deepest node's pending transition points at the final state.
        let mut next = self.target;
        while from + 1 < self.state.uncompiled.len() {
            let node = self.pop_freeze(next);
            next = self.compile(node)?;
        }
        self.top_last_freeze(next);
        Ok(())
    }

    /// Compile the given transitions into a sparse NFA state, reusing a
    /// cached state when an equivalent one was compiled before.
    fn compile(
        &mut self,
        node: Vec<Transition>,
    ) -> Result<StateID, BuildError> {
        let hash = self.state.compiled.hash(&node);
        if let Some(id) = self.state.compiled.get(&node, hash) {
            return Ok(id);
        }
        let id = self.builder.add_sparse(node.clone())?;
        self.state.compiled.set(node, hash, id);
        Ok(id)
    }

    /// Push the given non-empty suffix of byte ranges onto the uncompiled
    /// stack. The first range becomes the pending transition of the
    /// current top node; each remaining range becomes a new node.
    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
        assert!(!ranges.is_empty());
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        // The top node must not already have a pending transition.
        assert!(self.state.uncompiled[last].last.is_none());
        self.state.uncompiled[last].last = Some(Utf8LastTransition {
            start: ranges[0].start,
            end: ranges[0].end,
        });
        for r in &ranges[1..] {
            self.state.uncompiled.push(Utf8Node {
                trans: vec![],
                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
            });
        }
    }

    /// Push a fresh node with no transitions onto the uncompiled stack.
    fn add_empty(&mut self) {
        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
    }

    /// Pop the top uncompiled node, completing its pending transition so
    /// that it points at `next`, and return its transitions.
    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
        let mut uncompiled = self.state.uncompiled.pop().unwrap();
        uncompiled.set_last_transition(next);
        uncompiled.trans
    }

    /// Pop the root node. The root must be the only node left on the
    /// stack and must not have a pending transition.
    fn pop_root(&mut self) -> Vec<Transition> {
        assert_eq!(self.state.uncompiled.len(), 1);
        assert!(self.state.uncompiled[0].last.is_none());
        self.state.uncompiled.pop().expect("non-empty nodes").trans
    }

    /// Complete the pending transition of the top uncompiled node so that
    /// it points at `next`, without popping the node.
    fn top_last_freeze(&mut self, next: StateID) {
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        self.state.uncompiled[last].set_last_transition(next);
    }
}
1888
1889impl Utf8Node {
1890 fn set_last_transition(&mut self, next: StateID) {
1891 if let Some(last) = self.last.take() {
1892 self.trans.push(Transition {
1893 start: last.start,
1894 end: last.end,
1895 next,
1896 });
1897 }
1898 }
1899}
1900
#[cfg(test)]
mod tests {
    use alloc::vec;

    use crate::{
        nfa::thompson::{SparseTransitions, State},
        util::primitives::SmallIndex,
    };

    use super::*;

    /// Build an NFA from `pattern` with capture states and the unanchored
    /// prefix disabled, so tests can assert on the exact state list.
    fn build(pattern: &str) -> NFA {
        NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build(pattern)
            .unwrap()
    }

    /// Shorthand for creating a `PatternID`.
    fn pid(id: usize) -> PatternID {
        PatternID::new(id).unwrap()
    }

    /// Shorthand for creating a `StateID`.
    fn sid(id: usize) -> StateID {
        StateID::new(id).unwrap()
    }

    /// A state with a single transition on exactly `byte`.
    fn s_byte(byte: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start: byte, end: byte, next };
        State::ByteRange { trans }
    }

    /// A state with a single transition on the byte range `start`-`end`.
    fn s_range(start: u8, end: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start, end, next };
        State::ByteRange { trans }
    }

    /// A state with a sparse set of byte-range transitions.
    fn s_sparse(transitions: &[(u8, u8, usize)]) -> State {
        let transitions = transitions
            .iter()
            .map(|&(start, end, next)| Transition {
                start,
                end,
                next: sid(next),
            })
            .collect();
        State::Sparse(SparseTransitions { transitions })
    }

    /// A look-around assertion state.
    fn s_look(look: Look, next: usize) -> State {
        let next = sid(next);
        State::Look { look, next }
    }

    /// A union state with exactly two alternates.
    fn s_bin_union(alt1: usize, alt2: usize) -> State {
        State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) }
    }

    /// A union state with an arbitrary number of alternates.
    fn s_union(alts: &[usize]) -> State {
        State::Union {
            alternates: alts
                .iter()
                .map(|&id| sid(id))
                .collect::<Vec<StateID>>()
                .into_boxed_slice(),
        }
    }

    /// A capture state for the given group index and slot.
    fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State {
        State::Capture {
            next: sid(next),
            pattern_id: pid(pattern),
            group_index: SmallIndex::new(index).unwrap(),
            slot: SmallIndex::new(slot).unwrap(),
        }
    }

    /// A state that never matches anything.
    fn s_fail() -> State {
        State::Fail
    }

    /// A match state for the given pattern.
    fn s_match(id: usize) -> State {
        State::Match { pattern_id: pid(id) }
    }

    // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
    // prefix.
    #[test]
    fn compile_unanchored_prefix() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_no_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)]
        );
    }

    #[test]
    fn compile_yes_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_look(Look::End, 4),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_yes_reverse_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                // Anchors get flipped in a reverse automaton.
                s_look(Look::End, 4),
                s_match(0),
            ],
        );
    }

    #[test]
    fn compile_no_reverse_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                // Anchors get flipped in a reverse automaton.
                s_look(Look::Start, 1),
                s_byte(b'a', 2),
                s_match(0),
            ],
        );
    }

    #[test]
    fn compile_empty() {
        assert_eq!(build("").states(), &[s_match(0),]);
    }

    #[test]
    fn compile_literal() {
        assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]);
        assert_eq!(
            build("ab").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
        );
        // FIX: the literal here must be '☃' (U+2603), whose UTF-8 encoding
        // is \xE2\x98\x83 — matching the expected states below. The previous
        // literal 'ā' (U+0101) encodes to \xC4\x81 and contradicted them.
        assert_eq!(
            build("☃").states(),
            &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
        );

        // Check that non-UTF-8 literals work.
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .syntax(crate::util::syntax::Config::new().utf8(false))
            .build(r"(?-u)\xFF")
            .unwrap();
        assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]);
    }

    #[test]
    fn compile_class_ascii() {
        assert_eq!(
            build(r"[a-z]").states(),
            &[s_range(b'a', b'z', 1), s_match(0),]
        );
        assert_eq!(
            build(r"[x-za-c]").states(),
            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
        );
    }

    #[test]
    #[cfg(not(miri))]
    fn compile_class_unicode() {
        assert_eq!(
            build(r"[\u03B1-\u03B4]").states(),
            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
        );
        assert_eq!(
            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(),
            &[
                s_range(0xB1, 0xB4, 5),
                s_range(0x99, 0x9E, 5),
                s_byte(0xA4, 1),
                s_byte(0x9F, 2),
                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
                s_match(0),
            ]
        );
        // FIX: as in compile_literal, the class member must be '☃'
        // (UTF-8 \xE2\x98\x83) to agree with the expected states below.
        assert_eq!(
            build(r"[a-z☃]").states(),
            &[
                s_byte(0x83, 3),
                s_byte(0x98, 0),
                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_repetition() {
        assert_eq!(
            build(r"a?").states(),
            &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),]
        );
        assert_eq!(
            build(r"a??").states(),
            &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),]
        );
    }

    #[test]
    fn compile_group() {
        assert_eq!(
            build(r"ab+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)]
        );
    }

    #[test]
    fn compile_alternation() {
        assert_eq!(
            build(r"a|b").states(),
            &[s_range(b'a', b'b', 1), s_match(0)]
        );
        assert_eq!(
            build(r"ab|cd").states(),
            &[
                s_byte(b'b', 3),
                s_byte(b'd', 3),
                s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]),
                s_match(0)
            ],
        );
        assert_eq!(
            build(r"|b").states(),
            &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)]
        );
        assert_eq!(
            build(r"a|").states(),
            &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)]
        );
    }

    // This tests the use of a non-binary union, i.e., a state with more than
    // 2 unconditional epsilon transitions. The only place they tend to appear
    // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union'
    // and 'sparse' tend to cover all other cases of alternation.
    #[test]
    fn compile_non_binary_union() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .reverse(true)
                    .shrink(false)
                    .unanchored_prefix(false),
            )
            .build(r"[\u1000\u2000\u3000]")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_union(&[3, 6, 9]),
                s_byte(0xE1, 10),
                s_byte(0x80, 1),
                s_byte(0x80, 2),
                s_byte(0xE2, 10),
                s_byte(0x80, 4),
                s_byte(0x80, 5),
                s_byte(0xE3, 10),
                s_byte(0x80, 7),
                s_byte(0x80, 8),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_many_start_pattern() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build_many(&["a", "b"])
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_byte(b'a', 1),
                s_match(0),
                s_byte(b'b', 3),
                s_match(1),
                s_bin_union(0, 2),
            ]
        );
        assert_eq!(nfa.start_anchored().as_usize(), 4);
        assert_eq!(nfa.start_unanchored().as_usize(), 4);
        // Test that the start states for each individual pattern are correct.
        assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0));
        assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2));
    }

    // This tests that our compiler can handle an empty character class. At the
    // time of writing, the regex parser forbids it, so the only way to test it
    // is to provide a hand written HIR.
    #[test]
    fn empty_class_bytes() {
        use regex_syntax::hir::{Class, ClassBytes, Hir};

        let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    // Like empty_class_bytes, but for a Unicode class.
    #[test]
    fn empty_class_unicode() {
        use regex_syntax::hir::{Class, ClassUnicode, Hir};

        let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    #[test]
    fn compile_captures_all() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::All),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_cap(3, 0, 1, 2),
                s_byte(b'b', 4),
                s_cap(5, 0, 1, 3),
                s_byte(b'c', 6),
                s_cap(7, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(2, ginfo.all_group_len());
    }

    #[test]
    fn compile_captures_implicit() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::Implicit),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_byte(b'b', 3),
                s_byte(b'c', 4),
                s_cap(5, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(1, ginfo.all_group_len());
    }

    #[test]
    fn compile_captures_none() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::None),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)]
        );
        let ginfo = nfa.group_info();
        assert_eq!(0, ginfo.all_group_len());
    }
}
2368}