// regex_automata/nfa/thompson/compiler.rs
1use core::{borrow::Borrow, cell::RefCell};
2
3use alloc::{sync::Arc, vec, vec::Vec};
4
5use regex_syntax::{
6 hir::{self, Hir},
7 utf8::{Utf8Range, Utf8Sequences},
8 ParserBuilder,
9};
10
11use crate::{
12 nfa::thompson::{
13 builder::Builder,
14 error::BuildError,
15 literal_trie::LiteralTrie,
16 map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
17 nfa::{Transition, NFA},
18 range_trie::RangeTrie,
19 },
20 util::{
21 look::{Look, LookMatcher},
22 primitives::{PatternID, StateID},
23 },
24};
25
/// The configuration used for a Thompson NFA compiler.
#[derive(Clone, Debug, Default)]
pub struct Config {
    // Every field is `None` until explicitly set, so that `overwrite` below
    // can distinguish "unset" from "set to the default value". The effective
    // defaults live in the corresponding `get_*` methods.
    //
    // Whether match spans must correspond to valid UTF-8 (default: true).
    utf8: Option<bool>,
    // Whether to compile the NFA in reverse (default: false).
    reverse: Option<bool>,
    // Approximate heap limit, in bytes, for the compiled NFA. The outer
    // `Option` means "unset"; the inner `None` means "no limit" (default).
    nfa_size_limit: Option<Option<usize>>,
    // Whether to spend extra effort shrinking the NFA (default: false).
    shrink: Option<bool>,
    // Which capture states to compile in (default: `WhichCaptures::All`).
    which_captures: Option<WhichCaptures>,
    // How look-around assertions are matched (default: `LookMatcher::default()`).
    look_matcher: Option<LookMatcher>,
    // Test-only: whether to compile the implicit unanchored prefix
    // (default: true).
    #[cfg(test)]
    unanchored_prefix: Option<bool>,
}
38
39impl Config {
40 /// Return a new default Thompson NFA compiler configuration.
41 pub fn new() -> Config {
42 Config::default()
43 }
44
45 /// Whether to enable UTF-8 mode during search or not.
46 ///
47 /// A regex engine is said to be in UTF-8 mode when it guarantees that
48 /// all matches returned by it have spans consisting of only valid UTF-8.
49 /// That is, it is impossible for a match span to be returned that
50 /// contains any invalid UTF-8.
51 ///
52 /// UTF-8 mode generally consists of two things:
53 ///
54 /// 1. Whether the NFA's states are constructed such that all paths to a
55 /// match state that consume at least one byte always correspond to valid
56 /// UTF-8.
57 /// 2. Whether all paths to a match state that do _not_ consume any bytes
58 /// should always correspond to valid UTF-8 boundaries.
59 ///
60 /// (1) is a guarantee made by whoever constructs the NFA.
61 /// If you're parsing a regex from its concrete syntax, then
62 /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make
63 /// this guarantee for you. It does it by returning an error if the regex
64 /// pattern could every report a non-empty match span that contains invalid
65 /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex
66 /// successfully parses, then you're guaranteed that the corresponding NFA
67 /// will only ever report non-empty match spans containing valid UTF-8.
68 ///
69 /// (2) is a trickier guarantee because it cannot be enforced by the NFA
70 /// state graph itself. Consider, for example, the regex `a*`. It matches
71 /// the empty strings in `ā` at positions `0`, `1`, `2` and `3`, where
72 /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint,
73 /// and thus correspond to invalid UTF-8 boundaries. Therefore, this
74 /// guarantee must be made at a higher level than the NFA state graph
75 /// itself. This crate deals with this case in each regex engine. Namely,
76 /// when a zero-width match that splits a codepoint is found and UTF-8
77 /// mode enabled, then it is ignored and the engine moves on looking for
78 /// the next match.
79 ///
80 /// Thus, UTF-8 mode is both a promise that the NFA built only reports
81 /// non-empty matches that are valid UTF-8, and an *instruction* to regex
82 /// engines that empty matches that split codepoints should be banned.
83 ///
84 /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans,
85 /// it only makes sense to enable this option when you *know* your haystack
86 /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and
87 /// searching a haystack that contains invalid UTF-8 leads to **unspecified
88 /// behavior**.
89 ///
90 /// Therefore, it may make sense to enable `syntax::Config::utf8` while
91 /// simultaneously *disabling* this option. That would ensure all non-empty
92 /// match spans are valid UTF-8, but that empty match spans may still split
93 /// a codepoint or match at other places that aren't valid UTF-8.
94 ///
95 /// In general, this mode is only relevant if your regex can match the
96 /// empty string. Most regexes don't.
97 ///
98 /// This is enabled by default.
99 ///
100 /// # Example
101 ///
102 /// This example shows how UTF-8 mode can impact the match spans that may
103 /// be reported in certain cases.
104 ///
105 /// ```
106 /// use regex_automata::{
107 /// nfa::thompson::{self, pikevm::PikeVM},
108 /// Match, Input,
109 /// };
110 ///
111 /// let re = PikeVM::new("")?;
112 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
113 ///
114 /// // UTF-8 mode is enabled by default.
115 /// let mut input = Input::new("ā");
116 /// re.search(&mut cache, &input, &mut caps);
117 /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
118 ///
119 /// // Even though an empty regex matches at 1..1, our next match is
120 /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
121 /// // three bytes long).
122 /// input.set_start(1);
123 /// re.search(&mut cache, &input, &mut caps);
124 /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
125 ///
126 /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
127 /// let re = PikeVM::builder()
128 /// .thompson(thompson::Config::new().utf8(false))
129 /// .build("")?;
130 /// re.search(&mut cache, &input, &mut caps);
131 /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
132 ///
133 /// input.set_start(2);
134 /// re.search(&mut cache, &input, &mut caps);
135 /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
136 ///
137 /// input.set_start(3);
138 /// re.search(&mut cache, &input, &mut caps);
139 /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
140 ///
141 /// input.set_start(4);
142 /// re.search(&mut cache, &input, &mut caps);
143 /// assert_eq!(None, caps.get_match());
144 ///
145 /// # Ok::<(), Box<dyn std::error::Error>>(())
146 /// ```
147 pub fn utf8(mut self, yes: bool) -> Config {
148 self.utf8 = Some(yes);
149 self
150 }
151
152 /// Reverse the NFA.
153 ///
154 /// A NFA reversal is performed by reversing all of the concatenated
155 /// sub-expressions in the original pattern, recursively. (Look around
156 /// operators are also inverted.) The resulting NFA can be used to match
157 /// the pattern starting from the end of a string instead of the beginning
158 /// of a string.
159 ///
160 /// Reversing the NFA is useful for building a reverse DFA, which is most
161 /// useful for finding the start of a match after its ending position has
162 /// been found. NFA execution engines typically do not work on reverse
163 /// NFAs. For example, currently, the Pike VM reports the starting location
164 /// of matches without a reverse NFA.
165 ///
166 /// Currently, enabling this setting requires disabling the
167 /// [`captures`](Config::captures) setting. If both are enabled, then the
168 /// compiler will return an error. It is expected that this limitation will
169 /// be lifted in the future.
170 ///
171 /// This is disabled by default.
172 ///
173 /// # Example
174 ///
175 /// This example shows how to build a DFA from a reverse NFA, and then use
176 /// the DFA to search backwards.
177 ///
178 /// ```
179 /// use regex_automata::{
180 /// dfa::{self, Automaton},
181 /// nfa::thompson::{NFA, WhichCaptures},
182 /// HalfMatch, Input,
183 /// };
184 ///
185 /// let dfa = dfa::dense::Builder::new()
186 /// .thompson(NFA::config()
187 /// .which_captures(WhichCaptures::None)
188 /// .reverse(true)
189 /// )
190 /// .build("baz[0-9]+")?;
191 /// let expected = Some(HalfMatch::must(0, 3));
192 /// assert_eq!(
193 /// expected,
194 /// dfa.try_search_rev(&Input::new("foobaz12345bar"))?,
195 /// );
196 ///
197 /// # Ok::<(), Box<dyn std::error::Error>>(())
198 /// ```
199 pub fn reverse(mut self, yes: bool) -> Config {
200 self.reverse = Some(yes);
201 self
202 }
203
204 /// Sets an approximate size limit on the total heap used by the NFA being
205 /// compiled.
206 ///
207 /// This permits imposing constraints on the size of a compiled NFA. This
208 /// may be useful in contexts where the regex pattern is untrusted and one
209 /// wants to avoid using too much memory.
210 ///
211 /// This size limit does not apply to auxiliary heap used during
212 /// compilation that is not part of the built NFA.
213 ///
214 /// Note that this size limit is applied during compilation in order for
215 /// the limit to prevent too much heap from being used. However, the
216 /// implementation may use an intermediate NFA representation that is
217 /// otherwise slightly bigger than the final public form. Since the size
218 /// limit may be applied to an intermediate representation, there is not
219 /// necessarily a precise correspondence between the configured size limit
220 /// and the heap usage of the final NFA.
221 ///
222 /// There is no size limit by default.
223 ///
224 /// # Example
225 ///
226 /// This example demonstrates how Unicode mode can greatly increase the
227 /// size of the NFA.
228 ///
229 /// ```
230 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
231 /// use regex_automata::nfa::thompson::NFA;
232 ///
233 /// // 300KB isn't enough!
234 /// NFA::compiler()
235 /// .configure(NFA::config().nfa_size_limit(Some(300_000)))
236 /// .build(r"\w{20}")
237 /// .unwrap_err();
238 ///
239 /// // ... but 500KB probably is.
240 /// let nfa = NFA::compiler()
241 /// .configure(NFA::config().nfa_size_limit(Some(500_000)))
242 /// .build(r"\w{20}")?;
243 ///
244 /// assert_eq!(nfa.pattern_len(), 1);
245 ///
246 /// # Ok::<(), Box<dyn std::error::Error>>(())
247 /// ```
248 pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
249 self.nfa_size_limit = Some(bytes);
250 self
251 }
252
253 /// Apply best effort heuristics to shrink the NFA at the expense of more
254 /// time/memory.
255 ///
256 /// Generally speaking, if one is using an NFA to compile a DFA, then the
257 /// extra time used to shrink the NFA will be more than made up for during
258 /// DFA construction (potentially by a lot). In other words, enabling this
259 /// can substantially decrease the overall amount of time it takes to build
260 /// a DFA.
261 ///
262 /// A reason to keep this disabled is if you want to compile an NFA and
263 /// start using it as quickly as possible without needing to build a DFA,
264 /// and you don't mind using a bit of extra memory for the NFA. e.g., for
265 /// an NFA simulation or for a lazy DFA.
266 ///
267 /// NFA shrinking is currently most useful when compiling a reverse
268 /// NFA with large Unicode character classes. In particular, it trades
269 /// additional CPU time during NFA compilation in favor of generating fewer
270 /// NFA states.
271 ///
272 /// This is disabled by default because it can increase compile times
273 /// quite a bit if you aren't building a full DFA.
274 ///
275 /// # Example
276 ///
277 /// This example shows that NFA shrinking can lead to substantial space
278 /// savings in some cases. Notice that, as noted above, we build a reverse
279 /// DFA and use a pattern with a large Unicode character class.
280 ///
281 /// ```
282 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
283 /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
284 ///
285 /// // Currently we have to disable captures when enabling reverse NFA.
286 /// let config = NFA::config()
287 /// .which_captures(WhichCaptures::None)
288 /// .reverse(true);
289 /// let not_shrunk = NFA::compiler()
290 /// .configure(config.clone().shrink(false))
291 /// .build(r"\w")?;
292 /// let shrunk = NFA::compiler()
293 /// .configure(config.clone().shrink(true))
294 /// .build(r"\w")?;
295 ///
296 /// // While a specific shrink factor is not guaranteed, the savings can be
297 /// // considerable in some cases.
298 /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len());
299 ///
300 /// # Ok::<(), Box<dyn std::error::Error>>(())
301 /// ```
302 pub fn shrink(mut self, yes: bool) -> Config {
303 self.shrink = Some(yes);
304 self
305 }
306
307 /// Whether to include 'Capture' states in the NFA.
308 ///
309 /// Currently, enabling this setting requires disabling the
310 /// [`reverse`](Config::reverse) setting. If both are enabled, then the
311 /// compiler will return an error. It is expected that this limitation will
312 /// be lifted in the future.
313 ///
314 /// This is enabled by default.
315 ///
316 /// # Example
317 ///
318 /// This example demonstrates that some regex engines, like the Pike VM,
319 /// require capturing states to be present in the NFA to report match
320 /// offsets.
321 ///
322 /// (Note that since this method is deprecated, the example below uses
323 /// [`Config::which_captures`] to disable capture states.)
324 ///
325 /// ```
326 /// use regex_automata::nfa::thompson::{
327 /// pikevm::PikeVM,
328 /// NFA,
329 /// WhichCaptures,
330 /// };
331 ///
332 /// let re = PikeVM::builder()
333 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
334 /// .build(r"[a-z]+")?;
335 /// let mut cache = re.create_cache();
336 ///
337 /// assert!(re.is_match(&mut cache, "abc"));
338 /// assert_eq!(None, re.find(&mut cache, "abc"));
339 ///
340 /// # Ok::<(), Box<dyn std::error::Error>>(())
341 /// ```
342 #[deprecated(since = "0.3.5", note = "use which_captures instead")]
343 pub fn captures(self, yes: bool) -> Config {
344 self.which_captures(if yes {
345 WhichCaptures::All
346 } else {
347 WhichCaptures::None
348 })
349 }
350
351 /// Configures what kinds of capture groups are compiled into
352 /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
353 /// Thompson NFA.
354 ///
355 /// Currently, using any option except for [`WhichCaptures::None`] requires
356 /// disabling the [`reverse`](Config::reverse) setting. If both are
357 /// enabled, then the compiler will return an error. It is expected that
358 /// this limitation will be lifted in the future.
359 ///
360 /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
361 /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the
362 /// overhead of capture states for explicit groups. Usually this occurs
363 /// when one wants to use the `PikeVM` only for determining the overall
364 /// match. Otherwise, the `PikeVM` could use much more memory than is
365 /// necessary.
366 ///
367 /// # Example
368 ///
369 /// This example demonstrates that some regex engines, like the Pike VM,
370 /// require capturing states to be present in the NFA to report match
371 /// offsets.
372 ///
373 /// ```
374 /// use regex_automata::nfa::thompson::{
375 /// pikevm::PikeVM,
376 /// NFA,
377 /// WhichCaptures,
378 /// };
379 ///
380 /// let re = PikeVM::builder()
381 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
382 /// .build(r"[a-z]+")?;
383 /// let mut cache = re.create_cache();
384 ///
385 /// assert!(re.is_match(&mut cache, "abc"));
386 /// assert_eq!(None, re.find(&mut cache, "abc"));
387 ///
388 /// # Ok::<(), Box<dyn std::error::Error>>(())
389 /// ```
390 ///
391 /// The same applies to the bounded backtracker:
392 ///
393 /// ```
394 /// use regex_automata::nfa::thompson::{
395 /// backtrack::BoundedBacktracker,
396 /// NFA,
397 /// WhichCaptures,
398 /// };
399 ///
400 /// let re = BoundedBacktracker::builder()
401 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
402 /// .build(r"[a-z]+")?;
403 /// let mut cache = re.create_cache();
404 ///
405 /// assert!(re.try_is_match(&mut cache, "abc")?);
406 /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
407 ///
408 /// # Ok::<(), Box<dyn std::error::Error>>(())
409 /// ```
410 pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
411 self.which_captures = Some(which_captures);
412 self
413 }
414
415 /// Sets the look-around matcher that should be used with this NFA.
416 ///
417 /// A look-around matcher determines how to match look-around assertions.
418 /// In particular, some assertions are configurable. For example, the
419 /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
420 /// from the default of `\n` to any other byte.
421 ///
422 /// # Example
423 ///
424 /// This shows how to change the line terminator for multi-line assertions.
425 ///
426 /// ```
427 /// use regex_automata::{
428 /// nfa::thompson::{self, pikevm::PikeVM},
429 /// util::look::LookMatcher,
430 /// Match, Input,
431 /// };
432 ///
433 /// let mut lookm = LookMatcher::new();
434 /// lookm.set_line_terminator(b'\x00');
435 ///
436 /// let re = PikeVM::builder()
437 /// .thompson(thompson::Config::new().look_matcher(lookm))
438 /// .build(r"(?m)^[a-z]+$")?;
439 /// let mut cache = re.create_cache();
440 ///
441 /// // Multi-line assertions now use NUL as a terminator.
442 /// assert_eq!(
443 /// Some(Match::must(0, 1..4)),
444 /// re.find(&mut cache, b"\x00abc\x00"),
445 /// );
446 /// // ... and \n is no longer recognized as a terminator.
447 /// assert_eq!(
448 /// None,
449 /// re.find(&mut cache, b"\nabc\n"),
450 /// );
451 ///
452 /// # Ok::<(), Box<dyn std::error::Error>>(())
453 /// ```
454 pub fn look_matcher(mut self, m: LookMatcher) -> Config {
455 self.look_matcher = Some(m);
456 self
457 }
458
459 /// Whether to compile an unanchored prefix into this NFA.
460 ///
461 /// This is enabled by default. It is made available for tests only to make
462 /// it easier to unit test the output of the compiler.
463 #[cfg(test)]
464 fn unanchored_prefix(mut self, yes: bool) -> Config {
465 self.unanchored_prefix = Some(yes);
466 self
467 }
468
469 /// Returns whether this configuration has enabled UTF-8 mode.
470 pub fn get_utf8(&self) -> bool {
471 self.utf8.unwrap_or(true)
472 }
473
474 /// Returns whether this configuration has enabled reverse NFA compilation.
475 pub fn get_reverse(&self) -> bool {
476 self.reverse.unwrap_or(false)
477 }
478
479 /// Return the configured NFA size limit, if it exists, in the number of
480 /// bytes of heap used.
481 pub fn get_nfa_size_limit(&self) -> Option<usize> {
482 self.nfa_size_limit.unwrap_or(None)
483 }
484
485 /// Return whether NFA shrinking is enabled.
486 pub fn get_shrink(&self) -> bool {
487 self.shrink.unwrap_or(false)
488 }
489
490 /// Return whether NFA compilation is configured to produce capture states.
491 #[deprecated(since = "0.3.5", note = "use get_which_captures instead")]
492 pub fn get_captures(&self) -> bool {
493 self.get_which_captures().is_any()
494 }
495
496 /// Return what kinds of capture states will be compiled into an NFA.
497 pub fn get_which_captures(&self) -> WhichCaptures {
498 self.which_captures.unwrap_or(WhichCaptures::All)
499 }
500
501 /// Return the look-around matcher for this NFA.
502 pub fn get_look_matcher(&self) -> LookMatcher {
503 self.look_matcher.clone().unwrap_or(LookMatcher::default())
504 }
505
506 /// Return whether NFA compilation is configured to include an unanchored
507 /// prefix.
508 ///
509 /// This is always false when not in test mode.
510 fn get_unanchored_prefix(&self) -> bool {
511 #[cfg(test)]
512 {
513 self.unanchored_prefix.unwrap_or(true)
514 }
515 #[cfg(not(test))]
516 {
517 true
518 }
519 }
520
521 /// Overwrite the default configuration such that the options in `o` are
522 /// always used. If an option in `o` is not set, then the corresponding
523 /// option in `self` is used. If it's not set in `self` either, then it
524 /// remains not set.
525 pub(crate) fn overwrite(&self, o: Config) -> Config {
526 Config {
527 utf8: o.utf8.or(self.utf8),
528 reverse: o.reverse.or(self.reverse),
529 nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
530 shrink: o.shrink.or(self.shrink),
531 which_captures: o.which_captures.or(self.which_captures),
532 look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()),
533 #[cfg(test)]
534 unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
535 }
536 }
537}
538
/// A configuration indicating which kinds of
/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include.
///
/// This configuration can be used with [`Config::which_captures`] to control
/// which capture states are compiled into a Thompson NFA.
///
/// The default configuration is [`WhichCaptures::All`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum WhichCaptures {
    /// All capture states, including those corresponding to both implicit and
    /// explicit capture groups, are included in the Thompson NFA.
    All,
    /// Only capture states corresponding to implicit capture groups are
    /// included. Implicit capture groups appear in every pattern implicitly
    /// and correspond to the overall match of a pattern.
    ///
    /// This is useful when one only cares about the overall match of a
    /// pattern. By excluding capture states from explicit capture groups,
    /// one might be able to reduce the memory usage of a multi-pattern regex
    /// substantially if it was otherwise written to have many explicit capture
    /// groups.
    Implicit,
    /// No capture states are compiled into the Thompson NFA.
    ///
    /// This is useful when capture states are either not needed (for example,
    /// if one is only trying to build a DFA) or if they aren't supported (for
    /// example, a reverse NFA).
    ///
    /// # Warning
    ///
    /// Callers must be exceedingly careful when using this
    /// option. In particular, not all regex engines support
    /// reporting match spans when using this option (for example,
    /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) or
    /// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)).
    ///
    /// Perhaps more confusingly, using this option with such an
    /// engine means that an `is_match` routine could report `true`
    /// when `find` reports `None`. This is generally not something
    /// that _should_ happen, but the low level control provided by
    /// this crate makes it possible.
    ///
    /// Similarly, any regex engines (like [`meta::Regex`](crate::meta::Regex))
    /// should always return `None` from `find` routines when this option is
    /// used, even if _some_ of its internal engines could find the match
    /// boundaries. This is because inputs from user data could influence
    /// engine selection, and thus influence whether a match is found or not.
    /// Indeed, `meta::Regex::find` will always return `None` when configured
    /// with this option.
    None,
}

impl Default for WhichCaptures {
    /// The default includes all capture states ([`WhichCaptures::All`]).
    fn default() -> WhichCaptures {
        WhichCaptures::All
    }
}

impl WhichCaptures {
    /// Returns true if this configuration indicates that no capture states
    /// should be produced in an NFA.
    pub fn is_none(&self) -> bool {
        matches!(*self, WhichCaptures::None)
    }

    /// Returns true if this configuration indicates that some capture states
    /// should be added to an NFA. Note that this might only include capture
    /// states for implicit capture groups.
    pub fn is_any(&self) -> bool {
        !self.is_none()
    }
}
611
612/*
613This compiler below uses Thompson's construction algorithm. The compiler takes
614a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph
615is structured in a way that permits it to be executed by a virtual machine and
616also used to efficiently build a DFA.
617
The compiler deals with a slightly expanded set of NFA states compared to what
is in a final NFA (as exhibited by builder::State and nfa::State). Notably a
620compiler state includes an empty node that has exactly one unconditional
621epsilon transition to the next state. In other words, it's a "goto" instruction
622if one views Thompson's NFA as a set of bytecode instructions. These goto
623instructions are removed in a subsequent phase before returning the NFA to the
624caller. The purpose of these empty nodes is that they make the construction
625algorithm substantially simpler to implement. We remove them before returning
626to the caller because they can represent substantial overhead when traversing
627the NFA graph (either while searching using the NFA directly or while building
628a DFA).
629
630In the future, it would be nice to provide a Glushkov compiler as well, as it
631would work well as a bit-parallel NFA for smaller regexes. But the Thompson
632construction is one I'm more familiar with and seems more straight-forward to
633deal with when it comes to large Unicode character classes.
634
635Internally, the compiler uses interior mutability to improve composition in the
636face of the borrow checker. In particular, we'd really like to be able to write
637things like this:
638
639 self.c_concat(exprs.iter().map(|e| self.c(e)))
640
641Which elegantly uses iterators to build up a sequence of compiled regex
642sub-expressions and then hands it off to the concatenating compiler routine.
643Without interior mutability, the borrow checker won't let us borrow `self`
644mutably both inside and outside the closure at the same time.
645*/
646
647/// A builder for compiling an NFA from a regex's high-level intermediate
648/// representation (HIR).
649///
650/// This compiler provides a way to translate a parsed regex pattern into an
651/// NFA state graph. The NFA state graph can either be used directly to execute
652/// a search (e.g., with a Pike VM), or it can be further used to build a DFA.
653///
654/// This compiler provides APIs both for compiling regex patterns directly from
655/// their concrete syntax, or via a [`regex_syntax::hir::Hir`].
656///
657/// This compiler has various options that may be configured via
658/// [`thompson::Config`](Config).
659///
660/// Note that a compiler is not the same as a [`thompson::Builder`](Builder).
661/// A `Builder` provides a lower level API that is uncoupled from a regex
662/// pattern's concrete syntax or even its HIR. Instead, it permits stitching
663/// together an NFA by hand. See its docs for examples.
664///
665/// # Example: compilation from concrete syntax
666///
667/// This shows how to compile an NFA from a pattern string while setting a size
668/// limit on how big the NFA is allowed to be (in terms of bytes of heap used).
669///
670/// ```
671/// use regex_automata::{
672/// nfa::thompson::{NFA, pikevm::PikeVM},
673/// Match,
674/// };
675///
676/// let config = NFA::config().nfa_size_limit(Some(1_000));
677/// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
678///
679/// let re = PikeVM::new_from_nfa(nfa)?;
680/// let mut cache = re.create_cache();
681/// let mut caps = re.create_captures();
682/// let expected = Some(Match::must(0, 3..4));
683/// re.captures(&mut cache, "!@#A#@!", &mut caps);
684/// assert_eq!(expected, caps.get_match());
685///
686/// # Ok::<(), Box<dyn std::error::Error>>(())
687/// ```
688///
689/// # Example: compilation from HIR
690///
691/// This shows how to hand assemble a regular expression via its HIR, and then
692/// compile an NFA directly from it.
693///
694/// ```
695/// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
696/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
697///
698/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
699/// ClassBytesRange::new(b'0', b'9'),
700/// ClassBytesRange::new(b'A', b'Z'),
701/// ClassBytesRange::new(b'_', b'_'),
702/// ClassBytesRange::new(b'a', b'z'),
703/// ])));
704///
705/// let config = NFA::config().nfa_size_limit(Some(1_000));
706/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
707///
708/// let re = PikeVM::new_from_nfa(nfa)?;
709/// let mut cache = re.create_cache();
710/// let mut caps = re.create_captures();
711/// let expected = Some(Match::must(0, 3..4));
712/// re.captures(&mut cache, "!@#A#@!", &mut caps);
713/// assert_eq!(expected, caps.get_match());
714///
715/// # Ok::<(), Box<dyn std::error::Error>>(())
716/// ```
#[derive(Clone, Debug)]
pub struct Compiler {
    /// A regex parser, used when compiling an NFA directly from a pattern
    /// string.
    parser: ParserBuilder,
    /// The compiler configuration.
    config: Config,
    /// The builder for actually constructing an NFA. This provides a
    /// convenient abstraction for writing a compiler.
    ///
    /// This (and the scratch fields below) are wrapped in `RefCell` so that
    /// compilation routines can take `&self` and still mutate shared state;
    /// see the module comments above on interior mutability.
    builder: RefCell<Builder>,
    /// State used for compiling character classes to UTF-8 byte automata.
    /// State is not retained between character class compilations. This just
    /// serves to amortize allocation to the extent possible.
    utf8_state: RefCell<Utf8State>,
    /// State used for arranging character classes in reverse into a trie.
    trie_state: RefCell<RangeTrie>,
    /// State used for caching common suffixes when compiling reverse UTF-8
    /// automata (for Unicode character classes).
    utf8_suffix: RefCell<Utf8SuffixMap>,
}
737
738impl Compiler {
739 /// Create a new NFA builder with its default configuration.
740 pub fn new() -> Compiler {
741 Compiler {
742 parser: ParserBuilder::new(),
743 config: Config::default(),
744 builder: RefCell::new(Builder::new()),
745 utf8_state: RefCell::new(Utf8State::new()),
746 trie_state: RefCell::new(RangeTrie::new()),
747 utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
748 }
749 }
750
751 /// Compile the given regular expression pattern into an NFA.
752 ///
753 /// If there was a problem parsing the regex, then that error is returned.
754 ///
755 /// Otherwise, if there was a problem building the NFA, then an error is
756 /// returned. The only error that can occur is if the compiled regex would
757 /// exceed the size limits configured on this builder, or if any part of
758 /// the NFA would exceed the integer representations used. (For example,
759 /// too many states might plausibly occur on a 16-bit target.)
760 ///
761 /// # Example
762 ///
763 /// ```
764 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
765 ///
766 /// let config = NFA::config().nfa_size_limit(Some(1_000));
767 /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
768 ///
769 /// let re = PikeVM::new_from_nfa(nfa)?;
770 /// let mut cache = re.create_cache();
771 /// let mut caps = re.create_captures();
772 /// let expected = Some(Match::must(0, 3..4));
773 /// re.captures(&mut cache, "!@#A#@!", &mut caps);
774 /// assert_eq!(expected, caps.get_match());
775 ///
776 /// # Ok::<(), Box<dyn std::error::Error>>(())
777 /// ```
778 pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> {
779 self.build_many(&[pattern])
780 }
781
782 /// Compile the given regular expression patterns into a single NFA.
783 ///
784 /// When matches are returned, the pattern ID corresponds to the index of
785 /// the pattern in the slice given.
786 ///
787 /// # Example
788 ///
789 /// ```
790 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
791 ///
792 /// let config = NFA::config().nfa_size_limit(Some(1_000));
793 /// let nfa = NFA::compiler().configure(config).build_many(&[
794 /// r"(?-u)\s",
795 /// r"(?-u)\w",
796 /// ])?;
797 ///
798 /// let re = PikeVM::new_from_nfa(nfa)?;
799 /// let mut cache = re.create_cache();
800 /// let mut caps = re.create_captures();
801 /// let expected = Some(Match::must(1, 1..2));
802 /// re.captures(&mut cache, "!A! !A!", &mut caps);
803 /// assert_eq!(expected, caps.get_match());
804 ///
805 /// # Ok::<(), Box<dyn std::error::Error>>(())
806 /// ```
807 pub fn build_many<P: AsRef<str>>(
808 &self,
809 patterns: &[P],
810 ) -> Result<NFA, BuildError> {
811 let mut hirs = vec![];
812 for p in patterns {
813 hirs.push(
814 self.parser
815 .build()
816 .parse(p.as_ref())
817 .map_err(BuildError::syntax)?,
818 );
819 debug!("parsed: {:?}", p.as_ref());
820 }
821 self.build_many_from_hir(&hirs)
822 }
823
824 /// Compile the given high level intermediate representation of a regular
825 /// expression into an NFA.
826 ///
827 /// If there was a problem building the NFA, then an error is returned. The
828 /// only error that can occur is if the compiled regex would exceed the
829 /// size limits configured on this builder, or if any part of the NFA would
830 /// exceed the integer representations used. (For example, too many states
831 /// might plausibly occur on a 16-bit target.)
832 ///
833 /// # Example
834 ///
835 /// ```
836 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
837 /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
838 ///
839 /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
840 /// ClassBytesRange::new(b'0', b'9'),
841 /// ClassBytesRange::new(b'A', b'Z'),
842 /// ClassBytesRange::new(b'_', b'_'),
843 /// ClassBytesRange::new(b'a', b'z'),
844 /// ])));
845 ///
846 /// let config = NFA::config().nfa_size_limit(Some(1_000));
847 /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
848 ///
849 /// let re = PikeVM::new_from_nfa(nfa)?;
850 /// let mut cache = re.create_cache();
851 /// let mut caps = re.create_captures();
852 /// let expected = Some(Match::must(0, 3..4));
853 /// re.captures(&mut cache, "!@#A#@!", &mut caps);
854 /// assert_eq!(expected, caps.get_match());
855 ///
856 /// # Ok::<(), Box<dyn std::error::Error>>(())
857 /// ```
858 pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> {
859 self.build_many_from_hir(&[expr])
860 }
861
862 /// Compile the given high level intermediate representations of regular
863 /// expressions into a single NFA.
864 ///
865 /// When matches are returned, the pattern ID corresponds to the index of
866 /// the pattern in the slice given.
867 ///
868 /// # Example
869 ///
870 /// ```
871 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
872 /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
873 ///
874 /// let hirs = &[
875 /// Hir::class(Class::Bytes(ClassBytes::new(vec![
876 /// ClassBytesRange::new(b'\t', b'\r'),
877 /// ClassBytesRange::new(b' ', b' '),
878 /// ]))),
879 /// Hir::class(Class::Bytes(ClassBytes::new(vec![
880 /// ClassBytesRange::new(b'0', b'9'),
881 /// ClassBytesRange::new(b'A', b'Z'),
882 /// ClassBytesRange::new(b'_', b'_'),
883 /// ClassBytesRange::new(b'a', b'z'),
884 /// ]))),
885 /// ];
886 ///
887 /// let config = NFA::config().nfa_size_limit(Some(1_000));
888 /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?;
889 ///
890 /// let re = PikeVM::new_from_nfa(nfa)?;
891 /// let mut cache = re.create_cache();
892 /// let mut caps = re.create_captures();
893 /// let expected = Some(Match::must(1, 1..2));
894 /// re.captures(&mut cache, "!A! !A!", &mut caps);
895 /// assert_eq!(expected, caps.get_match());
896 ///
897 /// # Ok::<(), Box<dyn std::error::Error>>(())
898 /// ```
899 pub fn build_many_from_hir<H: Borrow<Hir>>(
900 &self,
901 exprs: &[H],
902 ) -> Result<NFA, BuildError> {
903 self.compile(exprs)
904 }
905
906 /// Apply the given NFA configuration options to this builder.
907 ///
908 /// # Example
909 ///
910 /// ```
911 /// use regex_automata::nfa::thompson::NFA;
912 ///
913 /// let config = NFA::config().nfa_size_limit(Some(1_000));
914 /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
915 /// assert_eq!(nfa.pattern_len(), 1);
916 ///
917 /// # Ok::<(), Box<dyn std::error::Error>>(())
918 /// ```
919 pub fn configure(&mut self, config: Config) -> &mut Compiler {
920 self.config = self.config.overwrite(config);
921 self
922 }
923
924 /// Set the syntax configuration for this builder using
925 /// [`syntax::Config`](crate::util::syntax::Config).
926 ///
927 /// This permits setting things like case insensitivity, Unicode and multi
928 /// line mode.
929 ///
930 /// This syntax configuration only applies when an NFA is built directly
931 /// from a pattern string. If an NFA is built from an HIR, then all syntax
932 /// settings are ignored.
933 ///
934 /// # Example
935 ///
936 /// ```
937 /// use regex_automata::{nfa::thompson::NFA, util::syntax};
938 ///
939 /// let syntax_config = syntax::Config::new().unicode(false);
940 /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?;
941 /// // If Unicode were enabled, the number of states would be much bigger.
942 /// assert!(nfa.states().len() < 15);
943 ///
944 /// # Ok::<(), Box<dyn std::error::Error>>(())
945 /// ```
946 pub fn syntax(
947 &mut self,
948 config: crate::util::syntax::Config,
949 ) -> &mut Compiler {
950 config.apply(&mut self.parser);
951 self
952 }
953}
954
955impl Compiler {
    /// Compile the sequence of HIR expressions given. Pattern IDs are
    /// allocated starting from 0, in correspondence with the slice given.
    ///
    /// It is legal to provide an empty slice. In that case, the NFA returned
    /// has no patterns and will never match anything.
    fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> {
        if exprs.len() > PatternID::LIMIT {
            return Err(BuildError::too_many_patterns(exprs.len()));
        }
        // Capture states are unsupported in reverse NFAs, so reject the
        // combination up front.
        if self.config.get_reverse()
            && self.config.get_which_captures().is_any()
        {
            return Err(BuildError::unsupported_captures());
        }

        // Reset the shared builder and forward the relevant configuration
        // options before any states are added.
        self.builder.borrow_mut().clear();
        self.builder.borrow_mut().set_utf8(self.config.get_utf8());
        self.builder.borrow_mut().set_reverse(self.config.get_reverse());
        self.builder
            .borrow_mut()
            .set_look_matcher(self.config.get_look_matcher());
        self.builder
            .borrow_mut()
            .set_size_limit(self.config.get_nfa_size_limit())?;

        // We always add an unanchored prefix unless we were specifically told
        // not to (for tests only), or if we know that the regex is anchored
        // for all matches. When an unanchored prefix is not added, then the
        // NFA's anchored and unanchored start states are equivalent.
        let all_anchored = exprs.iter().all(|e| {
            let props = e.borrow().properties();
            if self.config.get_reverse() {
                // A reverse NFA scans from the end of the haystack, so
                // "anchored" means anchored at the end, i.e., '$'.
                props.look_set_suffix().contains(hir::Look::End)
            } else {
                props.look_set_prefix().contains(hir::Look::Start)
            }
        });
        let anchored = !self.config.get_unanchored_prefix() || all_anchored;
        let unanchored_prefix = if anchored {
            self.c_empty()?
        } else {
            // A non-greedy "any byte" loop, i.e., `(?s-u:.)*?`.
            self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
        };

        // Compile each pattern wrapped in the implicit capture group 0 and
        // terminated by its own match state, then join all patterns into a
        // single alternation.
        let compiled = self.c_alt_iter(exprs.iter().map(|e| {
            let _ = self.start_pattern()?;
            let one = self.c_cap(0, None, e.borrow())?;
            let match_state_id = self.add_match()?;
            self.patch(one.end, match_state_id)?;
            let _ = self.finish_pattern(one.start)?;
            Ok(ThompsonRef { start: one.start, end: match_state_id })
        }))?;
        // Hang the alternation of all patterns off the (possibly empty)
        // unanchored prefix.
        self.patch(unanchored_prefix.end, compiled.start)?;
        let nfa = self
            .builder
            .borrow_mut()
            .build(compiled.start, unanchored_prefix.start)?;

        debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
        Ok(nfa)
    }
1017
1018 /// Compile an arbitrary HIR expression.
1019 fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> {
1020 use regex_syntax::hir::{Class, HirKind::*};
1021
1022 match *expr.kind() {
1023 Empty => self.c_empty(),
1024 Literal(hir::Literal(ref bytes)) => self.c_literal(bytes),
1025 Class(Class::Bytes(ref c)) => self.c_byte_class(c),
1026 Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
1027 Look(ref look) => self.c_look(look),
1028 Repetition(ref rep) => self.c_repetition(rep),
1029 Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub),
1030 Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))),
1031 Alternation(ref es) => self.c_alt_slice(es),
1032 }
1033 }
1034
1035 /// Compile a concatenation of the sub-expressions yielded by the given
1036 /// iterator. If the iterator yields no elements, then this compiles down
1037 /// to an "empty" state that always matches.
1038 ///
1039 /// If the compiler is in reverse mode, then the expressions given are
1040 /// automatically compiled in reverse.
1041 fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1042 where
1043 I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
1044 {
1045 let first = if self.is_reverse() { it.next_back() } else { it.next() };
1046 let ThompsonRef { start, mut end } = match first {
1047 Some(result) => result?,
1048 None => return self.c_empty(),
1049 };
1050 loop {
1051 let next =
1052 if self.is_reverse() { it.next_back() } else { it.next() };
1053 let compiled = match next {
1054 Some(result) => result?,
1055 None => break,
1056 };
1057 self.patch(end, compiled.start)?;
1058 end = compiled.end;
1059 }
1060 Ok(ThompsonRef { start, end })
1061 }
1062
1063 /// Compile an alternation of the given HIR values.
1064 ///
1065 /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead
1066 /// of an iterator of compiled NFA sub-graphs. The point of accepting a
1067 /// slice here is that it opens up some optimization opportunities. For
1068 /// example, if all of the HIR values are literals, then this routine might
1069 /// re-shuffle them to make NFA epsilon closures substantially faster.
1070 fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> {
1071 // self.c_alt_iter(exprs.iter().map(|e| self.c(e)))
1072 let literal_count = exprs
1073 .iter()
1074 .filter(|e| {
1075 matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_)))
1076 })
1077 .count();
1078 if literal_count <= 1 || literal_count < exprs.len() {
1079 return self.c_alt_iter(exprs.iter().map(|e| self.c(e)));
1080 }
1081
1082 let mut trie = if self.is_reverse() {
1083 LiteralTrie::reverse()
1084 } else {
1085 LiteralTrie::forward()
1086 };
1087 for expr in exprs.iter() {
1088 let literal = match *expr.kind() {
1089 hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes,
1090 _ => unreachable!(),
1091 };
1092 trie.add(literal)?;
1093 }
1094 trie.compile(&mut self.builder.borrow_mut())
1095 }
1096
1097 /// Compile an alternation, where each element yielded by the given
1098 /// iterator represents an item in the alternation. If the iterator yields
1099 /// no elements, then this compiles down to a "fail" state.
1100 ///
1101 /// In an alternation, expressions appearing earlier are "preferred" at
1102 /// match time over expressions appearing later. At least, this is true
1103 /// when using "leftmost first" match semantics. (If "leftmost longest" are
1104 /// ever added in the future, then this preference order of priority would
1105 /// not apply in that mode.)
1106 fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1107 where
1108 I: Iterator<Item = Result<ThompsonRef, BuildError>>,
1109 {
1110 let first = match it.next() {
1111 None => return self.c_fail(),
1112 Some(result) => result?,
1113 };
1114 let second = match it.next() {
1115 None => return Ok(first),
1116 Some(result) => result?,
1117 };
1118
1119 let union = self.add_union()?;
1120 let end = self.add_empty()?;
1121 self.patch(union, first.start)?;
1122 self.patch(first.end, end)?;
1123 self.patch(union, second.start)?;
1124 self.patch(second.end, end)?;
1125 for result in it {
1126 let compiled = result?;
1127 self.patch(union, compiled.start)?;
1128 self.patch(compiled.end, end)?;
1129 }
1130 Ok(ThompsonRef { start: union, end })
1131 }
1132
1133 /// Compile the given capture sub-expression. `expr` should be the
1134 /// sub-expression contained inside the capture. If "capture" states are
1135 /// enabled, then they are added as appropriate.
1136 ///
1137 /// This accepts the pieces of a capture instead of a `hir::Capture` so
1138 /// that it's easy to manufacture a "fake" group when necessary, e.g., for
1139 /// adding the entire pattern as if it were a group in order to create
1140 /// appropriate "capture" states in the NFA.
1141 fn c_cap(
1142 &self,
1143 index: u32,
1144 name: Option<&str>,
1145 expr: &Hir,
1146 ) -> Result<ThompsonRef, BuildError> {
1147 match self.config.get_which_captures() {
1148 // No capture states means we always skip them.
1149 WhichCaptures::None => return self.c(expr),
1150 // Implicit captures states means we only add when index==0 since
1151 // index==0 implies the group is implicit.
1152 WhichCaptures::Implicit if index > 0 => return self.c(expr),
1153 _ => {}
1154 }
1155
1156 let start = self.add_capture_start(index, name)?;
1157 let inner = self.c(expr)?;
1158 let end = self.add_capture_end(index)?;
1159 self.patch(start, inner.start)?;
1160 self.patch(inner.end, end)?;
1161 Ok(ThompsonRef { start, end })
1162 }
1163
1164 /// Compile the given repetition expression. This handles all types of
1165 /// repetitions and greediness.
1166 fn c_repetition(
1167 &self,
1168 rep: &hir::Repetition,
1169 ) -> Result<ThompsonRef, BuildError> {
1170 match (rep.min, rep.max) {
1171 (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy),
1172 (min, None) => self.c_at_least(&rep.sub, rep.greedy, min),
1173 (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min),
1174 (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max),
1175 }
1176 }
1177
    /// Compile the given expression such that it matches at least `min` times,
    /// but no more than `max` times.
    ///
    /// When `greedy` is true, then the preference is for the expression to
    /// match as much as possible. Otherwise, it will match as little as
    /// possible.
    fn c_bounded(
        &self,
        expr: &Hir,
        greedy: bool,
        min: u32,
        max: u32,
    ) -> Result<ThompsonRef, BuildError> {
        // The first `min` copies are mandatory, so compile them as a plain
        // concatenation.
        let prefix = self.c_exactly(expr, min)?;
        if min == max {
            return Ok(prefix);
        }

        // It is tempting here to compile the rest here as a concatenation
        // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
        // were `aaa?a?a?`. The problem here is that it leads to this program:
        //
        //     >000000: 61 => 01
        //      000001: 61 => 02
        //      000002: union(03, 04)
        //      000003: 61 => 04
        //      000004: union(05, 06)
        //      000005: 61 => 06
        //      000006: union(07, 08)
        //      000007: 61 => 08
        //      000008: MATCH
        //
        // And effectively, once you hit state 2, the epsilon closure will
        // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
        // to instead compile it like so:
        //
        //     >000000: 61 => 01
        //      000001: 61 => 02
        //      000002: union(03, 08)
        //      000003: 61 => 04
        //      000004: union(05, 08)
        //      000005: 61 => 06
        //      000006: union(07, 08)
        //      000007: 61 => 08
        //      000008: MATCH
        //
        // So that the epsilon closure of state 2 is now just 3 and 8.
        let empty = self.add_empty()?;
        let mut prev_end = prefix.end;
        // Each of the `max - min` optional copies either continues into the
        // next copy or jumps straight to the shared exit state `empty`. The
        // order of the `patch` calls on each union determines preference
        // (greedy unions prefer the first added transition; reverse unions
        // prefer the last).
        for _ in min..max {
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            let compiled = self.c(expr)?;
            self.patch(prev_end, union)?;
            self.patch(union, compiled.start)?;
            self.patch(union, empty)?;
            prev_end = compiled.end;
        }
        self.patch(prev_end, empty)?;
        Ok(ThompsonRef { start: prefix.start, end: empty })
    }
1242
    /// Compile the given expression such that it may be matched `n` or more
    /// times, where `n` can be any integer. (Although a particularly large
    /// integer is likely to run afoul of any configured size limits.)
    ///
    /// When `greedy` is true, then the preference is for the expression to
    /// match as much as possible. Otherwise, it will match as little as
    /// possible.
    fn c_at_least(
        &self,
        expr: &Hir,
        greedy: bool,
        n: u32,
    ) -> Result<ThompsonRef, BuildError> {
        if n == 0 {
            // When the expression cannot match the empty string, then we
            // can get away with something much simpler: just one 'alt'
            // instruction that optionally repeats itself. But if the expr
            // can match the empty string... see below.
            if expr.properties().minimum_len().map_or(false, |len| len > 0) {
                let union = if greedy {
                    self.add_union()
                } else {
                    self.add_union_reverse()
                }?;
                let compiled = self.c(expr)?;
                self.patch(union, compiled.start)?;
                self.patch(compiled.end, union)?;
                // The loop state serves as both entry and exit.
                return Ok(ThompsonRef { start: union, end: union });
            }

            // What's going on here? Shouldn't x* be simpler than this? It
            // turns out that when implementing leftmost-first (Perl-like)
            // match semantics, x* results in an incorrect preference order
            // when computing the transitive closure of states if and only if
            // 'x' can match the empty string. So instead, we compile x* as
            // (x+)?, which preserves the correct preference order.
            //
            // See: https://github.com/rust-lang/regex/issues/779
            let compiled = self.c(expr)?;
            // `plus` loops back to re-enter the expression: this is the `x+`.
            let plus = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(compiled.end, plus)?;
            self.patch(plus, compiled.start)?;

            // `question` makes the whole `x+` optional: this is the `(..)?`.
            let question = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            let empty = self.add_empty()?;
            self.patch(question, compiled.start)?;
            self.patch(question, empty)?;
            self.patch(plus, empty)?;
            Ok(ThompsonRef { start: question, end: empty })
        } else if n == 1 {
            // `x+` is one copy of `x` followed by a union that loops back.
            let compiled = self.c(expr)?;
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(compiled.end, union)?;
            self.patch(union, compiled.start)?;
            Ok(ThompsonRef { start: compiled.start, end: union })
        } else {
            // `x{n,}` is `x{n-1}` followed by `x+`.
            let prefix = self.c_exactly(expr, n - 1)?;
            let last = self.c(expr)?;
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(prefix.end, last.start)?;
            self.patch(last.end, union)?;
            self.patch(union, last.start)?;
            Ok(ThompsonRef { start: prefix.start, end: union })
        }
    }
1324
1325 /// Compile the given expression such that it may be matched zero or one
1326 /// times.
1327 ///
1328 /// When `greedy` is true, then the preference is for the expression to
1329 /// match as much as possible. Otherwise, it will match as little as
1330 /// possible.
1331 fn c_zero_or_one(
1332 &self,
1333 expr: &Hir,
1334 greedy: bool,
1335 ) -> Result<ThompsonRef, BuildError> {
1336 let union =
1337 if greedy { self.add_union() } else { self.add_union_reverse() }?;
1338 let compiled = self.c(expr)?;
1339 let empty = self.add_empty()?;
1340 self.patch(union, compiled.start)?;
1341 self.patch(union, empty)?;
1342 self.patch(compiled.end, empty)?;
1343 Ok(ThompsonRef { start: union, end: empty })
1344 }
1345
1346 /// Compile the given HIR expression exactly `n` times.
1347 fn c_exactly(
1348 &self,
1349 expr: &Hir,
1350 n: u32,
1351 ) -> Result<ThompsonRef, BuildError> {
1352 let it = (0..n).map(|_| self.c(expr));
1353 self.c_concat(it)
1354 }
1355
1356 /// Compile the given byte oriented character class.
1357 ///
1358 /// This uses "sparse" states to represent an alternation between ranges in
1359 /// this character class. We can use "sparse" states instead of stitching
1360 /// together a "union" state because all ranges in a character class have
1361 /// equal priority *and* are non-overlapping (thus, only one can match, so
1362 /// there's never a question of priority in the first place). This saves a
1363 /// fair bit of overhead when traversing an NFA.
1364 ///
1365 /// This routine compiles an empty character class into a "fail" state.
1366 fn c_byte_class(
1367 &self,
1368 cls: &hir::ClassBytes,
1369 ) -> Result<ThompsonRef, BuildError> {
1370 let end = self.add_empty()?;
1371 let mut trans = Vec::with_capacity(cls.ranges().len());
1372 for r in cls.iter() {
1373 trans.push(Transition {
1374 start: r.start(),
1375 end: r.end(),
1376 next: end,
1377 });
1378 }
1379 Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
1380 }
1381
1382 /// Compile the given Unicode character class.
1383 ///
1384 /// This routine specifically tries to use various types of compression,
1385 /// since UTF-8 automata of large classes can get quite large. The specific
1386 /// type of compression used depends on forward vs reverse compilation, and
1387 /// whether NFA shrinking is enabled or not.
1388 ///
1389 /// Aside from repetitions causing lots of repeat group, this is like the
1390 /// single most expensive part of regex compilation. Therefore, a large part
1391 /// of the expense of compilation may be reduce by disabling Unicode in the
1392 /// pattern.
1393 ///
1394 /// This routine compiles an empty character class into a "fail" state.
1395 fn c_unicode_class(
1396 &self,
1397 cls: &hir::ClassUnicode,
1398 ) -> Result<ThompsonRef, BuildError> {
1399 // If all we have are ASCII ranges wrapped in a Unicode package, then
1400 // there is zero reason to bring out the big guns. We can fit all ASCII
1401 // ranges within a single sparse state.
1402 if cls.is_ascii() {
1403 let end = self.add_empty()?;
1404 let mut trans = Vec::with_capacity(cls.ranges().len());
1405 for r in cls.iter() {
1406 // The unwraps below are OK because we've verified that this
1407 // class only contains ASCII codepoints.
1408 trans.push(Transition {
1409 // FIXME(1.59): use the 'TryFrom<char> for u8' impl.
1410 start: u8::try_from(u32::from(r.start())).unwrap(),
1411 end: u8::try_from(u32::from(r.end())).unwrap(),
1412 next: end,
1413 });
1414 }
1415 Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
1416 } else if self.is_reverse() {
1417 if !self.config.get_shrink() {
1418 // When we don't want to spend the extra time shrinking, we
1419 // compile the UTF-8 automaton in reverse using something like
1420 // the "naive" approach, but will attempt to re-use common
1421 // suffixes.
1422 self.c_unicode_class_reverse_with_suffix(cls)
1423 } else {
1424 // When we want to shrink our NFA for reverse UTF-8 automata,
1425 // we cannot feed UTF-8 sequences directly to the UTF-8
1426 // compiler, since the UTF-8 compiler requires all sequences
1427 // to be lexicographically sorted. Instead, we organize our
1428 // sequences into a range trie, which can then output our
1429 // sequences in the correct order. Unfortunately, building the
1430 // range trie is fairly expensive (but not nearly as expensive
1431 // as building a DFA). Hence the reason why the 'shrink' option
1432 // exists, so that this path can be toggled off. For example,
1433 // we might want to turn this off if we know we won't be
1434 // compiling a DFA.
1435 let mut trie = self.trie_state.borrow_mut();
1436 trie.clear();
1437
1438 for rng in cls.iter() {
1439 for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
1440 seq.reverse();
1441 trie.insert(seq.as_slice());
1442 }
1443 }
1444 let mut builder = self.builder.borrow_mut();
1445 let mut utf8_state = self.utf8_state.borrow_mut();
1446 let mut utf8c =
1447 Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
1448 trie.iter(|seq| {
1449 utf8c.add(&seq)?;
1450 Ok(())
1451 })?;
1452 utf8c.finish()
1453 }
1454 } else {
1455 // In the forward direction, we always shrink our UTF-8 automata
1456 // because we can stream it right into the UTF-8 compiler. There
1457 // is almost no downside (in either memory or time) to using this
1458 // approach.
1459 let mut builder = self.builder.borrow_mut();
1460 let mut utf8_state = self.utf8_state.borrow_mut();
1461 let mut utf8c =
1462 Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
1463 for rng in cls.iter() {
1464 for seq in Utf8Sequences::new(rng.start(), rng.end()) {
1465 utf8c.add(seq.as_slice())?;
1466 }
1467 }
1468 utf8c.finish()
1469 }
1470
1471 // For reference, the code below is the "naive" version of compiling a
1472 // UTF-8 automaton. It is deliciously simple (and works for both the
1473 // forward and reverse cases), but will unfortunately produce very
1474 // large NFAs. When compiling a forward automaton, the size difference
1475 // can sometimes be an order of magnitude. For example, the '\w' regex
1476 // will generate about ~3000 NFA states using the naive approach below,
1477 // but only 283 states when using the approach above. This is because
1478 // the approach above actually compiles a *minimal* (or near minimal,
1479 // because of the bounded hashmap for reusing equivalent states) UTF-8
1480 // automaton.
1481 //
1482 // The code below is kept as a reference point in order to make it
1483 // easier to understand the higher level goal here. Although, it will
1484 // almost certainly bit-rot, so keep that in mind. Also, if you try to
1485 // use it, some of the tests in this module will fail because they look
1486 // for terser byte code produce by the more optimized handling above.
1487 // But the integration test suite should still pass.
1488 //
1489 // One good example of the substantial difference this can make is to
1490 // compare and contrast performance of the Pike VM when the code below
1491 // is active vs the code above. Here's an example to try:
1492 //
1493 // regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
1494 //
1495 // With Unicode classes generated below, this search takes about 45s on
1496 // my machine. But with the compressed version above, the search takes
1497 // only around 1.4s. The NFA is also 20% smaller. This is in part due
1498 // to the compression, but also because of the utilization of 'sparse'
1499 // NFA states. They lead to much less state shuffling during the NFA
1500 // search.
1501 /*
1502 let it = cls
1503 .iter()
1504 .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
1505 .map(|seq| {
1506 let it = seq
1507 .as_slice()
1508 .iter()
1509 .map(|rng| self.c_range(rng.start, rng.end));
1510 self.c_concat(it)
1511 });
1512 self.c_alt_iter(it)
1513 */
1514 }
1515
    /// Compile the given Unicode character class in reverse with suffix
    /// caching.
    ///
    /// This is a "quick" way to compile large Unicode classes into reverse
    /// UTF-8 automata while doing a small amount of compression on that
    /// automata by reusing common suffixes.
    ///
    /// A more comprehensive compression scheme can be accomplished by using
    /// a range trie to efficiently sort a reverse sequence of UTF-8 byte
    /// ranges, and then use Daciuk's algorithm via `Utf8Compiler`.
    ///
    /// This is the technique used when "NFA shrinking" is disabled.
    ///
    /// (This also tries to use "sparse" states where possible, just like
    /// `c_byte_class` does.)
    fn c_unicode_class_reverse_with_suffix(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, BuildError> {
        // N.B. It would likely be better to cache common *prefixes* in the
        // reverse direction, but it's not quite clear how to do that. The
        // advantage of caching suffixes is that it does give us a win, and
        // has a very small additional overhead.
        let mut cache = self.utf8_suffix.borrow_mut();
        cache.clear();

        let union = self.add_union()?;
        let alt_end = self.add_empty()?;
        for urng in cls.iter() {
            for seq in Utf8Sequences::new(urng.start(), urng.end()) {
                // Build this UTF-8 sequence from its tail toward its head:
                // `end` starts at the shared exit state and each new range
                // state is chained in front of it, which is what reverses
                // the byte order of the sequence.
                let mut end = alt_end;
                for brng in seq.as_slice() {
                    // The cache key includes the state we'd chain onto
                    // (`from`), so a hit means this exact suffix was already
                    // compiled and can be shared.
                    let key = Utf8SuffixKey {
                        from: end,
                        start: brng.start,
                        end: brng.end,
                    };
                    let hash = cache.hash(&key);
                    if let Some(id) = cache.get(&key, hash) {
                        end = id;
                        continue;
                    }

                    let compiled = self.c_range(brng.start, brng.end)?;
                    self.patch(compiled.end, end)?;
                    end = compiled.start;
                    cache.set(key, hash, end);
                }
                // `end` is now the head of this (reversed) sequence; hang it
                // off the alternation's union state.
                self.patch(union, end)?;
            }
        }
        Ok(ThompsonRef { start: union, end: alt_end })
    }
1569
    /// Compile the given HIR look-around assertion to an NFA look-around
    /// assertion.
    ///
    /// This is a 1:1 mapping from the HIR's look-around variants to this
    /// crate's. Note that when compiling in reverse mode, `add_look` flips
    /// the assertion (e.g., `Start` becomes `End`); no flipping happens here.
    fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> {
        let look = match *anchor {
            hir::Look::Start => Look::Start,
            hir::Look::End => Look::End,
            hir::Look::StartLF => Look::StartLF,
            hir::Look::EndLF => Look::EndLF,
            hir::Look::StartCRLF => Look::StartCRLF,
            hir::Look::EndCRLF => Look::EndCRLF,
            hir::Look::WordAscii => Look::WordAscii,
            hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
            hir::Look::WordUnicode => Look::WordUnicode,
            hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
            hir::Look::WordStartAscii => Look::WordStartAscii,
            hir::Look::WordEndAscii => Look::WordEndAscii,
            hir::Look::WordStartUnicode => Look::WordStartUnicode,
            hir::Look::WordEndUnicode => Look::WordEndUnicode,
            hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii,
            hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii,
            hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode,
            hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode,
        };
        // A look-around state is a single state: start and end coincide.
        let id = self.add_look(look)?;
        Ok(ThompsonRef { start: id, end: id })
    }
1596
1597 /// Compile the given byte string to a concatenation of bytes.
1598 fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> {
1599 self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b)))
1600 }
1601
1602 /// Compile a "range" state with one transition that may only be followed
1603 /// if the input byte is in the (inclusive) range given.
1604 ///
1605 /// Both the `start` and `end` locations point to the state created.
1606 /// Callers will likely want to keep the `start`, but patch the `end` to
1607 /// point to some other state.
1608 fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> {
1609 let id = self.add_range(start, end)?;
1610 Ok(ThompsonRef { start: id, end: id })
1611 }
1612
1613 /// Compile an "empty" state with one unconditional epsilon transition.
1614 ///
1615 /// Both the `start` and `end` locations point to the state created.
1616 /// Callers will likely want to keep the `start`, but patch the `end` to
1617 /// point to some other state.
1618 fn c_empty(&self) -> Result<ThompsonRef, BuildError> {
1619 let id = self.add_empty()?;
1620 Ok(ThompsonRef { start: id, end: id })
1621 }
1622
1623 /// Compile a "fail" state that can never have any outgoing transitions.
1624 fn c_fail(&self) -> Result<ThompsonRef, BuildError> {
1625 let id = self.add_fail()?;
1626 Ok(ThompsonRef { start: id, end: id })
1627 }
1628
1629 // The below helpers are meant to be simple wrappers around the
1630 // corresponding Builder methods. For the most part, they let us write
1631 // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where
1632 // the latter is a mouthful. Some of the methods do inject a little bit
1633 // of extra logic. e.g., Flipping look-around operators when compiling in
1634 // reverse mode.
1635
1636 fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> {
1637 self.builder.borrow_mut().patch(from, to)
1638 }
1639
1640 fn start_pattern(&self) -> Result<PatternID, BuildError> {
1641 self.builder.borrow_mut().start_pattern()
1642 }
1643
1644 fn finish_pattern(
1645 &self,
1646 start_id: StateID,
1647 ) -> Result<PatternID, BuildError> {
1648 self.builder.borrow_mut().finish_pattern(start_id)
1649 }
1650
1651 fn add_empty(&self) -> Result<StateID, BuildError> {
1652 self.builder.borrow_mut().add_empty()
1653 }
1654
1655 fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> {
1656 self.builder.borrow_mut().add_range(Transition {
1657 start,
1658 end,
1659 next: StateID::ZERO,
1660 })
1661 }
1662
1663 fn add_sparse(
1664 &self,
1665 ranges: Vec<Transition>,
1666 ) -> Result<StateID, BuildError> {
1667 self.builder.borrow_mut().add_sparse(ranges)
1668 }
1669
1670 fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> {
1671 if self.is_reverse() {
1672 look = look.reversed();
1673 }
1674 self.builder.borrow_mut().add_look(StateID::ZERO, look)
1675 }
1676
1677 fn add_union(&self) -> Result<StateID, BuildError> {
1678 self.builder.borrow_mut().add_union(vec![])
1679 }
1680
1681 fn add_union_reverse(&self) -> Result<StateID, BuildError> {
1682 self.builder.borrow_mut().add_union_reverse(vec![])
1683 }
1684
1685 fn add_capture_start(
1686 &self,
1687 capture_index: u32,
1688 name: Option<&str>,
1689 ) -> Result<StateID, BuildError> {
1690 let name = name.map(Arc::from);
1691 self.builder.borrow_mut().add_capture_start(
1692 StateID::ZERO,
1693 capture_index,
1694 name,
1695 )
1696 }
1697
1698 fn add_capture_end(
1699 &self,
1700 capture_index: u32,
1701 ) -> Result<StateID, BuildError> {
1702 self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index)
1703 }
1704
1705 fn add_fail(&self) -> Result<StateID, BuildError> {
1706 self.builder.borrow_mut().add_fail()
1707 }
1708
1709 fn add_match(&self) -> Result<StateID, BuildError> {
1710 self.builder.borrow_mut().add_match()
1711 }
1712
    /// Returns true if and only if this compiler is configured to build a
    /// reverse NFA. Used by `add_look` to flip look-around assertions.
    fn is_reverse(&self) -> bool {
        self.config.get_reverse()
    }
1716}
1717
1718/// A value that represents the result of compiling a sub-expression of a
1719/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
1720/// has an initial state at `start` and a final state at `end`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ThompsonRef {
    /// The initial (entry) state of this sub-graph.
    pub(crate) start: StateID,
    /// The final (exit) state of this sub-graph.
    pub(crate) end: StateID,
}
1726
1727/// A UTF-8 compiler based on Daciuk's algorithm for compiling minimal DFAs
1728/// from a lexicographically sorted sequence of strings in linear time.
1729///
1730/// The trick here is that any Unicode codepoint range can be converted to
1731/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
1732/// together via an alternation is trivial, and indeed, it works. However,
1733/// there is a lot of redundant structure in many UTF-8 automatons. Since our
1734/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
/// to build nearly minimal DFAs in linear time. (They are not guaranteed to
/// be minimal because we use a bounded cache of previously built DFA states.)
1737///
1738/// The drawback is that this sadly doesn't work for reverse automata, since
1739/// the ranges are no longer in lexicographic order. For that, we invented the
1740/// range trie (which gets its own module). Once a range trie is built, we then
1741/// use this same Utf8Compiler to build a reverse UTF-8 automaton.
1742///
1743/// The high level idea is described here:
1744/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures
1745///
1746/// There is also another implementation of this in the `fst` crate.
#[derive(Debug)]
struct Utf8Compiler<'a> {
    /// The NFA builder that compiled states are added to.
    builder: &'a mut Builder,
    /// Scratch space (the stack of uncompiled nodes and the bounded cache
    /// of compiled states) that is reused across compilations.
    state: &'a mut Utf8State,
    /// The final state that every path through the compiled UTF-8
    /// automaton leads to. Created in `Utf8Compiler::new`.
    target: StateID,
}
1753
/// Reusable scratch space for compiling UTF-8 automatons.
#[derive(Clone, Debug)]
struct Utf8State {
    /// A bounded cache mapping previously compiled transition sets to
    /// their state IDs, so that equivalent states are reused.
    compiled: Utf8BoundedMap,
    /// A stack of nodes that have not yet been compiled into NFA states.
    uncompiled: Vec<Utf8Node>,
}
1759
/// A single not-yet-compiled node in the UTF-8 automaton.
#[derive(Clone, Debug)]
struct Utf8Node {
    /// Transitions whose target states are already known.
    trans: Vec<Transition>,
    /// The most recently added byte range, whose target state is not yet
    /// known. It is completed by `Utf8Node::set_last_transition`.
    last: Option<Utf8LastTransition>,
}
1765
/// A pending byte-range transition whose target state is not yet known.
#[derive(Clone, Debug)]
struct Utf8LastTransition {
    /// The start of the pending byte range.
    start: u8,
    /// The end of the pending byte range.
    end: u8,
}
1771
1772impl Utf8State {
1773 fn new() -> Utf8State {
1774 Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
1775 }
1776
1777 fn clear(&mut self) {
1778 self.compiled.clear();
1779 self.uncompiled.clear();
1780 }
1781}
1782
impl<'a> Utf8Compiler<'a> {
    /// Create a new UTF-8 compiler that adds states to `builder`, reusing
    /// the scratch space in `state`. A fresh final state is created here;
    /// every path through the compiled automaton leads to it.
    fn new(
        builder: &'a mut Builder,
        state: &'a mut Utf8State,
    ) -> Result<Utf8Compiler<'a>, BuildError> {
        let target = builder.add_empty()?;
        // Wipe any scratch data left over from a previous compilation.
        state.clear();
        let mut utf8c = Utf8Compiler { builder, state, target };
        // Seed the uncompiled stack with the root node.
        utf8c.add_empty();
        Ok(utf8c)
    }

    /// Finish the automaton by compiling all remaining uncompiled nodes,
    /// returning a reference to its start and final states.
    fn finish(&mut self) -> Result<ThompsonRef, BuildError> {
        self.compile_from(0)?;
        let node = self.pop_root();
        let start = self.compile(node)?;
        Ok(ThompsonRef { start, end: self.target })
    }

    /// Add one UTF-8 sequence (one byte range per encoded byte position)
    /// to the automaton. Sequences must be added in lexicographic order.
    fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> {
        // Count how many leading ranges match the pending transitions of
        // the nodes currently on the uncompiled stack; that shared prefix
        // can be kept as-is.
        let prefix_len = ranges
            .iter()
            .zip(&self.state.uncompiled)
            .take_while(|&(range, node)| {
                node.last.as_ref().map_or(false, |t| {
                    (t.start, t.end) == (range.start, range.end)
                })
            })
            .count();
        assert!(prefix_len < ranges.len());
        // Everything beyond the shared prefix diverges from this sequence,
        // so those nodes can be compiled into NFA states now.
        self.compile_from(prefix_len)?;
        self.add_suffix(&ranges[prefix_len..]);
        Ok(())
    }

    /// Compile all uncompiled nodes above stack depth `from`, threading
    /// each newly compiled state into its parent's pending transition.
    fn compile_from(&mut self, from: usize) -> Result<(), BuildError> {
        // The deepest node's pending transition points at the final state.
        let mut next = self.target;
        while from + 1 < self.state.uncompiled.len() {
            let node = self.pop_freeze(next);
            next = self.compile(node)?;
        }
        self.top_last_freeze(next);
        Ok(())
    }

    /// Compile the given transitions into a sparse NFA state, reusing a
    /// cached state when an equivalent one was compiled before.
    fn compile(
        &mut self,
        node: Vec<Transition>,
    ) -> Result<StateID, BuildError> {
        let hash = self.state.compiled.hash(&node);
        if let Some(id) = self.state.compiled.get(&node, hash) {
            return Ok(id);
        }
        let id = self.builder.add_sparse(node.clone())?;
        self.state.compiled.set(node, hash, id);
        Ok(id)
    }

    /// Push the given non-empty suffix of byte ranges onto the uncompiled
    /// stack. The first range becomes the pending transition of the
    /// current top node; each remaining range becomes a new node.
    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
        assert!(!ranges.is_empty());
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        // The top node must not already have a pending transition.
        assert!(self.state.uncompiled[last].last.is_none());
        self.state.uncompiled[last].last = Some(Utf8LastTransition {
            start: ranges[0].start,
            end: ranges[0].end,
        });
        for r in &ranges[1..] {
            self.state.uncompiled.push(Utf8Node {
                trans: vec![],
                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
            });
        }
    }

    /// Push a fresh node with no transitions onto the uncompiled stack.
    fn add_empty(&mut self) {
        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
    }

    /// Pop the top uncompiled node, completing its pending transition so
    /// that it points at `next`, and return its transitions.
    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
        let mut uncompiled = self.state.uncompiled.pop().unwrap();
        uncompiled.set_last_transition(next);
        uncompiled.trans
    }

    /// Pop the root node. The root must be the only node left on the
    /// stack and must not have a pending transition.
    fn pop_root(&mut self) -> Vec<Transition> {
        assert_eq!(self.state.uncompiled.len(), 1);
        assert!(self.state.uncompiled[0].last.is_none());
        self.state.uncompiled.pop().expect("non-empty nodes").trans
    }

    /// Complete the pending transition of the top uncompiled node so that
    /// it points at `next`, without popping the node.
    fn top_last_freeze(&mut self, next: StateID) {
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        self.state.uncompiled[last].set_last_transition(next);
    }
}
1888
1889impl Utf8Node {
1890 fn set_last_transition(&mut self, next: StateID) {
1891 if let Some(last) = self.last.take() {
1892 self.trans.push(Transition {
1893 start: last.start,
1894 end: last.end,
1895 next,
1896 });
1897 }
1898 }
1899}
1900
#[cfg(test)]
mod tests {
    use alloc::vec;

    use crate::{
        nfa::thompson::{SparseTransitions, State},
        util::primitives::SmallIndex,
    };

    use super::*;

    /// Build an NFA from `pattern` with capture states and the unanchored
    /// prefix disabled, so tests can assert on the exact state list.
    fn build(pattern: &str) -> NFA {
        NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build(pattern)
            .unwrap()
    }

    /// Shorthand for creating a `PatternID`.
    fn pid(id: usize) -> PatternID {
        PatternID::new(id).unwrap()
    }

    /// Shorthand for creating a `StateID`.
    fn sid(id: usize) -> StateID {
        StateID::new(id).unwrap()
    }

    /// A state with a single transition on exactly `byte`.
    fn s_byte(byte: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start: byte, end: byte, next };
        State::ByteRange { trans }
    }

    /// A state with a single transition on the byte range `start`-`end`.
    fn s_range(start: u8, end: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start, end, next };
        State::ByteRange { trans }
    }

    /// A state with a sparse set of byte-range transitions.
    fn s_sparse(transitions: &[(u8, u8, usize)]) -> State {
        let transitions = transitions
            .iter()
            .map(|&(start, end, next)| Transition {
                start,
                end,
                next: sid(next),
            })
            .collect();
        State::Sparse(SparseTransitions { transitions })
    }

    /// A look-around assertion state.
    fn s_look(look: Look, next: usize) -> State {
        let next = sid(next);
        State::Look { look, next }
    }

    /// A union state with exactly two alternates.
    fn s_bin_union(alt1: usize, alt2: usize) -> State {
        State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) }
    }

    /// A union state with an arbitrary number of alternates.
    fn s_union(alts: &[usize]) -> State {
        State::Union {
            alternates: alts
                .iter()
                .map(|&id| sid(id))
                .collect::<Vec<StateID>>()
                .into_boxed_slice(),
        }
    }

    /// A capture state for the given group index and slot.
    fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State {
        State::Capture {
            next: sid(next),
            pattern_id: pid(pattern),
            group_index: SmallIndex::new(index).unwrap(),
            slot: SmallIndex::new(slot).unwrap(),
        }
    }

    /// A state that never matches anything.
    fn s_fail() -> State {
        State::Fail
    }

    /// A match state for the given pattern.
    fn s_match(id: usize) -> State {
        State::Match { pattern_id: pid(id) }
    }

    // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
    // prefix.
    #[test]
    fn compile_unanchored_prefix() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_no_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)]
        );
    }

    #[test]
    fn compile_yes_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_look(Look::End, 4),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_yes_reverse_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                // Anchors get flipped in a reverse automaton.
                s_look(Look::End, 4),
                s_match(0),
            ],
        );
    }

    #[test]
    fn compile_no_reverse_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                // Anchors get flipped in a reverse automaton.
                s_look(Look::Start, 1),
                s_byte(b'a', 2),
                s_match(0),
            ],
        );
    }

    #[test]
    fn compile_empty() {
        assert_eq!(build("").states(), &[s_match(0),]);
    }

    #[test]
    fn compile_literal() {
        assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]);
        assert_eq!(
            build("ab").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
        );
        // FIX: the literal here must be '☃' (U+2603), whose UTF-8 encoding
        // is \xE2\x98\x83 — matching the expected states below. The previous
        // literal 'ā' (U+0101) encodes to \xC4\x81 and contradicted them.
        assert_eq!(
            build("☃").states(),
            &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
        );

        // Check that non-UTF-8 literals work.
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .syntax(crate::util::syntax::Config::new().utf8(false))
            .build(r"(?-u)\xFF")
            .unwrap();
        assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]);
    }

    #[test]
    fn compile_class_ascii() {
        assert_eq!(
            build(r"[a-z]").states(),
            &[s_range(b'a', b'z', 1), s_match(0),]
        );
        assert_eq!(
            build(r"[x-za-c]").states(),
            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
        );
    }

    #[test]
    #[cfg(not(miri))]
    fn compile_class_unicode() {
        assert_eq!(
            build(r"[\u03B1-\u03B4]").states(),
            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
        );
        assert_eq!(
            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(),
            &[
                s_range(0xB1, 0xB4, 5),
                s_range(0x99, 0x9E, 5),
                s_byte(0xA4, 1),
                s_byte(0x9F, 2),
                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
                s_match(0),
            ]
        );
        // FIX: as in compile_literal, the class member must be '☃'
        // (UTF-8 \xE2\x98\x83) to agree with the expected states below.
        assert_eq!(
            build(r"[a-z☃]").states(),
            &[
                s_byte(0x83, 3),
                s_byte(0x98, 0),
                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_repetition() {
        assert_eq!(
            build(r"a?").states(),
            &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),]
        );
        assert_eq!(
            build(r"a??").states(),
            &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),]
        );
    }

    #[test]
    fn compile_group() {
        assert_eq!(
            build(r"ab+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)]
        );
    }

    #[test]
    fn compile_alternation() {
        assert_eq!(
            build(r"a|b").states(),
            &[s_range(b'a', b'b', 1), s_match(0)]
        );
        assert_eq!(
            build(r"ab|cd").states(),
            &[
                s_byte(b'b', 3),
                s_byte(b'd', 3),
                s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]),
                s_match(0)
            ],
        );
        assert_eq!(
            build(r"|b").states(),
            &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)]
        );
        assert_eq!(
            build(r"a|").states(),
            &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)]
        );
    }

    // This tests the use of a non-binary union, i.e., a state with more than
    // 2 unconditional epsilon transitions. The only place they tend to appear
    // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union'
    // and 'sparse' tend to cover all other cases of alternation.
    #[test]
    fn compile_non_binary_union() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .reverse(true)
                    .shrink(false)
                    .unanchored_prefix(false),
            )
            .build(r"[\u1000\u2000\u3000]")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_union(&[3, 6, 9]),
                s_byte(0xE1, 10),
                s_byte(0x80, 1),
                s_byte(0x80, 2),
                s_byte(0xE2, 10),
                s_byte(0x80, 4),
                s_byte(0x80, 5),
                s_byte(0xE3, 10),
                s_byte(0x80, 7),
                s_byte(0x80, 8),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_many_start_pattern() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build_many(&["a", "b"])
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_byte(b'a', 1),
                s_match(0),
                s_byte(b'b', 3),
                s_match(1),
                s_bin_union(0, 2),
            ]
        );
        assert_eq!(nfa.start_anchored().as_usize(), 4);
        assert_eq!(nfa.start_unanchored().as_usize(), 4);
        // Test that the start states for each individual pattern are correct.
        assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0));
        assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2));
    }

    // This tests that our compiler can handle an empty character class. At the
    // time of writing, the regex parser forbids it, so the only way to test it
    // is to provide a hand written HIR.
    #[test]
    fn empty_class_bytes() {
        use regex_syntax::hir::{Class, ClassBytes, Hir};

        let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    // Like empty_class_bytes, but for a Unicode class.
    #[test]
    fn empty_class_unicode() {
        use regex_syntax::hir::{Class, ClassUnicode, Hir};

        let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    #[test]
    fn compile_captures_all() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::All),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_cap(3, 0, 1, 2),
                s_byte(b'b', 4),
                s_cap(5, 0, 1, 3),
                s_byte(b'c', 6),
                s_cap(7, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(2, ginfo.all_group_len());
    }

    #[test]
    fn compile_captures_implicit() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::Implicit),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_byte(b'b', 3),
                s_byte(b'c', 4),
                s_cap(5, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(1, ginfo.all_group_len());
    }

    #[test]
    fn compile_captures_none() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::None),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)]
        );
        let ginfo = nfa.group_info();
        assert_eq!(0, ginfo.all_group_len());
    }
}
2368}