regex_automata/util/
look.rs

1/*!
2Types and routines for working with look-around assertions.
3
This module principally defines three types:
5
6* [`Look`] enumerates all of the assertions supported by this crate.
7* [`LookSet`] provides a way to efficiently store a set of [`Look`] values.
8* [`LookMatcher`] provides routines for checking whether a `Look` or a
9`LookSet` matches at a particular position in a haystack.
10*/
11
12// LAMENTATION: Sadly, a lot of the API of `Look` and `LookSet` were basically
13// copied verbatim from the regex-syntax crate. I would have no problems using
14// the regex-syntax types and defining the matching routines (only found
15// in this crate) as free functions, except the `Look` and `LookSet` types
16// are used in lots of places. Including in places we expect to work when
17// regex-syntax is *not* enabled, such as in the definition of the NFA itself.
18//
19// Thankfully the code we copy is pretty simple and there isn't much of it.
20// Otherwise, the rest of this module deals with *matching* the assertions,
21// which is not something that regex-syntax handles.
22
23use crate::util::{escape::DebugByte, utf8};
24
/// A look-around assertion.
///
/// An assertion matches at a position between characters in a haystack.
/// Namely, it does not actually "consume" any input as most parts of a regular
/// expression do. Assertions are a way of stating that some property must be
/// true at a particular point during matching.
///
/// For example, `(?m)^[a-z]+$` is a pattern that:
///
/// * Scans the haystack for a position at which `(?m:^)` is satisfied. That
/// occurs at either the beginning of the haystack, or immediately following
/// a `\n` character.
/// * Looks for one or more occurrences of `[a-z]`.
/// * Once `[a-z]+` has matched as much as it can, an overall match is only
/// reported when `[a-z]+` stops at a position at which `(?m:$)` is satisfied.
/// That occurs at either the end of the haystack, or immediately before a
/// `\n` character.
///
/// So in this case, `abc` and `\nabc\n` match, but `\nabc1\n` does not.
///
/// Assertions are also called "look-around," "look-behind" and "look-ahead."
/// Specifically, some assertions are look-behind (like `^`), other assertions
/// are look-ahead (like `$`) and yet other assertions are both look-ahead and
/// look-behind (like `\b`).
///
/// # Assertions in an NFA
///
/// An assertion in a [`thompson::NFA`](crate::nfa::thompson::NFA) can be
/// thought of as a conditional epsilon transition. That is, a matching engine
/// like the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) only permits
/// moving through conditional epsilon transitions when their condition
/// is satisfied at whatever position the `PikeVM` is currently at in the
/// haystack.
///
/// How assertions are handled in a `DFA` is trickier, since a DFA does not
/// have epsilon transitions at all. In this case, they are compiled into the
/// automaton itself, at the expense of more states than what would be required
/// without an assertion.
///
/// # Representation
///
/// The discriminant of every variant is a distinct power of two (a single
/// set bit). This is what allows a [`LookSet`] to store any combination of
/// assertions by OR-ing their representations together.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Look {
    /// Match the beginning of text. Specifically, this matches at the starting
    /// position of the input.
    Start = 1 << 0,
    /// Match the end of text. Specifically, this matches at the ending
    /// position of the input.
    End = 1 << 1,
    /// Match the beginning of a line or the beginning of text. Specifically,
    /// this matches at the starting position of the input, or at the position
    /// immediately following a `\n` character.
    StartLF = 1 << 2,
    /// Match the end of a line or the end of text. Specifically, this matches
    /// at the end position of the input, or at the position immediately
    /// preceding a `\n` character.
    EndLF = 1 << 3,
    /// Match the beginning of a line or the beginning of text. Specifically,
    /// this matches at the starting position of the input, or at the position
    /// immediately following either a `\r` or `\n` character, but never after
    /// a `\r` when a `\n` follows.
    StartCRLF = 1 << 4,
    /// Match the end of a line or the end of text. Specifically, this matches
    /// at the end position of the input, or at the position immediately
    /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
    /// precedes it.
    EndCRLF = 1 << 5,
    /// Match an ASCII-only word boundary. That is, this matches a position
    /// where the left adjacent character and right adjacent character
    /// correspond to a word and non-word or a non-word and word character.
    WordAscii = 1 << 6,
    /// Match an ASCII-only negation of a word boundary.
    WordAsciiNegate = 1 << 7,
    /// Match a Unicode-aware word boundary. That is, this matches a position
    /// where the left adjacent character and right adjacent character
    /// correspond to a word and non-word or a non-word and word character.
    WordUnicode = 1 << 8,
    /// Match a Unicode-aware negation of a word boundary.
    WordUnicodeNegate = 1 << 9,
    /// Match the start of an ASCII-only word boundary. That is, this matches a
    /// position at either the beginning of the haystack or where the previous
    /// character is not a word character and the following character is a word
    /// character.
    WordStartAscii = 1 << 10,
    /// Match the end of an ASCII-only word boundary. That is, this matches
    /// a position at either the end of the haystack or where the previous
    /// character is a word character and the following character is not a word
    /// character.
    WordEndAscii = 1 << 11,
    /// Match the start of a Unicode word boundary. That is, this matches a
    /// position at either the beginning of the haystack or where the previous
    /// character is not a word character and the following character is a word
    /// character.
    WordStartUnicode = 1 << 12,
    /// Match the end of a Unicode word boundary. That is, this matches a
    /// position at either the end of the haystack or where the previous
    /// character is a word character and the following character is not a word
    /// character.
    WordEndUnicode = 1 << 13,
    /// Match the start half of an ASCII-only word boundary. That is, this
    /// matches a position at either the beginning of the haystack or where the
    /// previous character is not a word character.
    WordStartHalfAscii = 1 << 14,
    /// Match the end half of an ASCII-only word boundary. That is, this
    /// matches a position at either the end of the haystack or where the
    /// following character is not a word character.
    WordEndHalfAscii = 1 << 15,
    /// Match the start half of a Unicode word boundary. That is, this matches
    /// a position at either the beginning of the haystack or where the
    /// previous character is not a word character.
    WordStartHalfUnicode = 1 << 16,
    /// Match the end half of a Unicode word boundary. That is, this matches
    /// a position at either the end of the haystack or where the following
    /// character is not a word character.
    WordEndHalfUnicode = 1 << 17,
}
136
137impl Look {
138    /// Flip the look-around assertion to its equivalent for reverse searches.
139    /// For example, `StartLF` gets translated to `EndLF`.
140    ///
141    /// Some assertions, such as `WordUnicode`, remain the same since they
142    /// match the same positions regardless of the direction of the search.
143    #[inline]
144    pub const fn reversed(self) -> Look {
145        match self {
146            Look::Start => Look::End,
147            Look::End => Look::Start,
148            Look::StartLF => Look::EndLF,
149            Look::EndLF => Look::StartLF,
150            Look::StartCRLF => Look::EndCRLF,
151            Look::EndCRLF => Look::StartCRLF,
152            Look::WordAscii => Look::WordAscii,
153            Look::WordAsciiNegate => Look::WordAsciiNegate,
154            Look::WordUnicode => Look::WordUnicode,
155            Look::WordUnicodeNegate => Look::WordUnicodeNegate,
156            Look::WordStartAscii => Look::WordEndAscii,
157            Look::WordEndAscii => Look::WordStartAscii,
158            Look::WordStartUnicode => Look::WordEndUnicode,
159            Look::WordEndUnicode => Look::WordStartUnicode,
160            Look::WordStartHalfAscii => Look::WordEndHalfAscii,
161            Look::WordEndHalfAscii => Look::WordStartHalfAscii,
162            Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
163            Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
164        }
165    }
166
167    /// Return the underlying representation of this look-around enumeration
168    /// as an integer. Giving the return value to the [`Look::from_repr`]
169    /// constructor is guaranteed to return the same look-around variant that
170    /// one started with within a semver compatible release of this crate.
171    #[inline]
172    pub const fn as_repr(self) -> u32 {
173        // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
174        // actual int.
175        self as u32
176    }
177
178    /// Given the underlying representation of a `Look` value, return the
179    /// corresponding `Look` value if the representation is valid. Otherwise
180    /// `None` is returned.
181    #[inline]
182    pub const fn from_repr(repr: u32) -> Option<Look> {
183        match repr {
184            0b00_0000_0000_0000_0001 => Some(Look::Start),
185            0b00_0000_0000_0000_0010 => Some(Look::End),
186            0b00_0000_0000_0000_0100 => Some(Look::StartLF),
187            0b00_0000_0000_0000_1000 => Some(Look::EndLF),
188            0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
189            0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
190            0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
191            0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
192            0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
193            0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
194            0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
195            0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
196            0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
197            0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
198            0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
199            0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
200            0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
201            0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
202            _ => None,
203        }
204    }
205
206    /// Returns a convenient single codepoint representation of this
207    /// look-around assertion. Each assertion is guaranteed to be represented
208    /// by a distinct character.
209    ///
210    /// This is useful for succinctly representing a look-around assertion in
211    /// human friendly but succinct output intended for a programmer working on
212    /// regex internals.
213    #[inline]
214    pub const fn as_char(self) -> char {
215        match self {
216            Look::Start => 'A',
217            Look::End => 'z',
218            Look::StartLF => '^',
219            Look::EndLF => '$',
220            Look::StartCRLF => 'r',
221            Look::EndCRLF => 'R',
222            Look::WordAscii => 'b',
223            Look::WordAsciiNegate => 'B',
224            Look::WordUnicode => '𝛃',
225            Look::WordUnicodeNegate => '𝚩',
226            Look::WordStartAscii => '<',
227            Look::WordEndAscii => '>',
228            Look::WordStartUnicode => '〈',
229            Look::WordEndUnicode => '〉',
230            Look::WordStartHalfAscii => '◁',
231            Look::WordEndHalfAscii => '▷',
232            Look::WordStartHalfUnicode => '◀',
233            Look::WordEndHalfUnicode => '▶',
234        }
235    }
236}
237
/// LookSet is a memory-efficient set of look-around assertions.
///
/// This is useful for efficiently tracking look-around assertions. For
/// example, a [`thompson::NFA`](crate::nfa::thompson::NFA) provides properties
/// that return `LookSet`s.
#[derive(Clone, Copy, Default, Eq, PartialEq)]
pub struct LookSet {
    /// The underlying representation of this set. It is exposed to make it
    /// possible to store the set somewhere efficiently. The representation
    /// is that of a bitset: an assertion is a member of the set when the bit
    /// returned by [`Look::as_repr`] for that assertion is set.
    ///
    /// Note that users of this internal representation must permit the full
    /// range of `u32` values to be represented. For example, even if the
    /// current implementation only makes use of the 18 least significant bits,
    /// it may use more bits in a future semver compatible release.
    pub bits: u32,
}
256
257impl LookSet {
258    /// Create an empty set of look-around assertions.
259    #[inline]
260    pub fn empty() -> LookSet {
261        LookSet { bits: 0 }
262    }
263
264    /// Create a full set of look-around assertions.
265    ///
266    /// This set contains all possible look-around assertions.
267    #[inline]
268    pub fn full() -> LookSet {
269        LookSet { bits: !0 }
270    }
271
272    /// Create a look-around set containing the look-around assertion given.
273    ///
274    /// This is a convenience routine for creating an empty set and inserting
275    /// one look-around assertions.
276    #[inline]
277    pub fn singleton(look: Look) -> LookSet {
278        LookSet::empty().insert(look)
279    }
280
281    /// Returns the total number of look-around assertions in this set.
282    #[inline]
283    pub fn len(self) -> usize {
284        // OK because max value always fits in a u8, which in turn always
285        // fits in a usize, regardless of target.
286        usize::try_from(self.bits.count_ones()).unwrap()
287    }
288
289    /// Returns true if and only if this set is empty.
290    #[inline]
291    pub fn is_empty(self) -> bool {
292        self.len() == 0
293    }
294
295    /// Returns true if and only if the given look-around assertion is in this
296    /// set.
297    #[inline]
298    pub fn contains(self, look: Look) -> bool {
299        self.bits & look.as_repr() != 0
300    }
301
302    /// Returns true if and only if this set contains any anchor assertions.
303    /// This includes both "start/end of haystack" and "start/end of line."
304    #[inline]
305    pub fn contains_anchor(&self) -> bool {
306        self.contains_anchor_haystack() || self.contains_anchor_line()
307    }
308
309    /// Returns true if and only if this set contains any "start/end of
310    /// haystack" anchors. This doesn't include "start/end of line" anchors.
311    #[inline]
312    pub fn contains_anchor_haystack(&self) -> bool {
313        self.contains(Look::Start) || self.contains(Look::End)
314    }
315
316    /// Returns true if and only if this set contains any "start/end of line"
317    /// anchors. This doesn't include "start/end of haystack" anchors. This
318    /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
319    #[inline]
320    pub fn contains_anchor_line(&self) -> bool {
321        self.contains(Look::StartLF)
322            || self.contains(Look::EndLF)
323            || self.contains(Look::StartCRLF)
324            || self.contains(Look::EndCRLF)
325    }
326
327    /// Returns true if and only if this set contains any "start/end of line"
328    /// anchors that only treat `\n` as line terminators. This does not include
329    /// haystack anchors or CRLF aware line anchors.
330    #[inline]
331    pub fn contains_anchor_lf(&self) -> bool {
332        self.contains(Look::StartLF) || self.contains(Look::EndLF)
333    }
334
335    /// Returns true if and only if this set contains any "start/end of line"
336    /// anchors that are CRLF-aware. This doesn't include "start/end of
337    /// haystack" or "start/end of line-feed" anchors.
338    #[inline]
339    pub fn contains_anchor_crlf(&self) -> bool {
340        self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
341    }
342
343    /// Returns true if and only if this set contains any word boundary or
344    /// negated word boundary assertions. This include both Unicode and ASCII
345    /// word boundaries.
346    #[inline]
347    pub fn contains_word(self) -> bool {
348        self.contains_word_unicode() || self.contains_word_ascii()
349    }
350
351    /// Returns true if and only if this set contains any Unicode word boundary
352    /// or negated Unicode word boundary assertions.
353    #[inline]
354    pub fn contains_word_unicode(self) -> bool {
355        self.contains(Look::WordUnicode)
356            || self.contains(Look::WordUnicodeNegate)
357            || self.contains(Look::WordStartUnicode)
358            || self.contains(Look::WordEndUnicode)
359            || self.contains(Look::WordStartHalfUnicode)
360            || self.contains(Look::WordEndHalfUnicode)
361    }
362
363    /// Returns true if and only if this set contains any ASCII word boundary
364    /// or negated ASCII word boundary assertions.
365    #[inline]
366    pub fn contains_word_ascii(self) -> bool {
367        self.contains(Look::WordAscii)
368            || self.contains(Look::WordAsciiNegate)
369            || self.contains(Look::WordStartAscii)
370            || self.contains(Look::WordEndAscii)
371            || self.contains(Look::WordStartHalfAscii)
372            || self.contains(Look::WordEndHalfAscii)
373    }
374
375    /// Returns an iterator over all of the look-around assertions in this set.
376    #[inline]
377    pub fn iter(self) -> LookSetIter {
378        LookSetIter { set: self }
379    }
380
381    /// Return a new set that is equivalent to the original, but with the given
382    /// assertion added to it. If the assertion is already in the set, then the
383    /// returned set is equivalent to the original.
384    #[inline]
385    pub fn insert(self, look: Look) -> LookSet {
386        LookSet { bits: self.bits | look.as_repr() }
387    }
388
389    /// Updates this set in place with the result of inserting the given
390    /// assertion into this set.
391    #[inline]
392    pub fn set_insert(&mut self, look: Look) {
393        *self = self.insert(look);
394    }
395
396    /// Return a new set that is equivalent to the original, but with the given
397    /// assertion removed from it. If the assertion is not in the set, then the
398    /// returned set is equivalent to the original.
399    #[inline]
400    pub fn remove(self, look: Look) -> LookSet {
401        LookSet { bits: self.bits & !look.as_repr() }
402    }
403
404    /// Updates this set in place with the result of removing the given
405    /// assertion from this set.
406    #[inline]
407    pub fn set_remove(&mut self, look: Look) {
408        *self = self.remove(look);
409    }
410
411    /// Returns a new set that is the result of subtracting the given set from
412    /// this set.
413    #[inline]
414    pub fn subtract(self, other: LookSet) -> LookSet {
415        LookSet { bits: self.bits & !other.bits }
416    }
417
418    /// Updates this set in place with the result of subtracting the given set
419    /// from this set.
420    #[inline]
421    pub fn set_subtract(&mut self, other: LookSet) {
422        *self = self.subtract(other);
423    }
424
425    /// Returns a new set that is the union of this and the one given.
426    #[inline]
427    pub fn union(self, other: LookSet) -> LookSet {
428        LookSet { bits: self.bits | other.bits }
429    }
430
431    /// Updates this set in place with the result of unioning it with the one
432    /// given.
433    #[inline]
434    pub fn set_union(&mut self, other: LookSet) {
435        *self = self.union(other);
436    }
437
438    /// Returns a new set that is the intersection of this and the one given.
439    #[inline]
440    pub fn intersect(self, other: LookSet) -> LookSet {
441        LookSet { bits: self.bits & other.bits }
442    }
443
444    /// Updates this set in place with the result of intersecting it with the
445    /// one given.
446    #[inline]
447    pub fn set_intersect(&mut self, other: LookSet) {
448        *self = self.intersect(other);
449    }
450
451    /// Return a `LookSet` from the slice given as a native endian 32-bit
452    /// integer.
453    ///
454    /// # Panics
455    ///
456    /// This panics if `slice.len() < 4`.
457    #[inline]
458    pub fn read_repr(slice: &[u8]) -> LookSet {
459        let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
460        LookSet { bits }
461    }
462
463    /// Write a `LookSet` as a native endian 32-bit integer to the beginning
464    /// of the slice given.
465    ///
466    /// # Panics
467    ///
468    /// This panics if `slice.len() < 4`.
469    #[inline]
470    pub fn write_repr(self, slice: &mut [u8]) {
471        let raw = self.bits.to_ne_bytes();
472        slice[0] = raw[0];
473        slice[1] = raw[1];
474        slice[2] = raw[2];
475        slice[3] = raw[3];
476    }
477
478    /// Checks that all assertions in this set can be matched.
479    ///
480    /// Some assertions, such as Unicode word boundaries, require optional (but
481    /// enabled by default) tables that may not be available. If there are
482    /// assertions in this set that require tables that are not available, then
483    /// this will return an error.
484    ///
485    /// Specifically, this returns an error when the
486    /// `unicode-word-boundary` feature is _not_ enabled _and_ this set
487    /// contains a Unicode word boundary assertion.
488    ///
489    /// It can be useful to use this on the result of
490    /// [`NFA::look_set_any`](crate::nfa::thompson::NFA::look_set_any)
491    /// when building a matcher engine to ensure methods like
492    /// [`LookMatcher::matches_set`] do not panic at search time.
493    pub fn available(self) -> Result<(), UnicodeWordBoundaryError> {
494        if self.contains_word_unicode() {
495            UnicodeWordBoundaryError::check()?;
496        }
497        Ok(())
498    }
499}
500
501impl core::fmt::Debug for LookSet {
502    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
503        if self.is_empty() {
504            return write!(f, "∅");
505        }
506        for look in self.iter() {
507            write!(f, "{}", look.as_char())?;
508        }
509        Ok(())
510    }
511}
512
/// An iterator over all look-around assertions in a [`LookSet`].
///
/// This iterator is created by [`LookSet::iter`].
#[derive(Clone, Debug)]
pub struct LookSetIter {
    /// The assertions that have not been yielded yet. Each call to `next`
    /// that returns an assertion also removes it from this set.
    set: LookSet,
}
520
521impl Iterator for LookSetIter {
522    type Item = Look;
523
524    #[inline]
525    fn next(&mut self) -> Option<Look> {
526        if self.set.is_empty() {
527            return None;
528        }
529        // We'll never have more than u8::MAX distinct look-around assertions,
530        // so 'bit' will always fit into a u16.
531        let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
532        let look = Look::from_repr(1 << bit)?;
533        self.set = self.set.remove(look);
534        Some(look)
535    }
536}
537
/// A matcher for look-around assertions.
///
/// This matcher permits configuring aspects of how look-around assertions are
/// matched.
///
/// # Example
///
/// A `LookMatcher` can change the line terminator used for matching multi-line
/// anchors such as `(?m:^)` and `(?m:$)`.
///
/// ```
/// use regex_automata::{
///     nfa::thompson::{self, pikevm::PikeVM},
///     util::look::LookMatcher,
///     Match, Input,
/// };
///
/// let mut lookm = LookMatcher::new();
/// lookm.set_line_terminator(b'\x00');
///
/// let re = PikeVM::builder()
///     .thompson(thompson::Config::new().look_matcher(lookm))
///     .build(r"(?m)^[a-z]+$")?;
/// let mut cache = re.create_cache();
///
/// // Multi-line assertions now use NUL as a terminator.
/// assert_eq!(
///     Some(Match::must(0, 1..4)),
///     re.find(&mut cache, b"\x00abc\x00"),
/// );
/// // ... and \n is no longer recognized as a terminator.
/// assert_eq!(
///     None,
///     re.find(&mut cache, b"\nabc\n"),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct LookMatcher {
    /// The line terminator byte used for `(?m:^)` and `(?m:$)`. It is `\n`
    /// in a matcher created by `LookMatcher::new` and can be changed with
    /// `LookMatcher::set_line_terminator`.
    lineterm: DebugByte,
}
580
581impl LookMatcher {
582    /// Creates a new default matcher for look-around assertions.
583    pub fn new() -> LookMatcher {
584        LookMatcher { lineterm: DebugByte(b'\n') }
585    }
586
587    /// Sets the line terminator for use with `(?m:^)` and `(?m:$)`.
588    ///
589    /// Namely, instead of `^` matching after `\n` and `$` matching immediately
590    /// before a `\n`, this will cause it to match after and before the byte
591    /// given.
592    ///
593    /// It can occasionally be useful to use this to configure the line
594    /// terminator to the NUL byte when searching binary data.
595    ///
596    /// Note that this does not apply to CRLF-aware line anchors such as
597    /// `(?Rm:^)` and `(?Rm:$)`. CRLF-aware line anchors are hard-coded to
598    /// use `\r` and `\n`.
599    pub fn set_line_terminator(&mut self, byte: u8) -> &mut LookMatcher {
600        self.lineterm.0 = byte;
601        self
602    }
603
604    /// Returns the line terminator that was configured for this matcher.
605    ///
606    /// If no line terminator was configured, then this returns `\n`.
607    ///
608    /// Note that the line terminator should only be used for matching `(?m:^)`
609    /// and `(?m:$)` assertions. It specifically should _not_ be used for
610    /// matching the CRLF aware assertions `(?Rm:^)` and `(?Rm:$)`.
611    pub fn get_line_terminator(&self) -> u8 {
612        self.lineterm.0
613    }
614
615    /// Returns true when the position `at` in `haystack` satisfies the given
616    /// look-around assertion.
617    ///
618    /// # Panics
619    ///
620    /// This panics when testing any Unicode word boundary assertion in this
621    /// set and when the Unicode word data is not available. Specifically, this
622    /// only occurs when the `unicode-word-boundary` feature is not enabled.
623    ///
624    /// Since it's generally expected that this routine is called inside of
625    /// a matching engine, callers should check the error condition when
626    /// building the matching engine. If there is a Unicode word boundary
627    /// in the matcher and the data isn't available, then the matcher should
628    /// fail to build.
629    ///
630    /// Callers can check the error condition with [`LookSet::available`].
631    ///
632    /// This also may panic when `at > haystack.len()`. Note that `at ==
633    /// haystack.len()` is legal and guaranteed not to panic.
634    #[inline]
635    pub fn matches(&self, look: Look, haystack: &[u8], at: usize) -> bool {
636        self.matches_inline(look, haystack, at)
637    }
638
639    /// Like `matches`, but forcefully inlined.
640    ///
641    /// # Panics
642    ///
643    /// This panics when testing any Unicode word boundary assertion in this
644    /// set and when the Unicode word data is not available. Specifically, this
645    /// only occurs when the `unicode-word-boundary` feature is not enabled.
646    ///
647    /// Since it's generally expected that this routine is called inside of
648    /// a matching engine, callers should check the error condition when
649    /// building the matching engine. If there is a Unicode word boundary
650    /// in the matcher and the data isn't available, then the matcher should
651    /// fail to build.
652    ///
653    /// Callers can check the error condition with [`LookSet::available`].
654    ///
655    /// This also may panic when `at > haystack.len()`. Note that `at ==
656    /// haystack.len()` is legal and guaranteed not to panic.
657    #[cfg_attr(feature = "perf-inline", inline(always))]
658    pub(crate) fn matches_inline(
659        &self,
660        look: Look,
661        haystack: &[u8],
662        at: usize,
663    ) -> bool {
664        match look {
665            Look::Start => self.is_start(haystack, at),
666            Look::End => self.is_end(haystack, at),
667            Look::StartLF => self.is_start_lf(haystack, at),
668            Look::EndLF => self.is_end_lf(haystack, at),
669            Look::StartCRLF => self.is_start_crlf(haystack, at),
670            Look::EndCRLF => self.is_end_crlf(haystack, at),
671            Look::WordAscii => self.is_word_ascii(haystack, at),
672            Look::WordAsciiNegate => self.is_word_ascii_negate(haystack, at),
673            Look::WordUnicode => self.is_word_unicode(haystack, at).unwrap(),
674            Look::WordUnicodeNegate => {
675                self.is_word_unicode_negate(haystack, at).unwrap()
676            }
677            Look::WordStartAscii => self.is_word_start_ascii(haystack, at),
678            Look::WordEndAscii => self.is_word_end_ascii(haystack, at),
679            Look::WordStartUnicode => {
680                self.is_word_start_unicode(haystack, at).unwrap()
681            }
682            Look::WordEndUnicode => {
683                self.is_word_end_unicode(haystack, at).unwrap()
684            }
685            Look::WordStartHalfAscii => {
686                self.is_word_start_half_ascii(haystack, at)
687            }
688            Look::WordEndHalfAscii => {
689                self.is_word_end_half_ascii(haystack, at)
690            }
691            Look::WordStartHalfUnicode => {
692                self.is_word_start_half_unicode(haystack, at).unwrap()
693            }
694            Look::WordEndHalfUnicode => {
695                self.is_word_end_half_unicode(haystack, at).unwrap()
696            }
697        }
698    }
699
700    /// Returns true when _all_ of the assertions in the given set match at the
701    /// given position in the haystack.
702    ///
703    /// # Panics
704    ///
705    /// This panics when testing any Unicode word boundary assertion in this
706    /// set and when the Unicode word data is not available. Specifically, this
707    /// only occurs when the `unicode-word-boundary` feature is not enabled.
708    ///
709    /// Since it's generally expected that this routine is called inside of
710    /// a matching engine, callers should check the error condition when
711    /// building the matching engine. If there is a Unicode word boundary
712    /// in the matcher and the data isn't available, then the matcher should
713    /// fail to build.
714    ///
715    /// Callers can check the error condition with [`LookSet::available`].
716    ///
717    /// This also may panic when `at > haystack.len()`. Note that `at ==
718    /// haystack.len()` is legal and guaranteed not to panic.
719    #[inline]
720    pub fn matches_set(
721        &self,
722        set: LookSet,
723        haystack: &[u8],
724        at: usize,
725    ) -> bool {
726        self.matches_set_inline(set, haystack, at)
727    }
728
    /// Like `LookMatcher::matches_set`, but forcefully inlined for perf.
730    #[cfg_attr(feature = "perf-inline", inline(always))]
731    pub(crate) fn matches_set_inline(
732        &self,
733        set: LookSet,
734        haystack: &[u8],
735        at: usize,
736    ) -> bool {
737        // This used to use LookSet::iter with Look::matches on each element,
738        // but that proved to be quite disastrous for perf. The manual "if
739        // the set has this assertion, check it" turns out to be quite a bit
740        // faster.
741        if set.contains(Look::Start) {
742            if !self.is_start(haystack, at) {
743                return false;
744            }
745        }
746        if set.contains(Look::End) {
747            if !self.is_end(haystack, at) {
748                return false;
749            }
750        }
751        if set.contains(Look::StartLF) {
752            if !self.is_start_lf(haystack, at) {
753                return false;
754            }
755        }
756        if set.contains(Look::EndLF) {
757            if !self.is_end_lf(haystack, at) {
758                return false;
759            }
760        }
761        if set.contains(Look::StartCRLF) {
762            if !self.is_start_crlf(haystack, at) {
763                return false;
764            }
765        }
766        if set.contains(Look::EndCRLF) {
767            if !self.is_end_crlf(haystack, at) {
768                return false;
769            }
770        }
771        if set.contains(Look::WordAscii) {
772            if !self.is_word_ascii(haystack, at) {
773                return false;
774            }
775        }
776        if set.contains(Look::WordAsciiNegate) {
777            if !self.is_word_ascii_negate(haystack, at) {
778                return false;
779            }
780        }
781        if set.contains(Look::WordUnicode) {
782            if !self.is_word_unicode(haystack, at).unwrap() {
783                return false;
784            }
785        }
786        if set.contains(Look::WordUnicodeNegate) {
787            if !self.is_word_unicode_negate(haystack, at).unwrap() {
788                return false;
789            }
790        }
791        if set.contains(Look::WordStartAscii) {
792            if !self.is_word_start_ascii(haystack, at) {
793                return false;
794            }
795        }
796        if set.contains(Look::WordEndAscii) {
797            if !self.is_word_end_ascii(haystack, at) {
798                return false;
799            }
800        }
801        if set.contains(Look::WordStartUnicode) {
802            if !self.is_word_start_unicode(haystack, at).unwrap() {
803                return false;
804            }
805        }
806        if set.contains(Look::WordEndUnicode) {
807            if !self.is_word_end_unicode(haystack, at).unwrap() {
808                return false;
809            }
810        }
811        if set.contains(Look::WordStartHalfAscii) {
812            if !self.is_word_start_half_ascii(haystack, at) {
813                return false;
814            }
815        }
816        if set.contains(Look::WordEndHalfAscii) {
817            if !self.is_word_end_half_ascii(haystack, at) {
818                return false;
819            }
820        }
821        if set.contains(Look::WordStartHalfUnicode) {
822            if !self.is_word_start_half_unicode(haystack, at).unwrap() {
823                return false;
824            }
825        }
826        if set.contains(Look::WordEndHalfUnicode) {
827            if !self.is_word_end_half_unicode(haystack, at).unwrap() {
828                return false;
829            }
830        }
831        true
832    }
833
834    /// Split up the given byte classes into equivalence classes in a way that
835    /// is consistent with this look-around assertion.
836    #[cfg(feature = "alloc")]
837    pub(crate) fn add_to_byteset(
838        &self,
839        look: Look,
840        set: &mut crate::util::alphabet::ByteClassSet,
841    ) {
842        match look {
843            Look::Start | Look::End => {}
844            Look::StartLF | Look::EndLF => {
845                set.set_range(self.lineterm.0, self.lineterm.0);
846            }
847            Look::StartCRLF | Look::EndCRLF => {
848                set.set_range(b'\r', b'\r');
849                set.set_range(b'\n', b'\n');
850            }
851            Look::WordAscii
852            | Look::WordAsciiNegate
853            | Look::WordUnicode
854            | Look::WordUnicodeNegate
855            | Look::WordStartAscii
856            | Look::WordEndAscii
857            | Look::WordStartUnicode
858            | Look::WordEndUnicode
859            | Look::WordStartHalfAscii
860            | Look::WordEndHalfAscii
861            | Look::WordStartHalfUnicode
862            | Look::WordEndHalfUnicode => {
863                // We need to mark all ranges of bytes whose pairs result in
864                // evaluating \b differently. This isn't technically correct
865                // for Unicode word boundaries, but DFAs can't handle those
866                // anyway, and thus, the byte classes don't need to either
867                // since they are themselves only used in DFAs.
868                //
869                // FIXME: It seems like the calls to 'set_range' here are
870                // completely invariant, which means we could just hard-code
871                // them here without needing to write a loop. And we only need
872                // to do this dance at most once per regex.
873                //
874                // FIXME: Is this correct for \B?
875                let iswb = utf8::is_word_byte;
876                // This unwrap is OK because we guard every use of 'asu8' with
877                // a check that the input is <= 255.
878                let asu8 = |b: u16| u8::try_from(b).unwrap();
879                let mut b1: u16 = 0;
880                let mut b2: u16;
881                while b1 <= 255 {
882                    b2 = b1 + 1;
883                    while b2 <= 255 && iswb(asu8(b1)) == iswb(asu8(b2)) {
884                        b2 += 1;
885                    }
886                    // The guards above guarantee that b2 can never get any
887                    // bigger.
888                    assert!(b2 <= 256);
889                    // Subtracting 1 from b2 is always OK because it is always
890                    // at least 1 greater than b1, and the assert above
891                    // guarantees that the asu8 conversion will succeed.
892                    set.set_range(asu8(b1), asu8(b2.checked_sub(1).unwrap()));
893                    b1 = b2;
894                }
895            }
896        }
897    }
898
899    /// Returns true when [`Look::Start`] is satisfied `at` the given position
900    /// in `haystack`.
901    ///
902    /// # Panics
903    ///
904    /// This may panic when `at > haystack.len()`. Note that `at ==
905    /// haystack.len()` is legal and guaranteed not to panic.
906    #[inline]
907    pub fn is_start(&self, _haystack: &[u8], at: usize) -> bool {
908        at == 0
909    }
910
911    /// Returns true when [`Look::End`] is satisfied `at` the given position in
912    /// `haystack`.
913    ///
914    /// # Panics
915    ///
916    /// This may panic when `at > haystack.len()`. Note that `at ==
917    /// haystack.len()` is legal and guaranteed not to panic.
918    #[inline]
919    pub fn is_end(&self, haystack: &[u8], at: usize) -> bool {
920        at == haystack.len()
921    }
922
923    /// Returns true when [`Look::StartLF`] is satisfied `at` the given
924    /// position in `haystack`.
925    ///
926    /// # Panics
927    ///
928    /// This may panic when `at > haystack.len()`. Note that `at ==
929    /// haystack.len()` is legal and guaranteed not to panic.
930    #[inline]
931    pub fn is_start_lf(&self, haystack: &[u8], at: usize) -> bool {
932        self.is_start(haystack, at) || haystack[at - 1] == self.lineterm.0
933    }
934
935    /// Returns true when [`Look::EndLF`] is satisfied `at` the given position
936    /// in `haystack`.
937    ///
938    /// # Panics
939    ///
940    /// This may panic when `at > haystack.len()`. Note that `at ==
941    /// haystack.len()` is legal and guaranteed not to panic.
942    #[inline]
943    pub fn is_end_lf(&self, haystack: &[u8], at: usize) -> bool {
944        self.is_end(haystack, at) || haystack[at] == self.lineterm.0
945    }
946
947    /// Returns true when [`Look::StartCRLF`] is satisfied `at` the given
948    /// position in `haystack`.
949    ///
950    /// # Panics
951    ///
952    /// This may panic when `at > haystack.len()`. Note that `at ==
953    /// haystack.len()` is legal and guaranteed not to panic.
954    #[inline]
955    pub fn is_start_crlf(&self, haystack: &[u8], at: usize) -> bool {
956        self.is_start(haystack, at)
957            || haystack[at - 1] == b'\n'
958            || (haystack[at - 1] == b'\r'
959                && (at >= haystack.len() || haystack[at] != b'\n'))
960    }
961
962    /// Returns true when [`Look::EndCRLF`] is satisfied `at` the given
963    /// position in `haystack`.
964    ///
965    /// # Panics
966    ///
967    /// This may panic when `at > haystack.len()`. Note that `at ==
968    /// haystack.len()` is legal and guaranteed not to panic.
969    #[inline]
970    pub fn is_end_crlf(&self, haystack: &[u8], at: usize) -> bool {
971        self.is_end(haystack, at)
972            || haystack[at] == b'\r'
973            || (haystack[at] == b'\n'
974                && (at == 0 || haystack[at - 1] != b'\r'))
975    }
976
977    /// Returns true when [`Look::WordAscii`] is satisfied `at` the given
978    /// position in `haystack`.
979    ///
980    /// # Panics
981    ///
982    /// This may panic when `at > haystack.len()`. Note that `at ==
983    /// haystack.len()` is legal and guaranteed not to panic.
984    #[inline]
985    pub fn is_word_ascii(&self, haystack: &[u8], at: usize) -> bool {
986        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
987        let word_after =
988            at < haystack.len() && utf8::is_word_byte(haystack[at]);
989        word_before != word_after
990    }
991
992    /// Returns true when [`Look::WordAsciiNegate`] is satisfied `at` the given
993    /// position in `haystack`.
994    ///
995    /// # Panics
996    ///
997    /// This may panic when `at > haystack.len()`. Note that `at ==
998    /// haystack.len()` is legal and guaranteed not to panic.
999    #[inline]
1000    pub fn is_word_ascii_negate(&self, haystack: &[u8], at: usize) -> bool {
1001        !self.is_word_ascii(haystack, at)
1002    }
1003
1004    /// Returns true when [`Look::WordUnicode`] is satisfied `at` the given
1005    /// position in `haystack`.
1006    ///
1007    /// # Panics
1008    ///
1009    /// This may panic when `at > haystack.len()`. Note that `at ==
1010    /// haystack.len()` is legal and guaranteed not to panic.
1011    ///
1012    /// # Errors
1013    ///
1014    /// This returns an error when Unicode word boundary tables
1015    /// are not available. Specifically, this only occurs when the
1016    /// `unicode-word-boundary` feature is not enabled.
1017    #[inline]
1018    pub fn is_word_unicode(
1019        &self,
1020        haystack: &[u8],
1021        at: usize,
1022    ) -> Result<bool, UnicodeWordBoundaryError> {
1023        let word_before = is_word_char::rev(haystack, at)?;
1024        let word_after = is_word_char::fwd(haystack, at)?;
1025        Ok(word_before != word_after)
1026    }
1027
1028    /// Returns true when [`Look::WordUnicodeNegate`] is satisfied `at` the
1029    /// given position in `haystack`.
1030    ///
1031    /// # Panics
1032    ///
1033    /// This may panic when `at > haystack.len()`. Note that `at ==
1034    /// haystack.len()` is legal and guaranteed not to panic.
1035    ///
1036    /// # Errors
1037    ///
1038    /// This returns an error when Unicode word boundary tables
1039    /// are not available. Specifically, this only occurs when the
1040    /// `unicode-word-boundary` feature is not enabled.
1041    #[inline]
1042    pub fn is_word_unicode_negate(
1043        &self,
1044        haystack: &[u8],
1045        at: usize,
1046    ) -> Result<bool, UnicodeWordBoundaryError> {
1047        // This is pretty subtle. Why do we need to do UTF-8 decoding here?
1048        // Well... at time of writing, the is_word_char_{fwd,rev} routines will
1049        // only return true if there is a valid UTF-8 encoding of a "word"
1050        // codepoint, and false in every other case (including invalid UTF-8).
1051        // This means that in regions of invalid UTF-8 (which might be a
1052        // subset of valid UTF-8!), it would result in \B matching. While this
1053        // would be questionable in the context of truly invalid UTF-8, it is
1054        // *certainly* wrong to report match boundaries that split the encoding
1055        // of a codepoint. So to work around this, we ensure that we can decode
1056        // a codepoint on either side of `at`. If either direction fails, then
1057        // we don't permit \B to match at all.
1058        //
1059        // Now, this isn't exactly optimal from a perf perspective. We could
1060        // try and detect this in is_word_char::{fwd,rev}, but it's not clear
1061        // if it's worth it. \B is, after all, rarely used. Even worse,
1062        // is_word_char::{fwd,rev} could do its own UTF-8 decoding, and so this
1063        // will wind up doing UTF-8 decoding twice. Ouch. We could fix this
1064        // with more code complexity, but it just doesn't feel worth it for \B.
1065        //
1066        // And in particular, we do *not* have to do this with \b, because \b
1067        // *requires* that at least one side of `at` be a "word" codepoint,
1068        // which in turn implies one side of `at` must be valid UTF-8. This in
1069        // turn implies that \b can never split a valid UTF-8 encoding of a
1070        // codepoint. In the case where one side of `at` is truly invalid UTF-8
1071        // and the other side IS a word codepoint, then we want \b to match
1072        // since it represents a valid UTF-8 boundary. It also makes sense. For
1073        // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'.
1074        //
1075        // Note also that this is not just '!is_word_unicode(..)' like it is
1076        // for the ASCII case. For example, neither \b nor \B is satisfied
1077        // within invalid UTF-8 sequences.
1078        let word_before = at > 0
1079            && match utf8::decode_last(&haystack[..at]) {
1080                None | Some(Err(_)) => return Ok(false),
1081                Some(Ok(_)) => is_word_char::rev(haystack, at)?,
1082            };
1083        let word_after = at < haystack.len()
1084            && match utf8::decode(&haystack[at..]) {
1085                None | Some(Err(_)) => return Ok(false),
1086                Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
1087            };
1088        Ok(word_before == word_after)
1089    }
1090
1091    /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given
1092    /// position in `haystack`.
1093    ///
1094    /// # Panics
1095    ///
1096    /// This may panic when `at > haystack.len()`. Note that `at ==
1097    /// haystack.len()` is legal and guaranteed not to panic.
1098    #[inline]
1099    pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool {
1100        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
1101        let word_after =
1102            at < haystack.len() && utf8::is_word_byte(haystack[at]);
1103        !word_before && word_after
1104    }
1105
1106    /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given
1107    /// position in `haystack`.
1108    ///
1109    /// # Panics
1110    ///
1111    /// This may panic when `at > haystack.len()`. Note that `at ==
1112    /// haystack.len()` is legal and guaranteed not to panic.
1113    #[inline]
1114    pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool {
1115        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
1116        let word_after =
1117            at < haystack.len() && utf8::is_word_byte(haystack[at]);
1118        word_before && !word_after
1119    }
1120
1121    /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the
1122    /// given position in `haystack`.
1123    ///
1124    /// # Panics
1125    ///
1126    /// This may panic when `at > haystack.len()`. Note that `at ==
1127    /// haystack.len()` is legal and guaranteed not to panic.
1128    ///
1129    /// # Errors
1130    ///
1131    /// This returns an error when Unicode word boundary tables
1132    /// are not available. Specifically, this only occurs when the
1133    /// `unicode-word-boundary` feature is not enabled.
1134    #[inline]
1135    pub fn is_word_start_unicode(
1136        &self,
1137        haystack: &[u8],
1138        at: usize,
1139    ) -> Result<bool, UnicodeWordBoundaryError> {
1140        let word_before = is_word_char::rev(haystack, at)?;
1141        let word_after = is_word_char::fwd(haystack, at)?;
1142        Ok(!word_before && word_after)
1143    }
1144
1145    /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the
1146    /// given position in `haystack`.
1147    ///
1148    /// # Panics
1149    ///
1150    /// This may panic when `at > haystack.len()`. Note that `at ==
1151    /// haystack.len()` is legal and guaranteed not to panic.
1152    ///
1153    /// # Errors
1154    ///
1155    /// This returns an error when Unicode word boundary tables
1156    /// are not available. Specifically, this only occurs when the
1157    /// `unicode-word-boundary` feature is not enabled.
1158    #[inline]
1159    pub fn is_word_end_unicode(
1160        &self,
1161        haystack: &[u8],
1162        at: usize,
1163    ) -> Result<bool, UnicodeWordBoundaryError> {
1164        let word_before = is_word_char::rev(haystack, at)?;
1165        let word_after = is_word_char::fwd(haystack, at)?;
1166        Ok(word_before && !word_after)
1167    }
1168
1169    /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the
1170    /// given position in `haystack`.
1171    ///
1172    /// # Panics
1173    ///
1174    /// This may panic when `at > haystack.len()`. Note that `at ==
1175    /// haystack.len()` is legal and guaranteed not to panic.
1176    #[inline]
1177    pub fn is_word_start_half_ascii(
1178        &self,
1179        haystack: &[u8],
1180        at: usize,
1181    ) -> bool {
1182        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
1183        !word_before
1184    }
1185
1186    /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the
1187    /// given position in `haystack`.
1188    ///
1189    /// # Panics
1190    ///
1191    /// This may panic when `at > haystack.len()`. Note that `at ==
1192    /// haystack.len()` is legal and guaranteed not to panic.
1193    #[inline]
1194    pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool {
1195        let word_after =
1196            at < haystack.len() && utf8::is_word_byte(haystack[at]);
1197        !word_after
1198    }
1199
1200    /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the
1201    /// given position in `haystack`.
1202    ///
1203    /// # Panics
1204    ///
1205    /// This may panic when `at > haystack.len()`. Note that `at ==
1206    /// haystack.len()` is legal and guaranteed not to panic.
1207    ///
1208    /// # Errors
1209    ///
1210    /// This returns an error when Unicode word boundary tables
1211    /// are not available. Specifically, this only occurs when the
1212    /// `unicode-word-boundary` feature is not enabled.
1213    #[inline]
1214    pub fn is_word_start_half_unicode(
1215        &self,
1216        haystack: &[u8],
1217        at: usize,
1218    ) -> Result<bool, UnicodeWordBoundaryError> {
1219        // See `is_word_unicode_negate` for why we need to do this. We don't
1220        // need to do it for `is_word_start_unicode` because that guarantees
1221        // that the position matched falls on a valid UTF-8 boundary given
1222        // that the right side must be in \w.
1223        let word_before = at > 0
1224            && match utf8::decode_last(&haystack[..at]) {
1225                None | Some(Err(_)) => return Ok(false),
1226                Some(Ok(_)) => is_word_char::rev(haystack, at)?,
1227            };
1228        Ok(!word_before)
1229    }
1230
1231    /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the
1232    /// given position in `haystack`.
1233    ///
1234    /// # Panics
1235    ///
1236    /// This may panic when `at > haystack.len()`. Note that `at ==
1237    /// haystack.len()` is legal and guaranteed not to panic.
1238    ///
1239    /// # Errors
1240    ///
1241    /// This returns an error when Unicode word boundary tables
1242    /// are not available. Specifically, this only occurs when the
1243    /// `unicode-word-boundary` feature is not enabled.
1244    #[inline]
1245    pub fn is_word_end_half_unicode(
1246        &self,
1247        haystack: &[u8],
1248        at: usize,
1249    ) -> Result<bool, UnicodeWordBoundaryError> {
1250        // See `is_word_unicode_negate` for why we need to do this. We don't
1251        // need to do it for `is_word_end_unicode` because that guarantees
1252        // that the position matched falls on a valid UTF-8 boundary given
1253        // that the left side must be in \w.
1254        let word_after = at < haystack.len()
1255            && match utf8::decode(&haystack[at..]) {
1256                None | Some(Err(_)) => return Ok(false),
1257                Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
1258            };
1259        Ok(!word_after)
1260    }
1261}
1262
1263impl Default for LookMatcher {
1264    fn default() -> LookMatcher {
1265        LookMatcher::new()
1266    }
1267}
1268
/// The error reported when the Unicode-aware `\w` class cannot be used.
///
/// Deciding whether a Unicode-aware `\b` (or `\B`) matches at a position
/// requires knowing whether a codepoint is a word character, and that in
/// turn requires the data tables behind the Unicode-aware Perl character
/// class `\w`. This error is returned when those tables are unavailable.
///
/// This error can only occur when the `unicode-word-boundary` feature is
/// disabled.
#[derive(Clone, Debug)]
pub struct UnicodeWordBoundaryError(());
1281
1282impl UnicodeWordBoundaryError {
1283    #[cfg(not(feature = "unicode-word-boundary"))]
1284    pub(crate) fn new() -> UnicodeWordBoundaryError {
1285        UnicodeWordBoundaryError(())
1286    }
1287
1288    /// Returns an error if and only if Unicode word boundary data is
1289    /// unavailable.
1290    pub fn check() -> Result<(), UnicodeWordBoundaryError> {
1291        is_word_char::check()
1292    }
1293}
1294
// The standard error trait only exists with `std`, so this impl is gated on
// that feature. The trait's default methods are all that is needed.
#[cfg(feature = "std")]
impl std::error::Error for UnicodeWordBoundaryError {}
1297
1298impl core::fmt::Display for UnicodeWordBoundaryError {
1299    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1300        write!(
1301            f,
1302            "Unicode-aware \\b and \\B are unavailable because the \
1303             requisite data tables are missing, please enable the \
1304             unicode-word-boundary feature"
1305        )
1306    }
1307}
1308
// Below are FOUR different ways for checking whether a "word"
1310// codepoint exists at a particular position in the haystack. The four
1311// different approaches are, in order of preference:
1312//
1313// 1. Parse '\w', convert to an NFA, convert to a fully compiled DFA on the
1314// first call, and then use that DFA for all subsequent calls.
1315// 2. Do UTF-8 decoding and use regex_syntax::is_word_character if available.
1316// 3. Do UTF-8 decoding and use our own 'perl_word' table.
1317// 4. Return an error.
1318//
1319// The reason for all of these approaches is a combination of perf and
1320// permitting one to build regex-automata without the Unicode data necessary
1321// for handling Unicode-aware word boundaries. (In which case, '(?-u:\b)' would
1322// still work.)
1323//
1324// The DFA approach is the fastest, but it requires the regex parser, the
1325// NFA compiler, the DFA builder and the DFA search runtime. That's a lot to
1326// bring in, but if it's available, it's (probably) the best we can do.
1327//
1328// Approaches (2) and (3) are effectively equivalent, but (2) reuses the
1329// data in regex-syntax and avoids duplicating it in regex-automata.
1330//
1331// Finally, (4) unconditionally returns an error since the requisite data isn't
1332// available anywhere.
1333//
1334// There are actually more approaches possible that we didn't implement. For
1335// example, if the DFA builder is available but the syntax parser is not, we
1336// could technically hand construct our own NFA from the 'perl_word' data
1337// table. But to avoid some pretty hairy code duplication, we would in turn
1338// need to pull the UTF-8 compiler out of the NFA compiler. Yikes.
1339//
1340// A possibly more sensible alternative is to use a lazy DFA when the full
1341// DFA builder isn't available...
1342//
1343// Yet another choice would be to build the full DFA and then embed it into the
1344// source. Then we'd only need to bring in the DFA search runtime, which is
1345// considerably smaller than the DFA builder code. The problem here is that the
1346// Debian people have spooked me[1] into avoiding cyclic dependencies. Namely,
1347// we'd need to build regex-cli, which depends on regex-automata in order to
1348// build some part of regex-automata. But to be honest, something like this has
1349// to be allowed somehow? I just don't know what the right process is.
1350//
1351// There are perhaps other choices as well. Why did I stop at these 4? Because
1352// I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA
1353// approach eventually, as the benefits of the DFA approach are somewhat
1354// compelling. The 'boundary-words-holmes' benchmark tests this. (Note that
1355// the commands below no longer work. If necessary, we should re-capitulate
1356// the benchmark from whole cloth in rebar.)
1357//
1358//   $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv
1359//
1360// Then I changed the code below so that the util/unicode_data/perl_word table
1361// was used and re-ran the benchmark:
1362//
1363//   $ regex-cli bench measure -f boundary-words-holmes -e pikevm > table.csv
1364//
1365// And compared them:
1366//
1367//   $ regex-cli bench diff dfa.csv table.csv
1368//   benchmark                             engine                 dfa        table
1369//   ---------                             ------                 ---        -----
1370//   internal/count/boundary-words-holmes  regex/automata/pikevm  18.6 MB/s  12.9 MB/s
1371//
1372// Which is a nice improvement.
1373//
1374// UPDATE: It turns out that it takes approximately 22ms to build the reverse
1375// DFA for \w. (And about 3ms for the forward DFA.) It's probably not much in
// the grand scheme of things, but that is a significant latency cost. So I'm not
1377// sure that's a good idea. I then tried using a lazy DFA instead, and that
1378// eliminated the overhead, but since the lazy DFA requires mutable working
1379// memory, that requires introducing a 'Cache' for every simultaneous call.
1380//
1381// I ended up deciding for now to just keep the "UTF-8 decode and check the
1382// table." The DFA and lazy DFA approaches are still below, but commented out.
1383//
1384// [1]: https://github.com/BurntSushi/ucd-generate/issues/11
1385
1386/*
1387/// A module that looks for word codepoints using lazy DFAs.
1388#[cfg(all(
1389    feature = "unicode-word-boundary",
1390    feature = "syntax",
1391    feature = "unicode-perl",
1392    feature = "hybrid"
1393))]
1394mod is_word_char {
1395    use alloc::vec::Vec;
1396
1397    use crate::{
1398        hybrid::dfa::{Cache, DFA},
1399        nfa::thompson::NFA,
1400        util::{lazy::Lazy, pool::Pool, primitives::StateID},
1401        Anchored, Input,
1402    };
1403
1404    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1405        Ok(())
1406    }
1407
1408    #[cfg_attr(feature = "perf-inline", inline(always))]
1409    pub(super) fn fwd(
1410        haystack: &[u8],
1411        mut at: usize,
1412    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1413        static WORD: Lazy<DFA> = Lazy::new(|| DFA::new(r"\w").unwrap());
1414        static CACHE: Lazy<Pool<Cache>> =
1415            Lazy::new(|| Pool::new(|| WORD.create_cache()));
1416        let dfa = Lazy::get(&WORD);
1417        let mut cache = Lazy::get(&CACHE).get();
1418        let mut sid = dfa
1419            .start_state_forward(
1420                &mut cache,
1421                &Input::new("").anchored(Anchored::Yes),
1422            )
1423            .unwrap();
1424        while at < haystack.len() {
1425            let byte = haystack[at];
1426            sid = dfa.next_state(&mut cache, sid, byte).unwrap();
1427            at += 1;
1428            if sid.is_tagged() {
1429                if sid.is_match() {
1430                    return Ok(true);
1431                } else if sid.is_dead() {
1432                    return Ok(false);
1433                }
1434            }
1435        }
1436        Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match())
1437    }
1438
1439    #[cfg_attr(feature = "perf-inline", inline(always))]
1440    pub(super) fn rev(
1441        haystack: &[u8],
1442        mut at: usize,
1443    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1444        static WORD: Lazy<DFA> = Lazy::new(|| {
1445            DFA::builder()
1446                .thompson(NFA::config().reverse(true))
1447                .build(r"\w")
1448                .unwrap()
1449        });
1450        static CACHE: Lazy<Pool<Cache>> =
1451            Lazy::new(|| Pool::new(|| WORD.create_cache()));
1452        let dfa = Lazy::get(&WORD);
1453        let mut cache = Lazy::get(&CACHE).get();
1454        let mut sid = dfa
1455            .start_state_reverse(
1456                &mut cache,
1457                &Input::new("").anchored(Anchored::Yes),
1458            )
1459            .unwrap();
1460        while at > 0 {
1461            at -= 1;
1462            let byte = haystack[at];
1463            sid = dfa.next_state(&mut cache, sid, byte).unwrap();
1464            if sid.is_tagged() {
1465                if sid.is_match() {
1466                    return Ok(true);
1467                } else if sid.is_dead() {
1468                    return Ok(false);
1469                }
1470            }
1471        }
1472        Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match())
1473    }
1474}
1475*/
1476
1477/*
1478/// A module that looks for word codepoints using fully compiled DFAs.
1479#[cfg(all(
1480    feature = "unicode-word-boundary",
1481    feature = "syntax",
1482    feature = "unicode-perl",
1483    feature = "dfa-build"
1484))]
1485mod is_word_char {
1486    use alloc::vec::Vec;
1487
1488    use crate::{
1489        dfa::{dense::DFA, Automaton, StartKind},
1490        nfa::thompson::NFA,
1491        util::{lazy::Lazy, primitives::StateID},
1492        Anchored, Input,
1493    };
1494
1495    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1496        Ok(())
1497    }
1498
1499    #[cfg_attr(feature = "perf-inline", inline(always))]
1500    pub(super) fn fwd(
1501        haystack: &[u8],
1502        mut at: usize,
1503    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1504        static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| {
1505            let dfa = DFA::builder()
1506                .configure(DFA::config().start_kind(StartKind::Anchored))
1507                .build(r"\w")
1508                .unwrap();
1509            // OK because our regex has no look-around.
1510            let start_id = dfa.universal_start_state(Anchored::Yes).unwrap();
1511            (dfa, start_id)
1512        });
1513        let &(ref dfa, mut sid) = Lazy::get(&WORD);
1514        while at < haystack.len() {
1515            let byte = haystack[at];
1516            sid = dfa.next_state(sid, byte);
1517            at += 1;
1518            if dfa.is_special_state(sid) {
1519                if dfa.is_match_state(sid) {
1520                    return Ok(true);
1521                } else if dfa.is_dead_state(sid) {
1522                    return Ok(false);
1523                }
1524            }
1525        }
1526        Ok(dfa.is_match_state(dfa.next_eoi_state(sid)))
1527    }
1528
1529    #[cfg_attr(feature = "perf-inline", inline(always))]
1530    pub(super) fn rev(
1531        haystack: &[u8],
1532        mut at: usize,
1533    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1534        static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| {
1535            let dfa = DFA::builder()
1536                .configure(DFA::config().start_kind(StartKind::Anchored))
1537                // From ad hoc measurements, it looks like setting
1538                // shrink==false is slightly faster than shrink==true. I kind
1539                // of feel like this indicates that shrinking is probably a
1540                // failure, although it can help in some cases. Sigh.
1541                .thompson(NFA::config().reverse(true).shrink(false))
1542                .build(r"\w")
1543                .unwrap();
1544            // OK because our regex has no look-around.
1545            let start_id = dfa.universal_start_state(Anchored::Yes).unwrap();
1546            (dfa, start_id)
1547        });
1548        let &(ref dfa, mut sid) = Lazy::get(&WORD);
1549        while at > 0 {
1550            at -= 1;
1551            let byte = haystack[at];
1552            sid = dfa.next_state(sid, byte);
1553            if dfa.is_special_state(sid) {
1554                if dfa.is_match_state(sid) {
1555                    return Ok(true);
1556                } else if dfa.is_dead_state(sid) {
1557                    return Ok(false);
1558                }
1559            }
1560        }
1561        Ok(dfa.is_match_state(dfa.next_eoi_state(sid)))
1562    }
1563}
1564*/
1565
/// Word-codepoint classification backed by the Unicode data tables that
/// regex-syntax provides.
#[cfg(all(
    feature = "unicode-word-boundary",
    feature = "syntax",
    feature = "unicode-perl",
))]
mod is_word_char {
    use regex_syntax::try_is_word_character;

    use crate::util::utf8;

    /// Panic message used if regex-syntax reports that it cannot classify a
    /// codepoint, which is not expected under this cfg.
    const TABLES_EXPECTED: &str =
        "since unicode-word-boundary, syntax and unicode-perl \
         are all enabled, it is expected that \
         try_is_word_character succeeds";

    /// Always succeeds in this configuration.
    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
        Ok(())
    }

    /// Returns true when the bytes starting at `at` decode to a codepoint
    /// that is a word character. An empty remainder or a decoding error
    /// yields `false`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(super) fn fwd(
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
        let remainder = &haystack[at..];
        let is_word = match utf8::decode(remainder) {
            Some(Ok(ch)) => try_is_word_character(ch).expect(TABLES_EXPECTED),
            Some(Err(_)) | None => false,
        };
        Ok(is_word)
    }

    /// Returns true when the bytes ending at `at` decode (from the back) to
    /// a codepoint that is a word character. An empty prefix or a decoding
    /// error yields `false`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(super) fn rev(
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
        let prefix = &haystack[..at];
        let is_word = match utf8::decode_last(prefix) {
            Some(Ok(ch)) => try_is_word_character(ch).expect(TABLES_EXPECTED),
            Some(Err(_)) | None => false,
        };
        Ok(is_word)
    }
}
1611
/// Word-codepoint classification backed by regex-automata's own data tables,
/// which are only compiled when regex-syntax's tables aren't available.
///
/// Keep this cfg in sync with the one guarding `perl_word` in
/// src/util/unicode_data/mod.rs.
#[cfg(all(
    feature = "unicode-word-boundary",
    not(all(feature = "syntax", feature = "unicode-perl")),
))]
mod is_word_char {
    use crate::util::utf8;

    /// Always succeeds in this configuration.
    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
        Ok(())
    }

    /// Returns true when the bytes starting at `at` decode to a codepoint
    /// that is a word character. An empty remainder or a decoding error
    /// yields `false`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(super) fn fwd(
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
        let is_word = match utf8::decode(&haystack[at..]) {
            Some(Ok(ch)) => is_word_character(ch),
            Some(Err(_)) | None => false,
        };
        Ok(is_word)
    }

    /// Returns true when the bytes ending at `at` decode (from the back) to
    /// a codepoint that is a word character. An empty prefix or a decoding
    /// error yields `false`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(super) fn rev(
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
        let is_word = match utf8::decode_last(&haystack[..at]) {
            Some(Ok(ch)) => is_word_character(ch),
            Some(Err(_)) | None => false,
        };
        Ok(is_word)
    }

    /// Reports whether `c` is a word character: first via the byte-level
    /// check when `c` fits in a `u8`, otherwise by searching the `PERL_WORD`
    /// range table.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_word_character(c: char) -> bool {
        use core::cmp::Ordering;

        use crate::util::unicode_data::perl_word::PERL_WORD;

        let is_byte_word = match u8::try_from(c) {
            Ok(byte) => utf8::is_word_byte(byte),
            Err(_) => false,
        };
        if is_byte_word {
            return true;
        }
        // Each table entry is an inclusive (start, end) range; the ordering
        // returned tells the search where the range sits relative to `c`.
        let found = PERL_WORD.binary_search_by(|&(start, end)| {
            if c < start {
                Ordering::Greater
            } else if c > end {
                Ordering::Less
            } else {
                Ordering::Equal
            }
        });
        found.is_ok()
    }
}
1672
1673/// A module that always returns an error if Unicode word boundaries are
1674/// disabled. When this feature is disabled, then regex-automata will not
1675/// include its own data tables even if regex-syntax is disabled.
1676#[cfg(not(feature = "unicode-word-boundary"))]
1677mod is_word_char {
1678    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1679        Err(super::UnicodeWordBoundaryError::new())
1680    }
1681
1682    #[cfg_attr(feature = "perf-inline", inline(always))]
1683    pub(super) fn fwd(
1684        _bytes: &[u8],
1685        _at: usize,
1686    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1687        Err(super::UnicodeWordBoundaryError::new())
1688    }
1689
1690    #[cfg_attr(feature = "perf-inline", inline(always))]
1691    pub(super) fn rev(
1692        _bytes: &[u8],
1693        _at: usize,
1694    ) -> Result<bool, super::UnicodeWordBoundaryError> {
1695        Err(super::UnicodeWordBoundaryError::new())
1696    }
1697}
1698
1699#[cfg(test)]
1700mod tests {
1701    use super::*;
1702
1703    macro_rules! testlook {
1704        ($look:expr, $haystack:expr, $at:expr) => {
1705            LookMatcher::default().matches($look, $haystack.as_bytes(), $at)
1706        };
1707    }
1708
1709    #[test]
1710    fn look_matches_start_line() {
1711        let look = Look::StartLF;
1712
1713        assert!(testlook!(look, "", 0));
1714        assert!(testlook!(look, "\n", 0));
1715        assert!(testlook!(look, "\n", 1));
1716        assert!(testlook!(look, "a", 0));
1717        assert!(testlook!(look, "\na", 1));
1718
1719        assert!(!testlook!(look, "a", 1));
1720        assert!(!testlook!(look, "a\na", 1));
1721    }
1722
1723    #[test]
1724    fn look_matches_end_line() {
1725        let look = Look::EndLF;
1726
1727        assert!(testlook!(look, "", 0));
1728        assert!(testlook!(look, "\n", 1));
1729        assert!(testlook!(look, "\na", 0));
1730        assert!(testlook!(look, "\na", 2));
1731        assert!(testlook!(look, "a\na", 1));
1732
1733        assert!(!testlook!(look, "a", 0));
1734        assert!(!testlook!(look, "\na", 1));
1735        assert!(!testlook!(look, "a\na", 0));
1736        assert!(!testlook!(look, "a\na", 2));
1737    }
1738
1739    #[test]
1740    fn look_matches_start_text() {
1741        let look = Look::Start;
1742
1743        assert!(testlook!(look, "", 0));
1744        assert!(testlook!(look, "\n", 0));
1745        assert!(testlook!(look, "a", 0));
1746
1747        assert!(!testlook!(look, "\n", 1));
1748        assert!(!testlook!(look, "\na", 1));
1749        assert!(!testlook!(look, "a", 1));
1750        assert!(!testlook!(look, "a\na", 1));
1751    }
1752
1753    #[test]
1754    fn look_matches_end_text() {
1755        let look = Look::End;
1756
1757        assert!(testlook!(look, "", 0));
1758        assert!(testlook!(look, "\n", 1));
1759        assert!(testlook!(look, "\na", 2));
1760
1761        assert!(!testlook!(look, "\na", 0));
1762        assert!(!testlook!(look, "a\na", 1));
1763        assert!(!testlook!(look, "a", 0));
1764        assert!(!testlook!(look, "\na", 1));
1765        assert!(!testlook!(look, "a\na", 0));
1766        assert!(!testlook!(look, "a\na", 2));
1767    }
1768
1769    #[test]
1770    #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
1771    fn look_matches_word_unicode() {
1772        let look = Look::WordUnicode;
1773
1774        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1775        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1776
1777        // Simple ASCII word boundaries.
1778        assert!(testlook!(look, "a", 0));
1779        assert!(testlook!(look, "a", 1));
1780        assert!(testlook!(look, "a ", 1));
1781        assert!(testlook!(look, " a ", 1));
1782        assert!(testlook!(look, " a ", 2));
1783
1784        // Unicode word boundaries with a non-ASCII codepoint.
1785        assert!(testlook!(look, "𝛃", 0));
1786        assert!(testlook!(look, "𝛃", 4));
1787        assert!(testlook!(look, "𝛃 ", 4));
1788        assert!(testlook!(look, " 𝛃 ", 1));
1789        assert!(testlook!(look, " 𝛃 ", 5));
1790
1791        // Unicode word boundaries between non-ASCII codepoints.
1792        assert!(testlook!(look, "𝛃𐆀", 0));
1793        assert!(testlook!(look, "𝛃𐆀", 4));
1794
1795        // Non word boundaries for ASCII.
1796        assert!(!testlook!(look, "", 0));
1797        assert!(!testlook!(look, "ab", 1));
1798        assert!(!testlook!(look, "a ", 2));
1799        assert!(!testlook!(look, " a ", 0));
1800        assert!(!testlook!(look, " a ", 3));
1801
1802        // Non word boundaries with a non-ASCII codepoint.
1803        assert!(!testlook!(look, "𝛃b", 4));
1804        assert!(!testlook!(look, "𝛃 ", 5));
1805        assert!(!testlook!(look, " 𝛃 ", 0));
1806        assert!(!testlook!(look, " 𝛃 ", 6));
1807        assert!(!testlook!(look, "𝛃", 1));
1808        assert!(!testlook!(look, "𝛃", 2));
1809        assert!(!testlook!(look, "𝛃", 3));
1810
1811        // Non word boundaries with non-ASCII codepoints.
1812        assert!(!testlook!(look, "𝛃𐆀", 1));
1813        assert!(!testlook!(look, "𝛃𐆀", 2));
1814        assert!(!testlook!(look, "𝛃𐆀", 3));
1815        assert!(!testlook!(look, "𝛃𐆀", 5));
1816        assert!(!testlook!(look, "𝛃𐆀", 6));
1817        assert!(!testlook!(look, "𝛃𐆀", 7));
1818        assert!(!testlook!(look, "𝛃𐆀", 8));
1819    }
1820
1821    #[test]
1822    fn look_matches_word_ascii() {
1823        let look = Look::WordAscii;
1824
1825        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1826        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1827
1828        // Simple ASCII word boundaries.
1829        assert!(testlook!(look, "a", 0));
1830        assert!(testlook!(look, "a", 1));
1831        assert!(testlook!(look, "a ", 1));
1832        assert!(testlook!(look, " a ", 1));
1833        assert!(testlook!(look, " a ", 2));
1834
1835        // Unicode word boundaries with a non-ASCII codepoint. Since this is
1836        // an ASCII word boundary, none of these match.
1837        assert!(!testlook!(look, "𝛃", 0));
1838        assert!(!testlook!(look, "𝛃", 4));
1839        assert!(!testlook!(look, "𝛃 ", 4));
1840        assert!(!testlook!(look, " 𝛃 ", 1));
1841        assert!(!testlook!(look, " 𝛃 ", 5));
1842
1843        // Unicode word boundaries between non-ASCII codepoints. Again, since
1844        // this is an ASCII word boundary, none of these match.
1845        assert!(!testlook!(look, "𝛃𐆀", 0));
1846        assert!(!testlook!(look, "𝛃𐆀", 4));
1847
1848        // Non word boundaries for ASCII.
1849        assert!(!testlook!(look, "", 0));
1850        assert!(!testlook!(look, "ab", 1));
1851        assert!(!testlook!(look, "a ", 2));
1852        assert!(!testlook!(look, " a ", 0));
1853        assert!(!testlook!(look, " a ", 3));
1854
1855        // Non word boundaries with a non-ASCII codepoint.
1856        assert!(testlook!(look, "𝛃b", 4));
1857        assert!(!testlook!(look, "𝛃 ", 5));
1858        assert!(!testlook!(look, " 𝛃 ", 0));
1859        assert!(!testlook!(look, " 𝛃 ", 6));
1860        assert!(!testlook!(look, "𝛃", 1));
1861        assert!(!testlook!(look, "𝛃", 2));
1862        assert!(!testlook!(look, "𝛃", 3));
1863
1864        // Non word boundaries with non-ASCII codepoints.
1865        assert!(!testlook!(look, "𝛃𐆀", 1));
1866        assert!(!testlook!(look, "𝛃𐆀", 2));
1867        assert!(!testlook!(look, "𝛃𐆀", 3));
1868        assert!(!testlook!(look, "𝛃𐆀", 5));
1869        assert!(!testlook!(look, "𝛃𐆀", 6));
1870        assert!(!testlook!(look, "𝛃𐆀", 7));
1871        assert!(!testlook!(look, "𝛃𐆀", 8));
1872    }
1873
1874    #[test]
1875    #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
1876    fn look_matches_word_unicode_negate() {
1877        let look = Look::WordUnicodeNegate;
1878
1879        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1880        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1881
1882        // Simple ASCII word boundaries.
1883        assert!(!testlook!(look, "a", 0));
1884        assert!(!testlook!(look, "a", 1));
1885        assert!(!testlook!(look, "a ", 1));
1886        assert!(!testlook!(look, " a ", 1));
1887        assert!(!testlook!(look, " a ", 2));
1888
1889        // Unicode word boundaries with a non-ASCII codepoint.
1890        assert!(!testlook!(look, "𝛃", 0));
1891        assert!(!testlook!(look, "𝛃", 4));
1892        assert!(!testlook!(look, "𝛃 ", 4));
1893        assert!(!testlook!(look, " 𝛃 ", 1));
1894        assert!(!testlook!(look, " 𝛃 ", 5));
1895
1896        // Unicode word boundaries between non-ASCII codepoints.
1897        assert!(!testlook!(look, "𝛃𐆀", 0));
1898        assert!(!testlook!(look, "𝛃𐆀", 4));
1899
1900        // Non word boundaries for ASCII.
1901        assert!(testlook!(look, "", 0));
1902        assert!(testlook!(look, "ab", 1));
1903        assert!(testlook!(look, "a ", 2));
1904        assert!(testlook!(look, " a ", 0));
1905        assert!(testlook!(look, " a ", 3));
1906
1907        // Non word boundaries with a non-ASCII codepoint.
1908        assert!(testlook!(look, "𝛃b", 4));
1909        assert!(testlook!(look, "𝛃 ", 5));
1910        assert!(testlook!(look, " 𝛃 ", 0));
1911        assert!(testlook!(look, " 𝛃 ", 6));
1912        // These don't match because they could otherwise return an offset that
1913        // splits the UTF-8 encoding of a codepoint.
1914        assert!(!testlook!(look, "𝛃", 1));
1915        assert!(!testlook!(look, "𝛃", 2));
1916        assert!(!testlook!(look, "𝛃", 3));
1917
1918        // Non word boundaries with non-ASCII codepoints. These also don't
1919        // match because they could otherwise return an offset that splits the
1920        // UTF-8 encoding of a codepoint.
1921        assert!(!testlook!(look, "𝛃𐆀", 1));
1922        assert!(!testlook!(look, "𝛃𐆀", 2));
1923        assert!(!testlook!(look, "𝛃𐆀", 3));
1924        assert!(!testlook!(look, "𝛃𐆀", 5));
1925        assert!(!testlook!(look, "𝛃𐆀", 6));
1926        assert!(!testlook!(look, "𝛃𐆀", 7));
1927        // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end
1928        // of the haystack. So the "end" of the haystack isn't a word and 𐆀
1929        // isn't a word, thus, \B matches.
1930        assert!(testlook!(look, "𝛃𐆀", 8));
1931    }
1932
1933    #[test]
1934    fn look_matches_word_ascii_negate() {
1935        let look = Look::WordAsciiNegate;
1936
1937        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1938        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1939
1940        // Simple ASCII word boundaries.
1941        assert!(!testlook!(look, "a", 0));
1942        assert!(!testlook!(look, "a", 1));
1943        assert!(!testlook!(look, "a ", 1));
1944        assert!(!testlook!(look, " a ", 1));
1945        assert!(!testlook!(look, " a ", 2));
1946
1947        // Unicode word boundaries with a non-ASCII codepoint. Since this is
1948        // an ASCII word boundary, none of these match.
1949        assert!(testlook!(look, "𝛃", 0));
1950        assert!(testlook!(look, "𝛃", 4));
1951        assert!(testlook!(look, "𝛃 ", 4));
1952        assert!(testlook!(look, " 𝛃 ", 1));
1953        assert!(testlook!(look, " 𝛃 ", 5));
1954
1955        // Unicode word boundaries between non-ASCII codepoints. Again, since
1956        // this is an ASCII word boundary, none of these match.
1957        assert!(testlook!(look, "𝛃𐆀", 0));
1958        assert!(testlook!(look, "𝛃𐆀", 4));
1959
1960        // Non word boundaries for ASCII.
1961        assert!(testlook!(look, "", 0));
1962        assert!(testlook!(look, "ab", 1));
1963        assert!(testlook!(look, "a ", 2));
1964        assert!(testlook!(look, " a ", 0));
1965        assert!(testlook!(look, " a ", 3));
1966
1967        // Non word boundaries with a non-ASCII codepoint.
1968        assert!(!testlook!(look, "𝛃b", 4));
1969        assert!(testlook!(look, "𝛃 ", 5));
1970        assert!(testlook!(look, " 𝛃 ", 0));
1971        assert!(testlook!(look, " 𝛃 ", 6));
1972        assert!(testlook!(look, "𝛃", 1));
1973        assert!(testlook!(look, "𝛃", 2));
1974        assert!(testlook!(look, "𝛃", 3));
1975
1976        // Non word boundaries with non-ASCII codepoints.
1977        assert!(testlook!(look, "𝛃𐆀", 1));
1978        assert!(testlook!(look, "𝛃𐆀", 2));
1979        assert!(testlook!(look, "𝛃𐆀", 3));
1980        assert!(testlook!(look, "𝛃𐆀", 5));
1981        assert!(testlook!(look, "𝛃𐆀", 6));
1982        assert!(testlook!(look, "𝛃𐆀", 7));
1983        assert!(testlook!(look, "𝛃𐆀", 8));
1984    }
1985
1986    #[test]
1987    fn look_matches_word_start_ascii() {
1988        let look = Look::WordStartAscii;
1989
1990        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1991        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1992
1993        // Simple ASCII word boundaries.
1994        assert!(testlook!(look, "a", 0));
1995        assert!(!testlook!(look, "a", 1));
1996        assert!(!testlook!(look, "a ", 1));
1997        assert!(testlook!(look, " a ", 1));
1998        assert!(!testlook!(look, " a ", 2));
1999
2000        // Unicode word boundaries with a non-ASCII codepoint. Since this is
2001        // an ASCII word boundary, none of these match.
2002        assert!(!testlook!(look, "𝛃", 0));
2003        assert!(!testlook!(look, "𝛃", 4));
2004        assert!(!testlook!(look, "𝛃 ", 4));
2005        assert!(!testlook!(look, " 𝛃 ", 1));
2006        assert!(!testlook!(look, " 𝛃 ", 5));
2007
2008        // Unicode word boundaries between non-ASCII codepoints. Again, since
2009        // this is an ASCII word boundary, none of these match.
2010        assert!(!testlook!(look, "𝛃𐆀", 0));
2011        assert!(!testlook!(look, "𝛃𐆀", 4));
2012
2013        // Non word boundaries for ASCII.
2014        assert!(!testlook!(look, "", 0));
2015        assert!(!testlook!(look, "ab", 1));
2016        assert!(!testlook!(look, "a ", 2));
2017        assert!(!testlook!(look, " a ", 0));
2018        assert!(!testlook!(look, " a ", 3));
2019
2020        // Non word boundaries with a non-ASCII codepoint.
2021        assert!(testlook!(look, "𝛃b", 4));
2022        assert!(!testlook!(look, "b𝛃", 1));
2023        assert!(!testlook!(look, "𝛃 ", 5));
2024        assert!(!testlook!(look, " 𝛃 ", 0));
2025        assert!(!testlook!(look, " 𝛃 ", 6));
2026        assert!(!testlook!(look, "𝛃", 1));
2027        assert!(!testlook!(look, "𝛃", 2));
2028        assert!(!testlook!(look, "𝛃", 3));
2029
2030        // Non word boundaries with non-ASCII codepoints.
2031        assert!(!testlook!(look, "𝛃𐆀", 1));
2032        assert!(!testlook!(look, "𝛃𐆀", 2));
2033        assert!(!testlook!(look, "𝛃𐆀", 3));
2034        assert!(!testlook!(look, "𝛃𐆀", 5));
2035        assert!(!testlook!(look, "𝛃𐆀", 6));
2036        assert!(!testlook!(look, "𝛃𐆀", 7));
2037        assert!(!testlook!(look, "𝛃𐆀", 8));
2038    }
2039
2040    #[test]
2041    fn look_matches_word_end_ascii() {
2042        let look = Look::WordEndAscii;
2043
2044        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2045        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2046
2047        // Simple ASCII word boundaries.
2048        assert!(!testlook!(look, "a", 0));
2049        assert!(testlook!(look, "a", 1));
2050        assert!(testlook!(look, "a ", 1));
2051        assert!(!testlook!(look, " a ", 1));
2052        assert!(testlook!(look, " a ", 2));
2053
2054        // Unicode word boundaries with a non-ASCII codepoint. Since this is
2055        // an ASCII word boundary, none of these match.
2056        assert!(!testlook!(look, "𝛃", 0));
2057        assert!(!testlook!(look, "𝛃", 4));
2058        assert!(!testlook!(look, "𝛃 ", 4));
2059        assert!(!testlook!(look, " 𝛃 ", 1));
2060        assert!(!testlook!(look, " 𝛃 ", 5));
2061
2062        // Unicode word boundaries between non-ASCII codepoints. Again, since
2063        // this is an ASCII word boundary, none of these match.
2064        assert!(!testlook!(look, "𝛃𐆀", 0));
2065        assert!(!testlook!(look, "𝛃𐆀", 4));
2066
2067        // Non word boundaries for ASCII.
2068        assert!(!testlook!(look, "", 0));
2069        assert!(!testlook!(look, "ab", 1));
2070        assert!(!testlook!(look, "a ", 2));
2071        assert!(!testlook!(look, " a ", 0));
2072        assert!(!testlook!(look, " a ", 3));
2073
2074        // Non word boundaries with a non-ASCII codepoint.
2075        assert!(!testlook!(look, "𝛃b", 4));
2076        assert!(testlook!(look, "b𝛃", 1));
2077        assert!(!testlook!(look, "𝛃 ", 5));
2078        assert!(!testlook!(look, " 𝛃 ", 0));
2079        assert!(!testlook!(look, " 𝛃 ", 6));
2080        assert!(!testlook!(look, "𝛃", 1));
2081        assert!(!testlook!(look, "𝛃", 2));
2082        assert!(!testlook!(look, "𝛃", 3));
2083
2084        // Non word boundaries with non-ASCII codepoints.
2085        assert!(!testlook!(look, "𝛃𐆀", 1));
2086        assert!(!testlook!(look, "𝛃𐆀", 2));
2087        assert!(!testlook!(look, "𝛃𐆀", 3));
2088        assert!(!testlook!(look, "𝛃𐆀", 5));
2089        assert!(!testlook!(look, "𝛃𐆀", 6));
2090        assert!(!testlook!(look, "𝛃𐆀", 7));
2091        assert!(!testlook!(look, "𝛃𐆀", 8));
2092    }
2093
2094    #[test]
2095    #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2096    fn look_matches_word_start_unicode() {
2097        let look = Look::WordStartUnicode;
2098
2099        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2100        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2101
2102        // Simple ASCII word boundaries.
2103        assert!(testlook!(look, "a", 0));
2104        assert!(!testlook!(look, "a", 1));
2105        assert!(!testlook!(look, "a ", 1));
2106        assert!(testlook!(look, " a ", 1));
2107        assert!(!testlook!(look, " a ", 2));
2108
2109        // Unicode word boundaries with a non-ASCII codepoint.
2110        assert!(testlook!(look, "𝛃", 0));
2111        assert!(!testlook!(look, "𝛃", 4));
2112        assert!(!testlook!(look, "𝛃 ", 4));
2113        assert!(testlook!(look, " 𝛃 ", 1));
2114        assert!(!testlook!(look, " 𝛃 ", 5));
2115
2116        // Unicode word boundaries between non-ASCII codepoints.
2117        assert!(testlook!(look, "𝛃𐆀", 0));
2118        assert!(!testlook!(look, "𝛃𐆀", 4));
2119
2120        // Non word boundaries for ASCII.
2121        assert!(!testlook!(look, "", 0));
2122        assert!(!testlook!(look, "ab", 1));
2123        assert!(!testlook!(look, "a ", 2));
2124        assert!(!testlook!(look, " a ", 0));
2125        assert!(!testlook!(look, " a ", 3));
2126
2127        // Non word boundaries with a non-ASCII codepoint.
2128        assert!(!testlook!(look, "𝛃b", 4));
2129        assert!(!testlook!(look, "b𝛃", 1));
2130        assert!(!testlook!(look, "𝛃 ", 5));
2131        assert!(!testlook!(look, " 𝛃 ", 0));
2132        assert!(!testlook!(look, " 𝛃 ", 6));
2133        assert!(!testlook!(look, "𝛃", 1));
2134        assert!(!testlook!(look, "𝛃", 2));
2135        assert!(!testlook!(look, "𝛃", 3));
2136
2137        // Non word boundaries with non-ASCII codepoints.
2138        assert!(!testlook!(look, "𝛃𐆀", 1));
2139        assert!(!testlook!(look, "𝛃𐆀", 2));
2140        assert!(!testlook!(look, "𝛃𐆀", 3));
2141        assert!(!testlook!(look, "𝛃𐆀", 5));
2142        assert!(!testlook!(look, "𝛃𐆀", 6));
2143        assert!(!testlook!(look, "𝛃𐆀", 7));
2144        assert!(!testlook!(look, "𝛃𐆀", 8));
2145    }
2146
2147    #[test]
2148    #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2149    fn look_matches_word_end_unicode() {
2150        let look = Look::WordEndUnicode;
2151
2152        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2153        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2154
2155        // Simple ASCII word boundaries.
2156        assert!(!testlook!(look, "a", 0));
2157        assert!(testlook!(look, "a", 1));
2158        assert!(testlook!(look, "a ", 1));
2159        assert!(!testlook!(look, " a ", 1));
2160        assert!(testlook!(look, " a ", 2));
2161
2162        // Unicode word boundaries with a non-ASCII codepoint.
2163        assert!(!testlook!(look, "𝛃", 0));
2164        assert!(testlook!(look, "𝛃", 4));
2165        assert!(testlook!(look, "𝛃 ", 4));
2166        assert!(!testlook!(look, " 𝛃 ", 1));
2167        assert!(testlook!(look, " 𝛃 ", 5));
2168
2169        // Unicode word boundaries between non-ASCII codepoints.
2170        assert!(!testlook!(look, "𝛃𐆀", 0));
2171        assert!(testlook!(look, "𝛃𐆀", 4));
2172
2173        // Non word boundaries for ASCII.
2174        assert!(!testlook!(look, "", 0));
2175        assert!(!testlook!(look, "ab", 1));
2176        assert!(!testlook!(look, "a ", 2));
2177        assert!(!testlook!(look, " a ", 0));
2178        assert!(!testlook!(look, " a ", 3));
2179
2180        // Non word boundaries with a non-ASCII codepoint.
2181        assert!(!testlook!(look, "𝛃b", 4));
2182        assert!(!testlook!(look, "b𝛃", 1));
2183        assert!(!testlook!(look, "𝛃 ", 5));
2184        assert!(!testlook!(look, " 𝛃 ", 0));
2185        assert!(!testlook!(look, " 𝛃 ", 6));
2186        assert!(!testlook!(look, "𝛃", 1));
2187        assert!(!testlook!(look, "𝛃", 2));
2188        assert!(!testlook!(look, "𝛃", 3));
2189
2190        // Non word boundaries with non-ASCII codepoints.
2191        assert!(!testlook!(look, "𝛃𐆀", 1));
2192        assert!(!testlook!(look, "𝛃𐆀", 2));
2193        assert!(!testlook!(look, "𝛃𐆀", 3));
2194        assert!(!testlook!(look, "𝛃𐆀", 5));
2195        assert!(!testlook!(look, "𝛃𐆀", 6));
2196        assert!(!testlook!(look, "𝛃𐆀", 7));
2197        assert!(!testlook!(look, "𝛃𐆀", 8));
2198    }
2199
2200    #[test]
2201    fn look_matches_word_start_half_ascii() {
2202        let look = Look::WordStartHalfAscii;
2203
2204        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2205        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2206
2207        // Simple ASCII word boundaries.
2208        assert!(testlook!(look, "a", 0));
2209        assert!(!testlook!(look, "a", 1));
2210        assert!(!testlook!(look, "a ", 1));
2211        assert!(testlook!(look, " a ", 1));
2212        assert!(!testlook!(look, " a ", 2));
2213
2214        // Unicode word boundaries with a non-ASCII codepoint. Since this is
2215        // an ASCII word boundary, none of these match.
2216        assert!(testlook!(look, "𝛃", 0));
2217        assert!(testlook!(look, "𝛃", 4));
2218        assert!(testlook!(look, "𝛃 ", 4));
2219        assert!(testlook!(look, " 𝛃 ", 1));
2220        assert!(testlook!(look, " 𝛃 ", 5));
2221
2222        // Unicode word boundaries between non-ASCII codepoints. Again, since
2223        // this is an ASCII word boundary, none of these match.
2224        assert!(testlook!(look, "𝛃𐆀", 0));
2225        assert!(testlook!(look, "𝛃𐆀", 4));
2226
2227        // Non word boundaries for ASCII.
2228        assert!(testlook!(look, "", 0));
2229        assert!(!testlook!(look, "ab", 1));
2230        assert!(testlook!(look, "a ", 2));
2231        assert!(testlook!(look, " a ", 0));
2232        assert!(testlook!(look, " a ", 3));
2233
2234        // Non word boundaries with a non-ASCII codepoint.
2235        assert!(testlook!(look, "𝛃b", 4));
2236        assert!(!testlook!(look, "b𝛃", 1));
2237        assert!(testlook!(look, "𝛃 ", 5));
2238        assert!(testlook!(look, " 𝛃 ", 0));
2239        assert!(testlook!(look, " 𝛃 ", 6));
2240        assert!(testlook!(look, "𝛃", 1));
2241        assert!(testlook!(look, "𝛃", 2));
2242        assert!(testlook!(look, "𝛃", 3));
2243
2244        // Non word boundaries with non-ASCII codepoints.
2245        assert!(testlook!(look, "𝛃𐆀", 1));
2246        assert!(testlook!(look, "𝛃𐆀", 2));
2247        assert!(testlook!(look, "𝛃𐆀", 3));
2248        assert!(testlook!(look, "𝛃𐆀", 5));
2249        assert!(testlook!(look, "𝛃𐆀", 6));
2250        assert!(testlook!(look, "𝛃𐆀", 7));
2251        assert!(testlook!(look, "𝛃𐆀", 8));
2252    }
2253
2254    #[test]
2255    fn look_matches_word_end_half_ascii() {
2256        let look = Look::WordEndHalfAscii;
2257
2258        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2259        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2260
2261        // Simple ASCII word boundaries.
2262        assert!(!testlook!(look, "a", 0));
2263        assert!(testlook!(look, "a", 1));
2264        assert!(testlook!(look, "a ", 1));
2265        assert!(!testlook!(look, " a ", 1));
2266        assert!(testlook!(look, " a ", 2));
2267
2268        // Unicode word boundaries with a non-ASCII codepoint. Since this is
2269        // an ASCII word boundary, none of these match.
2270        assert!(testlook!(look, "𝛃", 0));
2271        assert!(testlook!(look, "𝛃", 4));
2272        assert!(testlook!(look, "𝛃 ", 4));
2273        assert!(testlook!(look, " 𝛃 ", 1));
2274        assert!(testlook!(look, " 𝛃 ", 5));
2275
2276        // Unicode word boundaries between non-ASCII codepoints. Again, since
2277        // this is an ASCII word boundary, none of these match.
2278        assert!(testlook!(look, "𝛃𐆀", 0));
2279        assert!(testlook!(look, "𝛃𐆀", 4));
2280
2281        // Non word boundaries for ASCII.
2282        assert!(testlook!(look, "", 0));
2283        assert!(!testlook!(look, "ab", 1));
2284        assert!(testlook!(look, "a ", 2));
2285        assert!(testlook!(look, " a ", 0));
2286        assert!(testlook!(look, " a ", 3));
2287
2288        // Non word boundaries with a non-ASCII codepoint.
2289        assert!(!testlook!(look, "𝛃b", 4));
2290        assert!(testlook!(look, "b𝛃", 1));
2291        assert!(testlook!(look, "𝛃 ", 5));
2292        assert!(testlook!(look, " 𝛃 ", 0));
2293        assert!(testlook!(look, " 𝛃 ", 6));
2294        assert!(testlook!(look, "𝛃", 1));
2295        assert!(testlook!(look, "𝛃", 2));
2296        assert!(testlook!(look, "𝛃", 3));
2297
2298        // Non word boundaries with non-ASCII codepoints.
2299        assert!(testlook!(look, "𝛃𐆀", 1));
2300        assert!(testlook!(look, "𝛃𐆀", 2));
2301        assert!(testlook!(look, "𝛃𐆀", 3));
2302        assert!(testlook!(look, "𝛃𐆀", 5));
2303        assert!(testlook!(look, "𝛃𐆀", 6));
2304        assert!(testlook!(look, "𝛃𐆀", 7));
2305        assert!(testlook!(look, "𝛃𐆀", 8));
2306    }
2307
2308    #[test]
2309    #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2310    fn look_matches_word_start_half_unicode() {
2311        let look = Look::WordStartHalfUnicode;
2312
2313        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2314        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2315
2316        // Simple ASCII word boundaries.
2317        assert!(testlook!(look, "a", 0));
2318        assert!(!testlook!(look, "a", 1));
2319        assert!(!testlook!(look, "a ", 1));
2320        assert!(testlook!(look, " a ", 1));
2321        assert!(!testlook!(look, " a ", 2));
2322
2323        // Unicode word boundaries with a non-ASCII codepoint.
2324        assert!(testlook!(look, "𝛃", 0));
2325        assert!(!testlook!(look, "𝛃", 4));
2326        assert!(!testlook!(look, "𝛃 ", 4));
2327        assert!(testlook!(look, " 𝛃 ", 1));
2328        assert!(!testlook!(look, " 𝛃 ", 5));
2329
2330        // Unicode word boundaries between non-ASCII codepoints.
2331        assert!(testlook!(look, "𝛃𐆀", 0));
2332        assert!(!testlook!(look, "𝛃𐆀", 4));
2333
2334        // Non word boundaries for ASCII.
2335        assert!(testlook!(look, "", 0));
2336        assert!(!testlook!(look, "ab", 1));
2337        assert!(testlook!(look, "a ", 2));
2338        assert!(testlook!(look, " a ", 0));
2339        assert!(testlook!(look, " a ", 3));
2340
2341        // Non word boundaries with a non-ASCII codepoint.
2342        assert!(!testlook!(look, "𝛃b", 4));
2343        assert!(!testlook!(look, "b𝛃", 1));
2344        assert!(testlook!(look, "𝛃 ", 5));
2345        assert!(testlook!(look, " 𝛃 ", 0));
2346        assert!(testlook!(look, " 𝛃 ", 6));
2347        assert!(!testlook!(look, "𝛃", 1));
2348        assert!(!testlook!(look, "𝛃", 2));
2349        assert!(!testlook!(look, "𝛃", 3));
2350
2351        // Non word boundaries with non-ASCII codepoints.
2352        assert!(!testlook!(look, "𝛃𐆀", 1));
2353        assert!(!testlook!(look, "𝛃𐆀", 2));
2354        assert!(!testlook!(look, "𝛃𐆀", 3));
2355        assert!(!testlook!(look, "𝛃𐆀", 5));
2356        assert!(!testlook!(look, "𝛃𐆀", 6));
2357        assert!(!testlook!(look, "𝛃𐆀", 7));
2358        assert!(testlook!(look, "𝛃𐆀", 8));
2359    }
2360
2361    #[test]
2362    #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2363    fn look_matches_word_end_half_unicode() {
2364        let look = Look::WordEndHalfUnicode;
2365
2366        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2367        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2368
2369        // Simple ASCII word boundaries.
2370        assert!(!testlook!(look, "a", 0));
2371        assert!(testlook!(look, "a", 1));
2372        assert!(testlook!(look, "a ", 1));
2373        assert!(!testlook!(look, " a ", 1));
2374        assert!(testlook!(look, " a ", 2));
2375
2376        // Unicode word boundaries with a non-ASCII codepoint.
2377        assert!(!testlook!(look, "𝛃", 0));
2378        assert!(testlook!(look, "𝛃", 4));
2379        assert!(testlook!(look, "𝛃 ", 4));
2380        assert!(!testlook!(look, " 𝛃 ", 1));
2381        assert!(testlook!(look, " 𝛃 ", 5));
2382
2383        // Unicode word boundaries between non-ASCII codepoints.
2384        assert!(!testlook!(look, "𝛃𐆀", 0));
2385        assert!(testlook!(look, "𝛃𐆀", 4));
2386
2387        // Non word boundaries for ASCII.
2388        assert!(testlook!(look, "", 0));
2389        assert!(!testlook!(look, "ab", 1));
2390        assert!(testlook!(look, "a ", 2));
2391        assert!(testlook!(look, " a ", 0));
2392        assert!(testlook!(look, " a ", 3));
2393
2394        // Non word boundaries with a non-ASCII codepoint.
2395        assert!(!testlook!(look, "𝛃b", 4));
2396        assert!(!testlook!(look, "b𝛃", 1));
2397        assert!(testlook!(look, "𝛃 ", 5));
2398        assert!(testlook!(look, " 𝛃 ", 0));
2399        assert!(testlook!(look, " 𝛃 ", 6));
2400        assert!(!testlook!(look, "𝛃", 1));
2401        assert!(!testlook!(look, "𝛃", 2));
2402        assert!(!testlook!(look, "𝛃", 3));
2403
2404        // Non word boundaries with non-ASCII codepoints.
2405        assert!(!testlook!(look, "𝛃𐆀", 1));
2406        assert!(!testlook!(look, "𝛃𐆀", 2));
2407        assert!(!testlook!(look, "𝛃𐆀", 3));
2408        assert!(!testlook!(look, "𝛃𐆀", 5));
2409        assert!(!testlook!(look, "𝛃𐆀", 6));
2410        assert!(!testlook!(look, "𝛃𐆀", 7));
2411        assert!(testlook!(look, "𝛃𐆀", 8));
2412    }
2413
2414    #[test]
2415    fn look_set() {
2416        let mut f = LookSet::default();
2417        assert!(!f.contains(Look::Start));
2418        assert!(!f.contains(Look::End));
2419        assert!(!f.contains(Look::StartLF));
2420        assert!(!f.contains(Look::EndLF));
2421        assert!(!f.contains(Look::WordUnicode));
2422        assert!(!f.contains(Look::WordUnicodeNegate));
2423        assert!(!f.contains(Look::WordAscii));
2424        assert!(!f.contains(Look::WordAsciiNegate));
2425
2426        f = f.insert(Look::Start);
2427        assert!(f.contains(Look::Start));
2428        f = f.remove(Look::Start);
2429        assert!(!f.contains(Look::Start));
2430
2431        f = f.insert(Look::End);
2432        assert!(f.contains(Look::End));
2433        f = f.remove(Look::End);
2434        assert!(!f.contains(Look::End));
2435
2436        f = f.insert(Look::StartLF);
2437        assert!(f.contains(Look::StartLF));
2438        f = f.remove(Look::StartLF);
2439        assert!(!f.contains(Look::StartLF));
2440
2441        f = f.insert(Look::EndLF);
2442        assert!(f.contains(Look::EndLF));
2443        f = f.remove(Look::EndLF);
2444        assert!(!f.contains(Look::EndLF));
2445
2446        f = f.insert(Look::StartCRLF);
2447        assert!(f.contains(Look::StartCRLF));
2448        f = f.remove(Look::StartCRLF);
2449        assert!(!f.contains(Look::StartCRLF));
2450
2451        f = f.insert(Look::EndCRLF);
2452        assert!(f.contains(Look::EndCRLF));
2453        f = f.remove(Look::EndCRLF);
2454        assert!(!f.contains(Look::EndCRLF));
2455
2456        f = f.insert(Look::WordUnicode);
2457        assert!(f.contains(Look::WordUnicode));
2458        f = f.remove(Look::WordUnicode);
2459        assert!(!f.contains(Look::WordUnicode));
2460
2461        f = f.insert(Look::WordUnicodeNegate);
2462        assert!(f.contains(Look::WordUnicodeNegate));
2463        f = f.remove(Look::WordUnicodeNegate);
2464        assert!(!f.contains(Look::WordUnicodeNegate));
2465
2466        f = f.insert(Look::WordAscii);
2467        assert!(f.contains(Look::WordAscii));
2468        f = f.remove(Look::WordAscii);
2469        assert!(!f.contains(Look::WordAscii));
2470
2471        f = f.insert(Look::WordAsciiNegate);
2472        assert!(f.contains(Look::WordAsciiNegate));
2473        f = f.remove(Look::WordAsciiNegate);
2474        assert!(!f.contains(Look::WordAsciiNegate));
2475
2476        f = f.insert(Look::WordStartAscii);
2477        assert!(f.contains(Look::WordStartAscii));
2478        f = f.remove(Look::WordStartAscii);
2479        assert!(!f.contains(Look::WordStartAscii));
2480
2481        f = f.insert(Look::WordEndAscii);
2482        assert!(f.contains(Look::WordEndAscii));
2483        f = f.remove(Look::WordEndAscii);
2484        assert!(!f.contains(Look::WordEndAscii));
2485
2486        f = f.insert(Look::WordStartUnicode);
2487        assert!(f.contains(Look::WordStartUnicode));
2488        f = f.remove(Look::WordStartUnicode);
2489        assert!(!f.contains(Look::WordStartUnicode));
2490
2491        f = f.insert(Look::WordEndUnicode);
2492        assert!(f.contains(Look::WordEndUnicode));
2493        f = f.remove(Look::WordEndUnicode);
2494        assert!(!f.contains(Look::WordEndUnicode));
2495
2496        f = f.insert(Look::WordStartHalfAscii);
2497        assert!(f.contains(Look::WordStartHalfAscii));
2498        f = f.remove(Look::WordStartHalfAscii);
2499        assert!(!f.contains(Look::WordStartHalfAscii));
2500
2501        f = f.insert(Look::WordEndHalfAscii);
2502        assert!(f.contains(Look::WordEndHalfAscii));
2503        f = f.remove(Look::WordEndHalfAscii);
2504        assert!(!f.contains(Look::WordEndHalfAscii));
2505
2506        f = f.insert(Look::WordStartHalfUnicode);
2507        assert!(f.contains(Look::WordStartHalfUnicode));
2508        f = f.remove(Look::WordStartHalfUnicode);
2509        assert!(!f.contains(Look::WordStartHalfUnicode));
2510
2511        f = f.insert(Look::WordEndHalfUnicode);
2512        assert!(f.contains(Look::WordEndHalfUnicode));
2513        f = f.remove(Look::WordEndHalfUnicode);
2514        assert!(!f.contains(Look::WordEndHalfUnicode));
2515    }
2516
2517    #[test]
2518    fn look_set_iter() {
2519        let set = LookSet::empty();
2520        assert_eq!(0, set.iter().count());
2521
2522        let set = LookSet::full();
2523        assert_eq!(18, set.iter().count());
2524
2525        let set =
2526            LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
2527        assert_eq!(2, set.iter().count());
2528
2529        let set = LookSet::empty().insert(Look::StartLF);
2530        assert_eq!(1, set.iter().count());
2531
2532        let set = LookSet::empty().insert(Look::WordAsciiNegate);
2533        assert_eq!(1, set.iter().count());
2534
2535        let set = LookSet::empty().insert(Look::WordEndHalfUnicode);
2536        assert_eq!(1, set.iter().count());
2537    }
2538
2539    #[test]
2540    #[cfg(feature = "alloc")]
2541    fn look_set_debug() {
2542        let res = alloc::format!("{:?}", LookSet::empty());
2543        assert_eq!("∅", res);
2544        let res = alloc::format!("{:?}", LookSet::full());
2545        assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
2546    }
2547}
regex_automata/util/look.rs

regex_automata/util/
look.rs