regex_syntax/hir/
interval.rs

1use core::{char, cmp, fmt::Debug, slice};
2
3use alloc::vec::Vec;
4
5use crate::unicode;
6
7// This module contains an *internal* implementation of interval sets.
8//
// The primary invariant that interval sets guard is canonical ordering. That
// is, every interval set contains an ordered sequence of intervals where
// no two intervals are overlapping or adjacent. While this invariant is
// occasionally broken within the implementation, it should be impossible for
// callers to observe it being broken.
14//
15// Since case folding (as implemented below) breaks that invariant, we roll
16// that into this API even though it is a little out of place in an otherwise
17// generic interval set. (Hence the reason why the `unicode` module is imported
18// here.)
19//
20// Some of the implementation complexity here is a result of me wanting to
21// preserve the sequential representation without using additional memory.
22// In many cases, we do use linear extra memory, but it is at most 2x and it
23// is amortized. If we relaxed the memory requirements, this implementation
24// could become much simpler. The extra memory is honestly probably OK, but
25// character classes (especially of the Unicode variety) can become quite
26// large, and it would be nice to keep regex compilation snappy even in debug
27// builds. (In the past, I have been careless with this area of code and it has
28// caused slow regex compilations in debug mode, so this isn't entirely
29// unwarranted.)
30//
31// Tests on this are relegated to the public API of HIR in src/hir.rs.
32
/// A set of intervals of type `I`, stored as a sequence of ranges.
#[derive(Debug, Clone)]
pub struct IntervalSet<I> {
    /// The ranges of this set: sorted and non-overlapping.
    ranges: Vec<I>,
    /// A hint recording whether this set is known to be case folded. It is
    /// not needed for correctness; it only lets us skip redundant work when
    /// a set that has already been case folded is asked to fold again.
    ///
    /// The flag survives the pairwise set operations: when both operands
    /// are case folded, the result of a union, intersection, difference or
    /// symmetric difference is case folded as well.
    ///
    /// The flag is one-sided. When it is `true`, the set *must* be case
    /// folded. When it is `false`, the set *may* still be case folded: we
    /// only set it to `true` when we know that to be the case, and accept
    /// `false` whenever working out the real answer would be costly. Code
    /// must therefore never treat `false` as proof that the set is not case
    /// folded.
    ///
    /// Bottom line: this is a performance optimization.
    folded: bool,
}
55
56impl<I: Interval> Eq for IntervalSet<I> {}
57
58// We implement PartialEq manually so that we don't consider the set's internal
59// 'folded' property to be part of its identity. The 'folded' property is
60// strictly an optimization.
61impl<I: Interval> PartialEq for IntervalSet<I> {
62    fn eq(&self, other: &IntervalSet<I>) -> bool {
63        self.ranges.eq(&other.ranges)
64    }
65}
66
67impl<I: Interval> IntervalSet<I> {
68    /// Create a new set from a sequence of intervals. Each interval is
69    /// specified as a pair of bounds, where both bounds are inclusive.
70    ///
71    /// The given ranges do not need to be in any specific order, and ranges
72    /// may overlap.
73    pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
74        let ranges: Vec<I> = intervals.into_iter().collect();
75        // An empty set is case folded.
76        let folded = ranges.is_empty();
77        let mut set = IntervalSet { ranges, folded };
78        set.canonicalize();
79        set
80    }
81
82    /// Add a new interval to this set.
83    pub fn push(&mut self, mut interval: I) {
84        let Err(i) = self.ranges.binary_search(&interval) else {
85            // Exact match, `interval` is already in the set.
86            return;
87        };
88
89        // The search finds us the first index where the previous interval
90        // start is less than or equal to the new interval start. Since the
91        // existing intervals are non-overlapping we only need to try to union
92        // this single preceding interval
93        let mut start = i;
94        if let Some(before_i) = i.checked_sub(1) {
95            let before = &self.ranges[before_i];
96            if let Some(union) = before.union(&interval) {
97                interval = union;
98                start = before_i;
99            }
100        }
101        // `interval` may overlap any number of intervals following the
102        // insertion point so will union each of them until we reach the
103        // first non-overlapping interval
104        let mut end = i;
105        for after_i in i..self.ranges.len() {
106            let after = &self.ranges[after_i];
107            let Some(union) = interval.union(after) else {
108                break;
109            };
110            interval = union;
111            end = after_i + 1;
112        }
113        self.ranges.splice(start..end, core::iter::once(interval));
114
115        // We don't know whether the new interval added here is considered
116        // case folded, so we conservatively assume that the entire set is
117        // no longer case folded if it was previously.
118        self.folded = false;
119    }
120
121    /// Return an iterator over all intervals in this set.
122    ///
123    /// The iterator yields intervals in ascending order.
124    pub fn iter(&self) -> IntervalSetIter<'_, I> {
125        IntervalSetIter(self.ranges.iter())
126    }
127
128    /// Return an immutable slice of intervals in this set.
129    ///
130    /// The sequence returned is in canonical ordering.
131    pub fn intervals(&self) -> &[I] {
132        &self.ranges
133    }
134
135    /// Expand this interval set such that it contains all case folded
136    /// characters. For example, if this class consists of the range `a-z`,
137    /// then applying case folding will result in the class containing both the
138    /// ranges `a-z` and `A-Z`.
139    ///
140    /// This returns an error if the necessary case mapping data is not
141    /// available.
142    pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
143        if self.folded {
144            return Ok(());
145        }
146        let len = self.ranges.len();
147        for i in 0..len {
148            let range = self.ranges[i];
149            if let Err(err) = range.case_fold_simple(&mut self.ranges) {
150                self.canonicalize();
151                return Err(err);
152            }
153        }
154        self.canonicalize();
155        self.folded = true;
156        Ok(())
157    }
158
159    /// Union this set with the given set, in place.
160    pub fn union(&mut self, other: &IntervalSet<I>) {
161        if other.ranges.is_empty() || self.ranges == other.ranges {
162            return;
163        }
164        // This could almost certainly be done more efficiently.
165        self.ranges.extend(&other.ranges);
166        self.canonicalize();
167        self.folded = self.folded && other.folded;
168    }
169
170    /// Intersect this set with the given set, in place.
171    pub fn intersect(&mut self, other: &IntervalSet<I>) {
172        if self.ranges.is_empty() {
173            return;
174        }
175        if other.ranges.is_empty() {
176            self.ranges.clear();
177            // An empty set is case folded.
178            self.folded = true;
179            return;
180        }
181
182        // There should be a way to do this in-place with constant memory,
183        // but I couldn't figure out a simple way to do it. So just append
184        // the intersection to the end of this range, and then drain it before
185        // we're done.
186        let drain_end = self.ranges.len();
187
188        let mut ita = 0..drain_end;
189        let mut itb = 0..other.ranges.len();
190        let mut a = ita.next().unwrap();
191        let mut b = itb.next().unwrap();
192        loop {
193            if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
194                self.ranges.push(ab);
195            }
196            let (it, aorb) =
197                if self.ranges[a].upper() < other.ranges[b].upper() {
198                    (&mut ita, &mut a)
199                } else {
200                    (&mut itb, &mut b)
201                };
202            match it.next() {
203                Some(v) => *aorb = v,
204                None => break,
205            }
206        }
207        self.ranges.drain(..drain_end);
208        self.folded = self.folded && other.folded;
209    }
210
211    /// Subtract the given set from this set, in place.
212    pub fn difference(&mut self, other: &IntervalSet<I>) {
213        if self.ranges.is_empty() || other.ranges.is_empty() {
214            return;
215        }
216
217        // This algorithm is (to me) surprisingly complex. A search of the
218        // interwebs indicate that this is a potentially interesting problem.
219        // Folks seem to suggest interval or segment trees, but I'd like to
220        // avoid the overhead (both runtime and conceptual) of that.
221        //
222        // The following is basically my Shitty First Draft. Therefore, in
223        // order to grok it, you probably need to read each line carefully.
224        // Simplifications are most welcome!
225        //
226        // Remember, we can assume the canonical format invariant here, which
227        // says that all ranges are sorted, not overlapping and not adjacent in
228        // each class.
229        let drain_end = self.ranges.len();
230        let (mut a, mut b) = (0, 0);
231        'LOOP: while a < drain_end && b < other.ranges.len() {
232            // Basically, the easy cases are when neither range overlaps with
233            // each other. If the `b` range is less than our current `a`
234            // range, then we can skip it and move on.
235            if other.ranges[b].upper() < self.ranges[a].lower() {
236                b += 1;
237                continue;
238            }
239            // ... similarly for the `a` range. If it's less than the smallest
240            // `b` range, then we can add it as-is.
241            if self.ranges[a].upper() < other.ranges[b].lower() {
242                let range = self.ranges[a];
243                self.ranges.push(range);
244                a += 1;
245                continue;
246            }
247            // Otherwise, we have overlapping ranges.
248            assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
249
250            // This part is tricky and was non-obvious to me without looking
251            // at explicit examples (see the tests). The trickiness stems from
252            // two things: 1) subtracting a range from another range could
253            // yield two ranges and 2) after subtracting a range, it's possible
254            // that future ranges can have an impact. The loop below advances
255            // the `b` ranges until they can't possible impact the current
256            // range.
257            //
258            // For example, if our `a` range is `a-t` and our next three `b`
259            // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
260            // subtraction three times before moving on to the next `a` range.
261            let mut range = self.ranges[a];
262            while b < other.ranges.len()
263                && !range.is_intersection_empty(&other.ranges[b])
264            {
265                let old_range = range;
266                range = match range.difference(&other.ranges[b]) {
267                    (None, None) => {
268                        // We lost the entire range, so move on to the next
269                        // without adding this one.
270                        a += 1;
271                        continue 'LOOP;
272                    }
273                    (Some(range1), None) | (None, Some(range1)) => range1,
274                    (Some(range1), Some(range2)) => {
275                        self.ranges.push(range1);
276                        range2
277                    }
278                };
279                // It's possible that the `b` range has more to contribute
280                // here. In particular, if it is greater than the original
281                // range, then it might impact the next `a` range *and* it
282                // has impacted the current `a` range as much as possible,
283                // so we can quit. We don't bump `b` so that the next `a`
284                // range can apply it.
285                if other.ranges[b].upper() > old_range.upper() {
286                    break;
287                }
288                // Otherwise, the next `b` range might apply to the current
289                // `a` range.
290                b += 1;
291            }
292            self.ranges.push(range);
293            a += 1;
294        }
295        while a < drain_end {
296            let range = self.ranges[a];
297            self.ranges.push(range);
298            a += 1;
299        }
300        self.ranges.drain(..drain_end);
301        self.folded = self.folded && other.folded;
302    }
303
304    /// Compute the symmetric difference of the two sets, in place.
305    ///
306    /// This computes the symmetric difference of two interval sets. This
307    /// removes all elements in this set that are also in the given set,
308    /// but also adds all elements from the given set that aren't in this
309    /// set. That is, the set will contain all elements in either set,
310    /// but will not contain any elements that are in both sets.
311    pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
312        // TODO(burntsushi): Fix this so that it amortizes allocation.
313        let mut intersection = self.clone();
314        intersection.intersect(other);
315        self.union(other);
316        self.difference(&intersection);
317    }
318
319    /// Negate this interval set.
320    ///
321    /// For all `x` where `x` is any element, if `x` was in this set, then it
322    /// will not be in this set after negation.
323    pub fn negate(&mut self) {
324        if self.ranges.is_empty() {
325            let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
326            self.ranges.push(I::create(min, max));
327            // The set containing everything must case folded.
328            self.folded = true;
329            return;
330        }
331
332        // There should be a way to do this in-place with constant memory,
333        // but I couldn't figure out a simple way to do it. So just append
334        // the negation to the end of this range, and then drain it before
335        // we're done.
336        let drain_end = self.ranges.len();
337
338        // We do checked arithmetic below because of the canonical ordering
339        // invariant.
340        if self.ranges[0].lower() > I::Bound::min_value() {
341            let upper = self.ranges[0].lower().decrement();
342            self.ranges.push(I::create(I::Bound::min_value(), upper));
343        }
344        for i in 1..drain_end {
345            let lower = self.ranges[i - 1].upper().increment();
346            let upper = self.ranges[i].lower().decrement();
347            self.ranges.push(I::create(lower, upper));
348        }
349        if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
350            let lower = self.ranges[drain_end - 1].upper().increment();
351            self.ranges.push(I::create(lower, I::Bound::max_value()));
352        }
353        self.ranges.drain(..drain_end);
354        // We don't need to update whether this set is folded or not, because
355        // it is conservatively preserved through negation. Namely, if a set
356        // is not folded, then it is possible that its negation is folded, for
357        // example, [^☃]. But we're fine with assuming that the set is not
358        // folded in that case. (`folded` permits false negatives but not false
359        // positives.)
360        //
361        // But what about when a set is folded, is its negation also
362        // necessarily folded? Yes. Because if a set is folded, then for every
363        // character in the set, it necessarily included its equivalence class
364        // of case folded characters. Negating it in turn means that all
365        // equivalence classes in the set are negated, and any equivalence
366        // class that was previously not in the set is now entirely in the set.
367    }
368
369    /// Converts this set into a canonical ordering.
370    fn canonicalize(&mut self) {
371        if self.is_canonical() {
372            return;
373        }
374        self.ranges.sort();
375        assert!(!self.ranges.is_empty());
376
377        // Is there a way to do this in-place with constant memory? I couldn't
378        // figure out a way to do it. So just append the canonicalization to
379        // the end of this range, and then drain it before we're done.
380        let drain_end = self.ranges.len();
381        for oldi in 0..drain_end {
382            // If we've added at least one new range, then check if we can
383            // merge this range in the previously added range.
384            if self.ranges.len() > drain_end {
385                let (last, rest) = self.ranges.split_last_mut().unwrap();
386                if let Some(union) = last.union(&rest[oldi]) {
387                    *last = union;
388                    continue;
389                }
390            }
391            let range = self.ranges[oldi];
392            self.ranges.push(range);
393        }
394        self.ranges.drain(..drain_end);
395    }
396
397    /// Returns true if and only if this class is in a canonical ordering.
398    fn is_canonical(&self) -> bool {
399        for pair in self.ranges.windows(2) {
400            if pair[0] >= pair[1] {
401                return false;
402            }
403            if pair[0].is_contiguous(&pair[1]) {
404                return false;
405            }
406        }
407        true
408    }
409}
410
/// An iterator over the intervals of an interval set.
///
/// This is a thin wrapper around a slice iterator and yields references to
/// the intervals in the order in which the slice stores them.
#[derive(Debug)]
pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);

impl<'a, I> Iterator for IntervalSetIter<'a, I> {
    type Item = &'a I;

    /// Advance the wrapped slice iterator and hand back whatever it yields.
    fn next(&mut self) -> Option<Self::Item> {
        let IntervalSetIter(inner) = self;
        inner.next()
    }
}
422
423pub trait Interval:
424    Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
425{
426    type Bound: Bound;
427
428    fn lower(&self) -> Self::Bound;
429    fn upper(&self) -> Self::Bound;
430    fn set_lower(&mut self, bound: Self::Bound);
431    fn set_upper(&mut self, bound: Self::Bound);
432    fn case_fold_simple(
433        &self,
434        intervals: &mut Vec<Self>,
435    ) -> Result<(), unicode::CaseFoldError>;
436
437    /// Create a new interval.
438    fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
439        let mut int = Self::default();
440        if lower <= upper {
441            int.set_lower(lower);
442            int.set_upper(upper);
443        } else {
444            int.set_lower(upper);
445            int.set_upper(lower);
446        }
447        int
448    }
449
450    /// Union the given overlapping range into this range.
451    ///
452    /// If the two ranges aren't contiguous, then this returns `None`.
453    fn union(&self, other: &Self) -> Option<Self> {
454        if !self.is_contiguous(other) {
455            return None;
456        }
457        let lower = cmp::min(self.lower(), other.lower());
458        let upper = cmp::max(self.upper(), other.upper());
459        Some(Self::create(lower, upper))
460    }
461
462    /// Intersect this range with the given range and return the result.
463    ///
464    /// If the intersection is empty, then this returns `None`.
465    fn intersect(&self, other: &Self) -> Option<Self> {
466        let lower = cmp::max(self.lower(), other.lower());
467        let upper = cmp::min(self.upper(), other.upper());
468        if lower <= upper {
469            Some(Self::create(lower, upper))
470        } else {
471            None
472        }
473    }
474
475    /// Subtract the given range from this range and return the resulting
476    /// ranges.
477    ///
478    /// If subtraction would result in an empty range, then no ranges are
479    /// returned.
480    fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
481        if self.is_subset(other) {
482            return (None, None);
483        }
484        if self.is_intersection_empty(other) {
485            return (Some(self.clone()), None);
486        }
487        let add_lower = other.lower() > self.lower();
488        let add_upper = other.upper() < self.upper();
489        // We know this because !self.is_subset(other) and the ranges have
490        // a non-empty intersection.
491        assert!(add_lower || add_upper);
492        let mut ret = (None, None);
493        if add_lower {
494            let upper = other.lower().decrement();
495            ret.0 = Some(Self::create(self.lower(), upper));
496        }
497        if add_upper {
498            let lower = other.upper().increment();
499            let range = Self::create(lower, self.upper());
500            if ret.0.is_none() {
501                ret.0 = Some(range);
502            } else {
503                ret.1 = Some(range);
504            }
505        }
506        ret
507    }
508
509    /// Returns true if and only if the two ranges are contiguous. Two ranges
510    /// are contiguous if and only if the ranges are either overlapping or
511    /// adjacent.
512    fn is_contiguous(&self, other: &Self) -> bool {
513        let lower1 = self.lower().as_u32();
514        let upper1 = self.upper().as_u32();
515        let lower2 = other.lower().as_u32();
516        let upper2 = other.upper().as_u32();
517        cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1)
518    }
519
520    /// Returns true if and only if the intersection of this range and the
521    /// other range is empty.
522    fn is_intersection_empty(&self, other: &Self) -> bool {
523        let (lower1, upper1) = (self.lower(), self.upper());
524        let (lower2, upper2) = (other.lower(), other.upper());
525        cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
526    }
527
528    /// Returns true if and only if this range is a subset of the other range.
529    fn is_subset(&self, other: &Self) -> bool {
530        let (lower1, upper1) = (self.lower(), self.upper());
531        let (lower2, upper2) = (other.lower(), other.upper());
532        (lower2 <= lower1 && lower1 <= upper2)
533            && (lower2 <= upper1 && upper1 <= upper2)
534    }
535}
536
/// An endpoint of an interval.
///
/// A bound type has a smallest and a largest value, can be viewed as a
/// `u32`, and can be stepped to its neighbouring values.
pub trait Bound:
    Clone + Copy + Debug + Eq + Ord + PartialEq + PartialOrd
{
    /// Returns the smallest value of this type.
    fn min_value() -> Self;
    /// Returns the largest value of this type.
    fn max_value() -> Self;
    /// Returns this value as a `u32`.
    fn as_u32(self) -> u32;
    /// Returns the value that immediately follows this one.
    fn increment(self) -> Self;
    /// Returns the value that immediately precedes this one.
    fn decrement(self) -> Self;
}
546
547impl Bound for u8 {
548    fn min_value() -> Self {
549        u8::MIN
550    }
551    fn max_value() -> Self {
552        u8::MAX
553    }
554    fn as_u32(self) -> u32 {
555        u32::from(self)
556    }
557    fn increment(self) -> Self {
558        self.checked_add(1).unwrap()
559    }
560    fn decrement(self) -> Self {
561        self.checked_sub(1).unwrap()
562    }
563}
564
565impl Bound for char {
566    fn min_value() -> Self {
567        '\x00'
568    }
569    fn max_value() -> Self {
570        '\u{10FFFF}'
571    }
572    fn as_u32(self) -> u32 {
573        u32::from(self)
574    }
575
576    fn increment(self) -> Self {
577        match self {
578            '\u{D7FF}' => '\u{E000}',
579            c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(),
580        }
581    }
582
583    fn decrement(self) -> Self {
584        match self {
585            '\u{E000}' => '\u{D7FF}',
586            c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(),
587        }
588    }
589}
590
591// Tests for interval sets are written in src/hir.rs against the public API.
regex_syntax/hir/interval.rs

regex_syntax/hir/
interval.rs