fst/automaton/
mod.rs

1#[cfg(feature = "levenshtein")]
2pub use self::levenshtein::{Levenshtein, LevenshteinError};
3
4#[cfg(feature = "levenshtein")]
5mod levenshtein;
6
7/// Automaton describes types that behave as a finite automaton.
8///
9/// All implementors of this trait are represented by *byte based* automata.
10/// Stated differently, all transitions in the automata correspond to a single
11/// byte in the input.
12///
13/// This implementation choice is important for a couple reasons:
14///
15/// 1. The set of possible transitions in each node is small, which may make
16///    efficient memory usage easier.
17/// 2. The finite state transducers in this crate are all byte based, so any
18///    automata used on them must also be byte based.
19///
20/// In practice, this does present somewhat of a problem, for example, if
21/// you're storing UTF-8 encoded strings in a finite state transducer. Consider
22/// using a `Levenshtein` automaton, which accepts a query string and an edit
23/// distance. The edit distance should apply to some notion of *character*,
24/// which could be represented by at least 1-4 bytes in a UTF-8 encoding (for
25/// some definition of "character"). Therefore, the automaton must have UTF-8
26/// decoding built into it. This can be tricky to implement, so you may find
27/// the [`utf8-ranges`](https://crates.io/crates/utf8-ranges) crate useful.
28pub trait Automaton {
29    /// The type of the state used in the automaton.
30    type State;
31
32    /// Returns a single start state for this automaton.
33    ///
34    /// This method should always return the same value for each
35    /// implementation.
36    fn start(&self) -> Self::State;
37
38    /// Returns true if and only if `state` is a match state.
39    fn is_match(&self, state: &Self::State) -> bool;
40
41    /// Returns true if and only if `state` can lead to a match in zero or more
42    /// steps.
43    ///
44    /// If this returns `false`, then no sequence of inputs from this state
45    /// should ever produce a match. If this does not follow, then those match
46    /// states may never be reached. In other words, behavior may be incorrect.
47    ///
48    /// If this returns `true` even when no match is possible, then behavior
49    /// will be correct, but callers may be forced to do additional work.
50    fn can_match(&self, _state: &Self::State) -> bool {
51        true
52    }
53
54    /// Returns true if and only if `state` matches and must match no matter
55    /// what steps are taken.
56    ///
57    /// If this returns `true`, then every sequence of inputs from this state
58    /// produces a match. If this does not follow, then those match states may
59    /// never be reached. In other words, behavior may be incorrect.
60    ///
61    /// If this returns `false` even when every sequence of inputs will lead to
62    /// a match, then behavior will be correct, but callers may be forced to do
63    /// additional work.
64    fn will_always_match(&self, _state: &Self::State) -> bool {
65        false
66    }
67
68    /// Return the next state given `state` and an input.
69    fn accept(&self, state: &Self::State, byte: u8) -> Self::State;
70
71    /// If applicable, return the next state when the end of a key is seen.
72    fn accept_eof(&self, _: &Self::State) -> Option<Self::State> {
73        None
74    }
75
76    /// Returns an automaton that matches the strings that start with something
77    /// this automaton matches.
78    fn starts_with(self) -> StartsWith<Self>
79    where
80        Self: Sized,
81    {
82        StartsWith(self)
83    }
84
85    /// Returns an automaton that matches the strings matched by either this or
86    /// the other automaton.
87    fn union<Rhs: Automaton>(self, rhs: Rhs) -> Union<Self, Rhs>
88    where
89        Self: Sized,
90    {
91        Union(self, rhs)
92    }
93
94    /// Returns an automaton that matches the strings matched by both this and
95    /// the other automaton.
96    fn intersection<Rhs: Automaton>(self, rhs: Rhs) -> Intersection<Self, Rhs>
97    where
98        Self: Sized,
99    {
100        Intersection(self, rhs)
101    }
102
103    /// Returns an automaton that matches the strings not matched by this
104    /// automaton.
105    fn complement(self) -> Complement<Self>
106    where
107        Self: Sized,
108    {
109        Complement(self)
110    }
111}
112
113impl<'a, T: Automaton> Automaton for &'a T {
114    type State = T::State;
115
116    fn start(&self) -> T::State {
117        (*self).start()
118    }
119
120    fn is_match(&self, state: &T::State) -> bool {
121        (*self).is_match(state)
122    }
123
124    fn can_match(&self, state: &T::State) -> bool {
125        (*self).can_match(state)
126    }
127
128    fn will_always_match(&self, state: &T::State) -> bool {
129        (*self).will_always_match(state)
130    }
131
132    fn accept(&self, state: &T::State, byte: u8) -> T::State {
133        (*self).accept(state, byte)
134    }
135
136    fn accept_eof(&self, state: &Self::State) -> Option<Self::State> {
137        (*self).accept_eof(state)
138    }
139}
140
141/// An automaton that matches if the input equals to a specific string.
142///
143/// It can be used in combination with [`StartsWith`] to search strings
144/// starting with a given prefix.
145///
146/// ```rust
147/// extern crate fst;
148///
149/// use fst::{Automaton, IntoStreamer, Streamer, Set};
150/// use fst::automaton::Str;
151///
152/// # fn main() { example().unwrap(); }
153/// fn example() -> Result<(), Box<dyn std::error::Error>> {
154///     let paths = vec!["/home/projects/bar", "/home/projects/foo", "/tmp/foo"];
155///     let set = Set::from_iter(paths)?;
156///
157///     // Build our prefix query.
158///     let prefix = Str::new("/home").starts_with();
159///
160///     // Apply our query to the set we built.
161///     let mut stream = set.search(prefix).into_stream();
162///
163///     let matches = stream.into_strs()?;
164///     assert_eq!(matches, vec!["/home/projects/bar", "/home/projects/foo"]);
165///     Ok(())
166/// }
167/// ```
168#[derive(Clone, Debug)]
169pub struct Str<'a> {
170    string: &'a [u8],
171}
172
173impl<'a> Str<'a> {
174    /// Constructs automaton that matches an exact string.
175    #[inline]
176    pub fn new(string: &'a str) -> Str<'a> {
177        Str { string: string.as_bytes() }
178    }
179}
180
181impl<'a> Automaton for Str<'a> {
182    type State = Option<usize>;
183
184    #[inline]
185    fn start(&self) -> Option<usize> {
186        Some(0)
187    }
188
189    #[inline]
190    fn is_match(&self, pos: &Option<usize>) -> bool {
191        *pos == Some(self.string.len())
192    }
193
194    #[inline]
195    fn can_match(&self, pos: &Option<usize>) -> bool {
196        pos.is_some()
197    }
198
199    #[inline]
200    fn accept(&self, pos: &Option<usize>, byte: u8) -> Option<usize> {
201        // if we aren't already past the end...
202        if let Some(pos) = *pos {
203            // and there is still a matching byte at the current position...
204            if self.string.get(pos).cloned() == Some(byte) {
205                // then move forward
206                return Some(pos + 1);
207            }
208        }
209        // otherwise we're either past the end or didn't match the byte
210        None
211    }
212}
213
214/// An automaton that matches if the input contains a specific subsequence.
215///
216/// It can be used to build a simple fuzzy-finder.
217///
218/// ```rust
219/// extern crate fst;
220///
221/// use fst::{IntoStreamer, Streamer, Set};
222/// use fst::automaton::Subsequence;
223///
224/// # fn main() { example().unwrap(); }
225/// fn example() -> Result<(), Box<dyn std::error::Error>> {
226///     let paths = vec!["/home/projects/bar", "/home/projects/foo", "/tmp/foo"];
227///     let set = Set::from_iter(paths)?;
228///
229///     // Build our fuzzy query.
230///     let subseq = Subsequence::new("hpf");
231///
232///     // Apply our fuzzy query to the set we built.
233///     let mut stream = set.search(subseq).into_stream();
234///
235///     let matches = stream.into_strs()?;
236///     assert_eq!(matches, vec!["/home/projects/foo"]);
237///     Ok(())
238/// }
239/// ```
240#[derive(Clone, Debug)]
241pub struct Subsequence<'a> {
242    subseq: &'a [u8],
243}
244
245impl<'a> Subsequence<'a> {
246    /// Constructs automaton that matches input containing the
247    /// specified subsequence.
248    #[inline]
249    pub fn new(subsequence: &'a str) -> Subsequence<'a> {
250        Subsequence { subseq: subsequence.as_bytes() }
251    }
252}
253
254impl<'a> Automaton for Subsequence<'a> {
255    type State = usize;
256
257    #[inline]
258    fn start(&self) -> usize {
259        0
260    }
261
262    #[inline]
263    fn is_match(&self, &state: &usize) -> bool {
264        state == self.subseq.len()
265    }
266
267    #[inline]
268    fn can_match(&self, _: &usize) -> bool {
269        true
270    }
271
272    #[inline]
273    fn will_always_match(&self, &state: &usize) -> bool {
274        state == self.subseq.len()
275    }
276
277    #[inline]
278    fn accept(&self, &state: &usize, byte: u8) -> usize {
279        if state == self.subseq.len() {
280            return state;
281        }
282        state + (byte == self.subseq[state]) as usize
283    }
284}
285
286/// An automaton that always matches.
287///
288/// This is useful in a generic context as a way to express that no automaton
289/// should be used.
290#[derive(Clone, Debug)]
291pub struct AlwaysMatch;
292
293impl Automaton for AlwaysMatch {
294    type State = ();
295
296    #[inline]
297    fn start(&self) -> () {
298        ()
299    }
300    #[inline]
301    fn is_match(&self, _: &()) -> bool {
302        true
303    }
304    #[inline]
305    fn can_match(&self, _: &()) -> bool {
306        true
307    }
308    #[inline]
309    fn will_always_match(&self, _: &()) -> bool {
310        true
311    }
312    #[inline]
313    fn accept(&self, _: &(), _: u8) -> () {
314        ()
315    }
316}
317
318/// An automaton that matches a string that begins with something that the
319/// wrapped automaton matches.
320#[derive(Clone, Debug)]
321pub struct StartsWith<A>(A);
322
323/// The `Automaton` state for `StartsWith<A>`.
324pub struct StartsWithState<A: Automaton>(StartsWithStateKind<A>);
325
326enum StartsWithStateKind<A: Automaton> {
327    Done,
328    Running(A::State),
329}
330
331impl<A: Automaton> Automaton for StartsWith<A> {
332    type State = StartsWithState<A>;
333
334    fn start(&self) -> StartsWithState<A> {
335        StartsWithState({
336            let inner = self.0.start();
337            if self.0.is_match(&inner) {
338                StartsWithStateKind::Done
339            } else {
340                StartsWithStateKind::Running(inner)
341            }
342        })
343    }
344
345    fn is_match(&self, state: &StartsWithState<A>) -> bool {
346        match state.0 {
347            StartsWithStateKind::Done => true,
348            StartsWithStateKind::Running(_) => false,
349        }
350    }
351
352    fn can_match(&self, state: &StartsWithState<A>) -> bool {
353        match state.0 {
354            StartsWithStateKind::Done => true,
355            StartsWithStateKind::Running(ref inner) => self.0.can_match(inner),
356        }
357    }
358
359    fn will_always_match(&self, state: &StartsWithState<A>) -> bool {
360        match state.0 {
361            StartsWithStateKind::Done => true,
362            StartsWithStateKind::Running(_) => false,
363        }
364    }
365
366    fn accept(
367        &self,
368        state: &StartsWithState<A>,
369        byte: u8,
370    ) -> StartsWithState<A> {
371        StartsWithState(match state.0 {
372            StartsWithStateKind::Done => StartsWithStateKind::Done,
373            StartsWithStateKind::Running(ref inner) => {
374                let next_inner = self.0.accept(inner, byte);
375                if self.0.is_match(&next_inner) {
376                    StartsWithStateKind::Done
377                } else {
378                    StartsWithStateKind::Running(next_inner)
379                }
380            }
381        })
382    }
383}
384
385/// An automaton that matches when one of its component automata match.
386#[derive(Clone, Debug)]
387pub struct Union<A, B>(A, B);
388
389/// The `Automaton` state for `Union<A, B>`.
390pub struct UnionState<A: Automaton, B: Automaton>(A::State, B::State);
391
392impl<A: Automaton, B: Automaton> Automaton for Union<A, B> {
393    type State = UnionState<A, B>;
394
395    fn start(&self) -> UnionState<A, B> {
396        UnionState(self.0.start(), self.1.start())
397    }
398
399    fn is_match(&self, state: &UnionState<A, B>) -> bool {
400        self.0.is_match(&state.0) || self.1.is_match(&state.1)
401    }
402
403    fn can_match(&self, state: &UnionState<A, B>) -> bool {
404        self.0.can_match(&state.0) || self.1.can_match(&state.1)
405    }
406
407    fn will_always_match(&self, state: &UnionState<A, B>) -> bool {
408        self.0.will_always_match(&state.0)
409            || self.1.will_always_match(&state.1)
410    }
411
412    fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> {
413        UnionState(
414            self.0.accept(&state.0, byte),
415            self.1.accept(&state.1, byte),
416        )
417    }
418}
419
420/// An automaton that matches when both of its component automata match.
421#[derive(Clone, Debug)]
422pub struct Intersection<A, B>(A, B);
423
424/// The `Automaton` state for `Intersection<A, B>`.
425pub struct IntersectionState<A: Automaton, B: Automaton>(A::State, B::State);
426
427impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> {
428    type State = IntersectionState<A, B>;
429
430    fn start(&self) -> IntersectionState<A, B> {
431        IntersectionState(self.0.start(), self.1.start())
432    }
433
434    fn is_match(&self, state: &IntersectionState<A, B>) -> bool {
435        self.0.is_match(&state.0) && self.1.is_match(&state.1)
436    }
437
438    fn can_match(&self, state: &IntersectionState<A, B>) -> bool {
439        self.0.can_match(&state.0) && self.1.can_match(&state.1)
440    }
441
442    fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool {
443        self.0.will_always_match(&state.0)
444            && self.1.will_always_match(&state.1)
445    }
446
447    fn accept(
448        &self,
449        state: &IntersectionState<A, B>,
450        byte: u8,
451    ) -> IntersectionState<A, B> {
452        IntersectionState(
453            self.0.accept(&state.0, byte),
454            self.1.accept(&state.1, byte),
455        )
456    }
457}
458
459/// An automaton that matches exactly when the automaton it wraps does not.
460#[derive(Clone, Debug)]
461pub struct Complement<A>(A);
462
463/// The `Automaton` state for `Complement<A>`.
464pub struct ComplementState<A: Automaton>(A::State);
465
466impl<A: Automaton> Automaton for Complement<A> {
467    type State = ComplementState<A>;
468
469    fn start(&self) -> ComplementState<A> {
470        ComplementState(self.0.start())
471    }
472
473    fn is_match(&self, state: &ComplementState<A>) -> bool {
474        !self.0.is_match(&state.0)
475    }
476
477    fn can_match(&self, state: &ComplementState<A>) -> bool {
478        !self.0.will_always_match(&state.0)
479    }
480
481    fn will_always_match(&self, state: &ComplementState<A>) -> bool {
482        !self.0.can_match(&state.0)
483    }
484
485    fn accept(
486        &self,
487        state: &ComplementState<A>,
488        byte: u8,
489    ) -> ComplementState<A> {
490        ComplementState(self.0.accept(&state.0, byte))
491    }
492}