fst/
set.rs

1use std::fmt;
2use std::io;
3use std::iter::{self, FromIterator};
4
5use crate::automaton::{AlwaysMatch, Automaton};
6use crate::raw;
7use crate::stream::{IntoStreamer, Streamer};
8use crate::Result;
9
10/// Set is a lexicographically ordered set of byte strings.
11///
12/// A `Set` is constructed with the `SetBuilder` type. Alternatively, a `Set`
13/// can be constructed in memory from a lexicographically ordered iterator
14/// of byte strings (`Set::from_iter`).
15///
16/// A key feature of `Set` is that it can be serialized to disk compactly. Its
17/// underlying representation is built such that the `Set` can be memory mapped
18/// and searched without necessarily loading the entire set into memory.
19///
20/// It supports most common operations associated with sets, such as
21/// membership, union, intersection, subset/superset, etc. It also supports
22/// range queries and automata based searches (e.g. a regular expression).
23///
24/// Sets are represented by a finite state transducer where output values are
25/// always zero. As such, sets have the following invariants:
26///
27/// 1. Once constructed, a `Set` can never be modified.
28/// 2. Sets must be constructed with lexicographically ordered byte sequences.
29#[derive(Clone)]
30pub struct Set<D>(raw::Fst<D>);
31
32impl Set<Vec<u8>> {
33    /// Create a `Set` from an iterator of lexicographically ordered byte
34    /// strings.
35    ///
36    /// If the iterator does not yield values in lexicographic order, then an
37    /// error is returned.
38    ///
39    /// Note that this is a convenience function to build a set in memory.
40    /// To build a set that streams to an arbitrary `io::Write`, use
41    /// `SetBuilder`.
42    pub fn from_iter<T, I>(iter: I) -> Result<Set<Vec<u8>>>
43    where
44        T: AsRef<[u8]>,
45        I: IntoIterator<Item = T>,
46    {
47        let mut builder = SetBuilder::memory();
48        builder.extend_iter(iter)?;
49        Set::new(builder.into_inner()?)
50    }
51}
52
53impl<D: AsRef<[u8]>> Set<D> {
54    /// Creates a set from its representation as a raw byte sequence.
55    ///
56    /// This accepts anything that can be cheaply converted to a `&[u8]`. The
57    /// caller is responsible for guaranteeing that the given bytes refer to
58    /// a valid FST. While memory safety will not be violated by invalid input,
59    /// a panic could occur while reading the FST at any point.
60    ///
61    /// # Example
62    ///
63    /// ```no_run
64    /// use fst::Set;
65    ///
66    /// // File written from a build script using SetBuilder.
67    /// # const IGNORE: &str = stringify! {
68    /// static FST: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/set.fst"));
69    /// # };
70    /// # static FST: &[u8] = &[];
71    ///
72    /// let set = Set::new(FST).unwrap();
73    /// ```
74    pub fn new(data: D) -> Result<Set<D>> {
75        raw::Fst::new(data).map(Set)
76    }
77
78    /// Tests the membership of a single key.
79    ///
80    /// # Example
81    ///
82    /// ```rust
83    /// use fst::Set;
84    ///
85    /// let set = Set::from_iter(&["a", "b", "c"]).unwrap();
86    ///
87    /// assert_eq!(set.contains("b"), true);
88    /// assert_eq!(set.contains("z"), false);
89    /// ```
90    pub fn contains<K: AsRef<[u8]>>(&self, key: K) -> bool {
91        self.0.contains_key(key)
92    }
93
94    /// Return a lexicographically ordered stream of all keys in this set.
95    ///
96    /// While this is a stream, it does require heap space proportional to the
97    /// longest key in the set.
98    ///
99    /// If the set is memory mapped, then no further heap space is needed.
100    /// Note though that your operating system may fill your page cache
101    /// (which will cause the resident memory usage of the process to go up
102    /// correspondingly).
103    ///
104    /// # Example
105    ///
106    /// Since streams are not iterators, the traditional `for` loop cannot be
107    /// used. `while let` is useful instead:
108    ///
109    /// ```rust
110    /// use fst::{IntoStreamer, Streamer, Set};
111    ///
112    /// let set = Set::from_iter(&["a", "b", "c"]).unwrap();
113    /// let mut stream = set.stream();
114    ///
115    /// let mut keys = vec![];
116    /// while let Some(key) = stream.next() {
117    ///     keys.push(key.to_vec());
118    /// }
119    /// assert_eq!(keys, vec![b"a", b"b", b"c"]);
120    /// ```
121    #[inline]
122    pub fn stream(&self) -> Stream<'_> {
123        Stream(self.0.stream())
124    }
125
126    /// Return a builder for range queries.
127    ///
128    /// A range query returns a subset of keys in this set in a range given in
129    /// lexicographic order.
130    ///
131    /// Memory requirements are the same as described on `Set::stream`.
132    /// Notably, only the keys in the range are read; keys outside the range
133    /// are not.
134    ///
135    /// # Example
136    ///
137    /// Returns only the keys in the range given.
138    ///
139    /// ```rust
140    /// use fst::{IntoStreamer, Streamer, Set};
141    ///
142    /// let set = Set::from_iter(&["a", "b", "c", "d", "e"]).unwrap();
143    /// let mut stream = set.range().ge("b").lt("e").into_stream();
144    ///
145    /// let mut keys = vec![];
146    /// while let Some(key) = stream.next() {
147    ///     keys.push(key.to_vec());
148    /// }
149    /// assert_eq!(keys, vec![b"b", b"c", b"d"]);
150    /// ```
151    #[inline]
152    pub fn range(&self) -> StreamBuilder<'_> {
153        StreamBuilder(self.0.range())
154    }
155
156    /// Executes an automaton on the keys of this set.
157    ///
158    /// Note that this returns a `StreamBuilder`, which can be used to
159    /// add a range query to the search (see the `range` method).
160    ///
161    /// Memory requirements are the same as described on `Set::stream`.
162    ///
163    /// # Example
164    ///
165    /// An implementation of subsequence search for `Automaton` can be used
166    /// to search sets:
167    ///
168    /// ```rust
169    /// use fst::automaton::Subsequence;
170    /// use fst::{IntoStreamer, Streamer, Set};
171    ///
172    /// # fn main() { example().unwrap(); }
173    /// fn example() -> Result<(), Box<dyn std::error::Error>> {
174    ///     let set = Set::from_iter(&[
175    ///         "a foo bar", "foo", "foo1", "foo2", "foo3", "foobar",
176    ///     ]).unwrap();
177    ///
178    ///     let matcher = Subsequence::new("for");
179    ///     let mut stream = set.search(&matcher).into_stream();
180    ///
181    ///     let mut keys = vec![];
182    ///     while let Some(key) = stream.next() {
183    ///         keys.push(String::from_utf8(key.to_vec())?);
184    ///     }
185    ///     assert_eq!(keys, vec![
186    ///         "a foo bar", "foobar",
187    ///     ]);
188    ///
189    ///     Ok(())
190    /// }
191    /// ```
192    pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<'_, A> {
193        StreamBuilder(self.0.search(aut))
194    }
195
196    /// Executes an automaton on the values of this set and yields matching
197    /// values along with the corresponding matching states in the given
198    /// automaton.
199    ///
200    /// Note that this returns a `StreamWithStateBuilder`, which can be used to
201    /// add a range query to the search (see the `range` method).
202    ///
203    /// Memory requirements are the same as described on `Map::stream`.
204    ///
205    #[cfg_attr(
206        feature = "levenshtein",
207        doc = r##"
208# Example
209
210An implementation of fuzzy search using Levenshtein automata can be used
211to search sets:
212
213```rust
214use fst::automaton::Levenshtein;
215use fst::{IntoStreamer, Streamer, Set};
216
217# fn main() { example().unwrap(); }
218fn example() -> Result<(), Box<dyn std::error::Error>> {
219    let set = Set::from_iter(vec![
220        "foo",
221        "foob",
222        "foobar",
223        "fozb",
224    ]).unwrap();
225
226    let query = Levenshtein::new("foo", 2)?;
227    let mut stream = set.search_with_state(&query).into_stream();
228
229    let mut vs = vec![];
230    while let Some((v, s)) = stream.next() {
231        vs.push((String::from_utf8(v.to_vec())?, s));
232    }
233    // Currently, there isn't much interesting that you can do with the states.
234    assert_eq!(vs, vec![
235        ("foo".to_string(), Some(183)),
236        ("foob".to_string(), Some(123)),
237        ("fozb".to_string(), Some(83)),
238    ]);
239
240    Ok(())
241}
242```
243"##
244    )]
245    pub fn search_with_state<A: Automaton>(
246        &self,
247        aut: A,
248    ) -> StreamWithStateBuilder<'_, A> {
249        StreamWithStateBuilder(self.0.search_with_state(aut))
250    }
251
252    /// Returns the number of elements in this set.
253    #[inline]
254    pub fn len(&self) -> usize {
255        self.0.len()
256    }
257
258    /// Returns true if and only if this set is empty.
259    #[inline]
260    pub fn is_empty(&self) -> bool {
261        self.0.is_empty()
262    }
263
264    /// Creates a new set operation with this set added to it.
265    ///
266    /// The `OpBuilder` type can be used to add additional set streams
267    /// and perform set operations like union, intersection, difference and
268    /// symmetric difference.
269    ///
270    /// # Example
271    ///
272    /// ```rust
273    /// use fst::{IntoStreamer, Streamer, Set};
274    ///
275    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
276    /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
277    ///
278    /// let mut union = set1.op().add(&set2).union();
279    ///
280    /// let mut keys = vec![];
281    /// while let Some(key) = union.next() {
282    ///     keys.push(key.to_vec());
283    /// }
284    /// assert_eq!(keys, vec![b"a", b"b", b"c", b"y", b"z"]);
285    /// ```
286    #[inline]
287    pub fn op(&self) -> OpBuilder<'_> {
288        OpBuilder::new().add(self)
289    }
290
291    /// Returns true if and only if the `self` set is disjoint with the set
292    /// `stream`.
293    ///
294    /// `stream` must be a lexicographically ordered sequence of byte strings.
295    ///
296    /// # Example
297    ///
298    /// ```rust
299    /// use fst::{IntoStreamer, Streamer, Set};
300    ///
301    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
302    /// let set2 = Set::from_iter(&["x", "y", "z"]).unwrap();
303    ///
304    /// assert_eq!(set1.is_disjoint(&set2), true);
305    ///
306    /// let set3 = Set::from_iter(&["a", "c"]).unwrap();
307    ///
308    /// assert_eq!(set1.is_disjoint(&set3), false);
309    /// ```
310    pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool
311    where
312        I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
313        S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
314    {
315        self.0.is_disjoint(StreamZeroOutput(stream.into_stream()))
316    }
317
318    /// Returns true if and only if the `self` set is a subset of `stream`.
319    ///
320    /// `stream` must be a lexicographically ordered sequence of byte strings.
321    ///
322    /// # Example
323    ///
324    /// ```rust
325    /// use fst::Set;
326    ///
327    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
328    /// let set2 = Set::from_iter(&["x", "y", "z"]).unwrap();
329    ///
330    /// assert_eq!(set1.is_subset(&set2), false);
331    ///
332    /// let set3 = Set::from_iter(&["a", "c"]).unwrap();
333    ///
334    /// assert_eq!(set1.is_subset(&set3), false);
335    /// assert_eq!(set3.is_subset(&set1), true);
336    /// ```
337    pub fn is_subset<'f, I, S>(&self, stream: I) -> bool
338    where
339        I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
340        S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
341    {
342        self.0.is_subset(StreamZeroOutput(stream.into_stream()))
343    }
344
345    /// Returns true if and only if the `self` set is a superset of `stream`.
346    ///
347    /// `stream` must be a lexicographically ordered sequence of byte strings.
348    ///
349    /// # Example
350    ///
351    /// ```rust
352    /// use fst::Set;
353    ///
354    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
355    /// let set2 = Set::from_iter(&["x", "y", "z"]).unwrap();
356    ///
357    /// assert_eq!(set1.is_superset(&set2), false);
358    ///
359    /// let set3 = Set::from_iter(&["a", "c"]).unwrap();
360    ///
361    /// assert_eq!(set1.is_superset(&set3), true);
362    /// assert_eq!(set3.is_superset(&set1), false);
363    /// ```
364    pub fn is_superset<'f, I, S>(&self, stream: I) -> bool
365    where
366        I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
367        S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
368    {
369        self.0.is_superset(StreamZeroOutput(stream.into_stream()))
370    }
371
372    /// Returns a reference to the underlying raw finite state transducer.
373    #[inline]
374    pub fn as_fst(&self) -> &raw::Fst<D> {
375        &self.0
376    }
377
378    /// Returns the underlying raw finite state transducer.
379    #[inline]
380    pub fn into_fst(self) -> raw::Fst<D> {
381        self.0
382    }
383
384    /// Maps the underlying data of the fst Set to another data type.
385    ///
386    /// # Example
387    ///
388    /// This example shows that you can map an fst Set based on a `Vec<u8>`
389    /// into an fst Set based on a `Cow<[u8]>`, it can also work with a
390    /// reference counted type (e.g. `Arc`, `Rc`).
391    ///
392    /// ```
393    /// use std::borrow::Cow;
394    ///
395    /// use fst::Set;
396    ///
397    /// let set: Set<Vec<u8>> = Set::from_iter(
398    ///     &["hello", "world"],
399    /// ).unwrap();
400    ///
401    /// let set_on_cow: Set<Cow<[u8]>> = set.map_data(Cow::Owned).unwrap();
402    /// ```
403    #[inline]
404    pub fn map_data<F, T>(self, f: F) -> Result<Set<T>>
405    where
406        F: FnMut(D) -> T,
407        T: AsRef<[u8]>,
408    {
409        self.into_fst().map_data(f).map(Set::from)
410    }
411}
412
413impl Default for Set<Vec<u8>> {
414    #[inline]
415    fn default() -> Set<Vec<u8>> {
416        Set::from_iter(iter::empty::<&[u8]>()).unwrap()
417    }
418}
419
420impl<D: AsRef<[u8]>> fmt::Debug for Set<D> {
421    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
422        write!(f, "Set([")?;
423        let mut stream = self.stream();
424        let mut first = true;
425        while let Some(key) = stream.next() {
426            if !first {
427                write!(f, ", ")?;
428            }
429            first = false;
430            write!(f, "{}", String::from_utf8_lossy(key))?;
431        }
432        write!(f, "])")
433    }
434}
435
436/// Returns the underlying finite state transducer.
437impl<D: AsRef<[u8]>> AsRef<raw::Fst<D>> for Set<D> {
438    #[inline]
439    fn as_ref(&self) -> &raw::Fst<D> {
440        &self.0
441    }
442}
443
444impl<'s, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'s Set<D> {
445    type Item = &'a [u8];
446    type Into = Stream<'s>;
447
448    #[inline]
449    fn into_stream(self) -> Stream<'s> {
450        Stream(self.0.stream())
451    }
452}
453
454// Construct a set from an Fst object.
455impl<D: AsRef<[u8]>> From<raw::Fst<D>> for Set<D> {
456    #[inline]
457    fn from(fst: raw::Fst<D>) -> Set<D> {
458        Set(fst)
459    }
460}
461
462/// A builder for creating a set.
463///
464/// This is not your average everyday builder. It has two important qualities
465/// that make it a bit unique from what you might expect:
466///
467/// 1. All keys must be added in lexicographic order. Adding a key out of order
468///    will result in an error.
469/// 2. The representation of a set is streamed to *any* `io::Write` as it is
470///    built. For an in memory representation, this can be a `Vec<u8>`.
471///
472/// Point (2) is especially important because it means that a set can be
473/// constructed *without storing the entire set in memory*. Namely, since it
474/// works with any `io::Write`, it can be streamed directly to a file.
475///
476/// With that said, the builder does use memory, but **memory usage is bounded
477/// to a constant size**. The amount of memory used trades off with the
478/// compression ratio. Currently, the implementation hard codes this trade off
479/// which can result in about 5-20MB of heap usage during construction. (N.B.
480/// Guaranteeing a maximal compression ratio requires memory proportional to
481/// the size of the set, which defeats the benefit of streaming it to disk.
482/// In practice, a small bounded amount of memory achieves close-to-minimal
483/// compression ratios.)
484///
485/// The algorithmic complexity of set construction is `O(n)` where `n` is the
486/// number of elements added to the set.
487///
488/// # Example: build in memory
489///
490/// This shows how to use the builder to construct a set in memory. Note that
491/// `Set::from_iter` provides a convenience function that achieves this same
492/// goal without needing to explicitly use `SetBuilder`.
493///
494/// ```rust
495/// use fst::{IntoStreamer, Streamer, Set, SetBuilder};
496///
497/// let mut build = SetBuilder::memory();
498/// build.insert("bruce").unwrap();
499/// build.insert("clarence").unwrap();
500/// build.insert("stevie").unwrap();
501///
502/// // You could also call `finish()` here, but since we're building the set in
503/// // memory, there would be no way to get the `Vec<u8>` back.
504/// let bytes = build.into_inner().unwrap();
505///
506/// // At this point, the set has been constructed, but here's how to read it.
507/// let set = Set::new(bytes).unwrap();
508/// let mut stream = set.into_stream();
509/// let mut keys = vec![];
510/// while let Some(key) = stream.next() {
511///     keys.push(key.to_vec());
512/// }
513/// assert_eq!(keys, vec![
514///     "bruce".as_bytes(), "clarence".as_bytes(), "stevie".as_bytes(),
515/// ]);
516/// ```
517///
518/// # Example: stream to file
519///
520/// This shows how to stream construction of a set to a file.
521///
522/// ```rust,no_run
523/// use std::fs::File;
524/// use std::io;
525///
526/// use fst::{IntoStreamer, Streamer, Set, SetBuilder};
527///
528/// let mut wtr = io::BufWriter::new(File::create("set.fst").unwrap());
529/// let mut build = SetBuilder::new(wtr).unwrap();
530/// build.insert("bruce").unwrap();
531/// build.insert("clarence").unwrap();
532/// build.insert("stevie").unwrap();
533///
534/// // If you want the writer back, then call `into_inner`. Otherwise, this
535/// // will finish construction and call `flush`.
536/// build.finish().unwrap();
537///
538/// // At this point, the set has been constructed, but here's how to read it.
539/// // NOTE: Normally, one would memory map a file instead of reading its
540/// // entire contents on to the heap.
541/// let set = Set::new(std::fs::read("set.fst").unwrap()).unwrap();
542/// let mut stream = set.into_stream();
543/// let mut keys = vec![];
544/// while let Some(key) = stream.next() {
545///     keys.push(key.to_vec());
546/// }
547/// assert_eq!(keys, vec![
548///     "bruce".as_bytes(), "clarence".as_bytes(), "stevie".as_bytes(),
549/// ]);
550/// ```
551pub struct SetBuilder<W>(raw::Builder<W>);
552
553impl SetBuilder<Vec<u8>> {
554    /// Create a builder that builds a set in memory.
555    #[inline]
556    pub fn memory() -> SetBuilder<Vec<u8>> {
557        SetBuilder(raw::Builder::memory())
558    }
559
560    /// Finishes the construction of the set and returns it.
561    #[inline]
562    pub fn into_set(self) -> Set<Vec<u8>> {
563        Set(self.0.into_fst())
564    }
565}
566
567impl<W: io::Write> SetBuilder<W> {
568    /// Create a builder that builds a set by writing it to `wtr` in a
569    /// streaming fashion.
570    pub fn new(wtr: W) -> Result<SetBuilder<W>> {
571        raw::Builder::new_type(wtr, 0).map(SetBuilder)
572    }
573
574    /// Insert a new key into the set.
575    ///
576    /// If a key is inserted that is less than any previous key added, then
577    /// an error is returned. Similarly, if there was a problem writing to
578    /// the underlying writer, an error is returned.
579    pub fn insert<K: AsRef<[u8]>>(&mut self, key: K) -> Result<()> {
580        self.0.add(key)
581    }
582
583    /// Calls insert on each item in the iterator.
584    ///
585    /// If an error occurred while adding an element, processing is stopped
586    /// and the error is returned.
587    pub fn extend_iter<T, I>(&mut self, iter: I) -> Result<()>
588    where
589        T: AsRef<[u8]>,
590        I: IntoIterator<Item = T>,
591    {
592        for key in iter {
593            self.0.add(key)?;
594        }
595        Ok(())
596    }
597
598    /// Calls insert on each item in the stream.
599    ///
600    /// Note that unlike `extend_iter`, this is not generic on the items in
601    /// the stream.
602    pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
603    where
604        I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
605        S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
606    {
607        self.0.extend_stream(StreamZeroOutput(stream.into_stream()))
608    }
609
610    /// Finishes the construction of the set and flushes the underlying
611    /// writer. After completion, the data written to `W` may be read using
612    /// one of `Set`'s constructor methods.
613    pub fn finish(self) -> Result<()> {
614        self.0.finish()
615    }
616
617    /// Just like `finish`, except it returns the underlying writer after
618    /// flushing it.
619    pub fn into_inner(self) -> Result<W> {
620        self.0.into_inner()
621    }
622
623    /// Gets a reference to the underlying writer.
624    pub fn get_ref(&self) -> &W {
625        self.0.get_ref()
626    }
627
628    /// Returns the number of bytes written to the underlying writer
629    pub fn bytes_written(&self) -> u64 {
630        self.0.bytes_written()
631    }
632}
633
634/// A lexicographically ordered stream of keys from a set.
635///
636/// The `A` type parameter corresponds to an optional automaton to filter
637/// the stream. By default, no filtering is done.
638///
639/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
640pub struct Stream<'s, A = AlwaysMatch>(raw::Stream<'s, A>)
641where
642    A: Automaton;
643
644impl<'s, A: Automaton> Stream<'s, A> {
645    /// Convert this stream into a vector of Unicode strings.
646    ///
647    /// If any key is not valid UTF-8, then iteration on the stream is stopped
648    /// and a UTF-8 decoding error is returned.
649    ///
650    /// Note that this creates a new allocation for every key in the stream.
651    pub fn into_strs(self) -> Result<Vec<String>> {
652        self.0.into_str_keys()
653    }
654
655    /// Convert this stream into a vector of byte strings.
656    ///
657    /// Note that this creates a new allocation for every key in the stream.
658    pub fn into_bytes(self) -> Vec<Vec<u8>> {
659        self.0.into_byte_keys()
660    }
661}
662
663impl<'a, 's, A: Automaton> Streamer<'a> for Stream<'s, A> {
664    type Item = &'a [u8];
665
666    fn next(&'a mut self) -> Option<&'a [u8]> {
667        self.0.next().map(|(key, _)| key)
668    }
669}
670
671/// A lexicographically ordered stream of key-state pairs from a set and
672/// an automaton.
673///
674/// The keys are from the set while the states are from the automaton.
675///
676/// The `A` type parameter corresponds to an optional automaton to filter
677/// the stream. By default, no filtering is done.
678///
679/// The `'m` lifetime parameter refers to the lifetime of the underlying set.
680pub struct StreamWithState<'m, A = AlwaysMatch>(raw::StreamWithState<'m, A>)
681where
682    A: Automaton;
683
684impl<'a, 'm, A: 'a + Automaton> Streamer<'a> for StreamWithState<'m, A>
685where
686    A::State: Clone,
687{
688    type Item = (&'a [u8], A::State);
689
690    fn next(&'a mut self) -> Option<(&'a [u8], A::State)> {
691        self.0.next().map(|(key, _, state)| (key, state))
692    }
693}
694
695/// A builder for constructing range queries on streams.
696///
697/// Once all bounds are set, one should call `into_stream` to get a
698/// `Stream`.
699///
700/// Bounds are not additive. That is, if `ge` is called twice on the same
701/// builder, then the second setting wins.
702///
703/// The `A` type parameter corresponds to an optional automaton to filter
704/// the stream. By default, no filtering is done.
705///
706/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
707pub struct StreamBuilder<'s, A = AlwaysMatch>(raw::StreamBuilder<'s, A>);
708
709impl<'s, A: Automaton> StreamBuilder<'s, A> {
710    /// Specify a greater-than-or-equal-to bound.
711    pub fn ge<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
712        StreamBuilder(self.0.ge(bound))
713    }
714
715    /// Specify a greater-than bound.
716    pub fn gt<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
717        StreamBuilder(self.0.gt(bound))
718    }
719
720    /// Specify a less-than-or-equal-to bound.
721    pub fn le<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
722        StreamBuilder(self.0.le(bound))
723    }
724
725    /// Specify a less-than bound.
726    pub fn lt<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
727        StreamBuilder(self.0.lt(bound))
728    }
729}
730
731impl<'s, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'s, A> {
732    type Item = &'a [u8];
733    type Into = Stream<'s, A>;
734
735    fn into_stream(self) -> Stream<'s, A> {
736        Stream(self.0.into_stream())
737    }
738}
739
740/// A builder for constructing range queries on streams that include automaton
741/// states.
742///
743/// In general, one should use `StreamBuilder` unless you have a specific need
744/// for accessing the states of the underlying automaton that is being used to
745/// filter this stream.
746///
747/// Once all bounds are set, one should call `into_stream` to get a
748/// `Stream`.
749///
750/// Bounds are not additive. That is, if `ge` is called twice on the same
751/// builder, then the second setting wins.
752///
753/// The `A` type parameter corresponds to an optional automaton to filter
754/// the stream. By default, no filtering is done.
755///
756/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
757pub struct StreamWithStateBuilder<'s, A = AlwaysMatch>(
758    raw::StreamWithStateBuilder<'s, A>,
759);
760
761impl<'s, A: Automaton> StreamWithStateBuilder<'s, A> {
762    /// Specify a greater-than-or-equal-to bound.
763    pub fn ge<T: AsRef<[u8]>>(
764        self,
765        bound: T,
766    ) -> StreamWithStateBuilder<'s, A> {
767        StreamWithStateBuilder(self.0.ge(bound))
768    }
769
770    /// Specify a greater-than bound.
771    pub fn gt<T: AsRef<[u8]>>(
772        self,
773        bound: T,
774    ) -> StreamWithStateBuilder<'s, A> {
775        StreamWithStateBuilder(self.0.gt(bound))
776    }
777
778    /// Specify a less-than-or-equal-to bound.
779    pub fn le<T: AsRef<[u8]>>(
780        self,
781        bound: T,
782    ) -> StreamWithStateBuilder<'s, A> {
783        StreamWithStateBuilder(self.0.le(bound))
784    }
785
786    /// Specify a less-than bound.
787    pub fn lt<T: AsRef<[u8]>>(
788        self,
789        bound: T,
790    ) -> StreamWithStateBuilder<'s, A> {
791        StreamWithStateBuilder(self.0.lt(bound))
792    }
793}
794
795impl<'s, 'a, A: 'a + Automaton> IntoStreamer<'a>
796    for StreamWithStateBuilder<'s, A>
797where
798    A::State: Clone,
799{
800    type Item = (&'a [u8], A::State);
801    type Into = StreamWithState<'s, A>;
802
803    fn into_stream(self) -> StreamWithState<'s, A> {
804        StreamWithState(self.0.into_stream())
805    }
806}
807
808/// A builder for collecting set streams on which to perform set operations.
809///
810/// Set operations include intersection, union, difference and symmetric
811/// difference. The result of each set operation is itself a stream that emits
812/// keys in lexicographic order.
813///
814/// All set operations work efficiently on an arbitrary number of
815/// streams with memory proportional to the number of streams.
816///
817/// The algorithmic complexity of all set operations is `O(n1 + n2 + n3 + ...)`
818/// where `n1, n2, n3, ...` correspond to the number of elements in each
819/// stream.
820///
821/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
822pub struct OpBuilder<'s>(raw::OpBuilder<'s>);
823
824impl<'s> OpBuilder<'s> {
825    /// Create a new set operation builder.
826    #[inline]
827    pub fn new() -> OpBuilder<'s> {
828        OpBuilder(raw::OpBuilder::new())
829    }
830
831    /// Add a stream to this set operation.
832    ///
833    /// This is useful for a chaining style pattern, e.g.,
834    /// `builder.add(stream1).add(stream2).union()`.
835    ///
836    /// The stream must emit a lexicographically ordered sequence of byte
837    /// strings.
838    pub fn add<I, S>(mut self, streamable: I) -> OpBuilder<'s>
839    where
840        I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
841        S: 's + for<'a> Streamer<'a, Item = &'a [u8]>,
842    {
843        self.push(streamable);
844        self
845    }
846
847    /// Add a stream to this set operation.
848    ///
849    /// The stream must emit a lexicographically ordered sequence of byte
850    /// strings.
851    pub fn push<I, S>(&mut self, streamable: I)
852    where
853        I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
854        S: 's + for<'a> Streamer<'a, Item = &'a [u8]>,
855    {
856        self.0.push(StreamZeroOutput(streamable.into_stream()));
857    }
858
859    /// Performs a union operation on all streams that have been added.
860    ///
861    /// # Example
862    ///
863    /// ```rust
864    /// use fst::{IntoStreamer, Streamer, Set};
865    ///
866    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
867    /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
868    ///
869    /// let mut union = set1.op().add(&set2).union();
870    ///
871    /// let mut keys = vec![];
872    /// while let Some(key) = union.next() {
873    ///     keys.push(key.to_vec());
874    /// }
875    /// assert_eq!(keys, vec![b"a", b"b", b"c", b"y", b"z"]);
876    /// ```
877    #[inline]
878    pub fn union(self) -> Union<'s> {
879        Union(self.0.union())
880    }
881
882    /// Performs an intersection operation on all streams that have been added.
883    ///
884    /// # Example
885    ///
886    /// ```rust
887    /// use fst::{IntoStreamer, Streamer, Set};
888    ///
889    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
890    /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
891    ///
892    /// let mut intersection = set1.op().add(&set2).intersection();
893    ///
894    /// let mut keys = vec![];
895    /// while let Some(key) = intersection.next() {
896    ///     keys.push(key.to_vec());
897    /// }
898    /// assert_eq!(keys, vec![b"a"]);
899    /// ```
900    #[inline]
901    pub fn intersection(self) -> Intersection<'s> {
902        Intersection(self.0.intersection())
903    }
904
905    /// Performs a difference operation with respect to the first stream added.
906    /// That is, this returns a stream of all elements in the first stream
907    /// that don't exist in any other stream that has been added.
908    ///
909    /// # Example
910    ///
911    /// ```rust
912    /// use fst::{IntoStreamer, Streamer, Set};
913    ///
914    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
915    /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
916    ///
917    /// let mut difference = set1.op().add(&set2).difference();
918    ///
919    /// let mut keys = vec![];
920    /// while let Some(key) = difference.next() {
921    ///     keys.push(key.to_vec());
922    /// }
923    /// assert_eq!(keys, vec![b"b", b"c"]);
924    /// ```
925    #[inline]
926    pub fn difference(self) -> Difference<'s> {
927        Difference(self.0.difference())
928    }
929
930    /// Performs a symmetric difference operation on all of the streams that
931    /// have been added.
932    ///
933    /// When there are only two streams, then the keys returned correspond to
934    /// keys that are in either stream but *not* in both streams.
935    ///
936    /// More generally, for any number of streams, keys that occur in an odd
937    /// number of streams are returned.
938    ///
939    /// # Example
940    ///
941    /// ```rust
942    /// use fst::{IntoStreamer, Streamer, Set};
943    ///
944    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
945    /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
946    ///
947    /// let mut sym_difference = set1.op().add(&set2).symmetric_difference();
948    ///
949    /// let mut keys = vec![];
950    /// while let Some(key) = sym_difference.next() {
951    ///     keys.push(key.to_vec());
952    /// }
953    /// assert_eq!(keys, vec![b"b", b"c", b"y", b"z"]);
954    /// ```
955    #[inline]
956    pub fn symmetric_difference(self) -> SymmetricDifference<'s> {
957        SymmetricDifference(self.0.symmetric_difference())
958    }
959}
960
961impl<'f, I, S> Extend<I> for OpBuilder<'f>
962where
963    I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
964    S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
965{
966    fn extend<T>(&mut self, it: T)
967    where
968        T: IntoIterator<Item = I>,
969    {
970        for stream in it {
971            self.push(stream);
972        }
973    }
974}
975
976impl<'f, I, S> FromIterator<I> for OpBuilder<'f>
977where
978    I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
979    S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
980{
981    fn from_iter<T>(it: T) -> OpBuilder<'f>
982    where
983        T: IntoIterator<Item = I>,
984    {
985        let mut op = OpBuilder::new();
986        op.extend(it);
987        op
988    }
989}
990
991/// A stream of set union over multiple streams in lexicographic order.
992///
993/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
994pub struct Union<'s>(raw::Union<'s>);
995
996impl<'a, 's> Streamer<'a> for Union<'s> {
997    type Item = &'a [u8];
998
999    #[inline]
1000    fn next(&'a mut self) -> Option<&'a [u8]> {
1001        self.0.next().map(|(key, _)| key)
1002    }
1003}
1004
1005/// A stream of set intersection over multiple streams in lexicographic order.
1006///
1007/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
1008pub struct Intersection<'s>(raw::Intersection<'s>);
1009
1010impl<'a, 's> Streamer<'a> for Intersection<'s> {
1011    type Item = &'a [u8];
1012
1013    #[inline]
1014    fn next(&'a mut self) -> Option<&'a [u8]> {
1015        self.0.next().map(|(key, _)| key)
1016    }
1017}
1018
1019/// A stream of set difference over multiple streams in lexicographic order.
1020///
1021/// The difference operation is taken with respect to the first stream and the
1022/// rest of the streams. i.e., All elements in the first stream that do not
1023/// appear in any other streams.
1024///
1025/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
1026pub struct Difference<'s>(raw::Difference<'s>);
1027
1028impl<'a, 's> Streamer<'a> for Difference<'s> {
1029    type Item = &'a [u8];
1030
1031    #[inline]
1032    fn next(&'a mut self) -> Option<&'a [u8]> {
1033        self.0.next().map(|(key, _)| key)
1034    }
1035}
1036
1037/// A stream of set symmetric difference over multiple streams in lexicographic
1038/// order.
1039///
1040/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
1041pub struct SymmetricDifference<'s>(raw::SymmetricDifference<'s>);
1042
1043impl<'a, 's> Streamer<'a> for SymmetricDifference<'s> {
1044    type Item = &'a [u8];
1045
1046    #[inline]
1047    fn next(&'a mut self) -> Option<&'a [u8]> {
1048        self.0.next().map(|(key, _)| key)
1049    }
1050}
1051
1052/// A specialized stream for mapping set streams (`&[u8]`) to streams used
1053/// by raw fsts (`(&[u8], Output)`).
1054///
1055/// If this were iterators, we could use `iter::Map`, but doing this on streams
1056/// requires HKT, so we need to write out the monomorphization ourselves.
1057struct StreamZeroOutput<S>(S);
1058
1059impl<'a, S: Streamer<'a>> Streamer<'a> for StreamZeroOutput<S> {
1060    type Item = (S::Item, raw::Output);
1061
1062    fn next(&'a mut self) -> Option<(S::Item, raw::Output)> {
1063        self.0.next().map(|key| (key, raw::Output::zero()))
1064    }
1065}
1066
1067#[cfg(test)]
1068mod tests {
1069    use super::OpBuilder;
1070    use crate::Streamer;
1071
1072    #[test]
1073    fn no_fsts() {
1074        struct Iter<'a> {
1075            i: usize,
1076            xs: Vec<&'a [u8]>,
1077        }
1078
1079        impl<'a> Iter<'a> {
1080            fn new(xs: Vec<&'a [u8]>) -> Iter<'a> {
1081                Iter { i: 0, xs }
1082            }
1083        }
1084
1085        impl<'a, 's> Streamer<'a> for Iter<'s> {
1086            type Item = &'a [u8];
1087            fn next(&'a mut self) -> Option<&'a [u8]> {
1088                if self.i >= self.xs.len() {
1089                    None
1090                } else {
1091                    let i = self.i;
1092                    self.i += 1;
1093                    Some(self.xs[i])
1094                }
1095            }
1096        }
1097
1098        let mut stream = OpBuilder::new()
1099            .add(Iter::new(vec![
1100                &b"bar"[..],
1101                &b"baz"[..],
1102                &b"foo"[..],
1103                &b"fubar"[..],
1104                &b"quux"[..],
1105            ]))
1106            .add(Iter::new(vec![&b"bar"[..], &b"foofoo"[..], &b"fubar"[..]]))
1107            .intersection();
1108
1109        let mut got = vec![];
1110        while let Some(x) = stream.next() {
1111            got.push(x.to_vec());
1112        }
1113        assert_eq!(got, vec![&b"bar"[..], &b"fubar"[..]]);
1114    }
1115}