fst/set.rs
1use std::fmt;
2use std::io;
3use std::iter::{self, FromIterator};
4
5use crate::automaton::{AlwaysMatch, Automaton};
6use crate::raw;
7use crate::stream::{IntoStreamer, Streamer};
8use crate::Result;
9
10/// Set is a lexicographically ordered set of byte strings.
11///
12/// A `Set` is constructed with the `SetBuilder` type. Alternatively, a `Set`
13/// can be constructed in memory from a lexicographically ordered iterator
14/// of byte strings (`Set::from_iter`).
15///
16/// A key feature of `Set` is that it can be serialized to disk compactly. Its
17/// underlying representation is built such that the `Set` can be memory mapped
18/// and searched without necessarily loading the entire set into memory.
19///
20/// It supports most common operations associated with sets, such as
21/// membership, union, intersection, subset/superset, etc. It also supports
22/// range queries and automata based searches (e.g. a regular expression).
23///
24/// Sets are represented by a finite state transducer where output values are
25/// always zero. As such, sets have the following invariants:
26///
27/// 1. Once constructed, a `Set` can never be modified.
28/// 2. Sets must be constructed with lexicographically ordered byte sequences.
29#[derive(Clone)]
30pub struct Set<D>(raw::Fst<D>);
31
32impl Set<Vec<u8>> {
33 /// Create a `Set` from an iterator of lexicographically ordered byte
34 /// strings.
35 ///
36 /// If the iterator does not yield values in lexicographic order, then an
37 /// error is returned.
38 ///
39 /// Note that this is a convenience function to build a set in memory.
40 /// To build a set that streams to an arbitrary `io::Write`, use
41 /// `SetBuilder`.
42 pub fn from_iter<T, I>(iter: I) -> Result<Set<Vec<u8>>>
43 where
44 T: AsRef<[u8]>,
45 I: IntoIterator<Item = T>,
46 {
47 let mut builder = SetBuilder::memory();
48 builder.extend_iter(iter)?;
49 Set::new(builder.into_inner()?)
50 }
51}
52
53impl<D: AsRef<[u8]>> Set<D> {
54 /// Creates a set from its representation as a raw byte sequence.
55 ///
56 /// This accepts anything that can be cheaply converted to a `&[u8]`. The
57 /// caller is responsible for guaranteeing that the given bytes refer to
58 /// a valid FST. While memory safety will not be violated by invalid input,
59 /// a panic could occur while reading the FST at any point.
60 ///
61 /// # Example
62 ///
63 /// ```no_run
64 /// use fst::Set;
65 ///
66 /// // File written from a build script using SetBuilder.
67 /// # const IGNORE: &str = stringify! {
68 /// static FST: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/set.fst"));
69 /// # };
70 /// # static FST: &[u8] = &[];
71 ///
72 /// let set = Set::new(FST).unwrap();
73 /// ```
74 pub fn new(data: D) -> Result<Set<D>> {
75 raw::Fst::new(data).map(Set)
76 }
77
78 /// Tests the membership of a single key.
79 ///
80 /// # Example
81 ///
82 /// ```rust
83 /// use fst::Set;
84 ///
85 /// let set = Set::from_iter(&["a", "b", "c"]).unwrap();
86 ///
87 /// assert_eq!(set.contains("b"), true);
88 /// assert_eq!(set.contains("z"), false);
89 /// ```
90 pub fn contains<K: AsRef<[u8]>>(&self, key: K) -> bool {
91 self.0.contains_key(key)
92 }
93
94 /// Return a lexicographically ordered stream of all keys in this set.
95 ///
96 /// While this is a stream, it does require heap space proportional to the
97 /// longest key in the set.
98 ///
99 /// If the set is memory mapped, then no further heap space is needed.
100 /// Note though that your operating system may fill your page cache
101 /// (which will cause the resident memory usage of the process to go up
102 /// correspondingly).
103 ///
104 /// # Example
105 ///
106 /// Since streams are not iterators, the traditional `for` loop cannot be
107 /// used. `while let` is useful instead:
108 ///
109 /// ```rust
110 /// use fst::{IntoStreamer, Streamer, Set};
111 ///
112 /// let set = Set::from_iter(&["a", "b", "c"]).unwrap();
113 /// let mut stream = set.stream();
114 ///
115 /// let mut keys = vec![];
116 /// while let Some(key) = stream.next() {
117 /// keys.push(key.to_vec());
118 /// }
119 /// assert_eq!(keys, vec![b"a", b"b", b"c"]);
120 /// ```
121 #[inline]
122 pub fn stream(&self) -> Stream<'_> {
123 Stream(self.0.stream())
124 }
125
126 /// Return a builder for range queries.
127 ///
128 /// A range query returns a subset of keys in this set in a range given in
129 /// lexicographic order.
130 ///
131 /// Memory requirements are the same as described on `Set::stream`.
132 /// Notably, only the keys in the range are read; keys outside the range
133 /// are not.
134 ///
135 /// # Example
136 ///
137 /// Returns only the keys in the range given.
138 ///
139 /// ```rust
140 /// use fst::{IntoStreamer, Streamer, Set};
141 ///
142 /// let set = Set::from_iter(&["a", "b", "c", "d", "e"]).unwrap();
143 /// let mut stream = set.range().ge("b").lt("e").into_stream();
144 ///
145 /// let mut keys = vec![];
146 /// while let Some(key) = stream.next() {
147 /// keys.push(key.to_vec());
148 /// }
149 /// assert_eq!(keys, vec![b"b", b"c", b"d"]);
150 /// ```
151 #[inline]
152 pub fn range(&self) -> StreamBuilder<'_> {
153 StreamBuilder(self.0.range())
154 }
155
156 /// Executes an automaton on the keys of this set.
157 ///
158 /// Note that this returns a `StreamBuilder`, which can be used to
159 /// add a range query to the search (see the `range` method).
160 ///
161 /// Memory requirements are the same as described on `Set::stream`.
162 ///
163 /// # Example
164 ///
165 /// An implementation of subsequence search for `Automaton` can be used
166 /// to search sets:
167 ///
168 /// ```rust
169 /// use fst::automaton::Subsequence;
170 /// use fst::{IntoStreamer, Streamer, Set};
171 ///
172 /// # fn main() { example().unwrap(); }
173 /// fn example() -> Result<(), Box<dyn std::error::Error>> {
174 /// let set = Set::from_iter(&[
175 /// "a foo bar", "foo", "foo1", "foo2", "foo3", "foobar",
176 /// ]).unwrap();
177 ///
178 /// let matcher = Subsequence::new("for");
179 /// let mut stream = set.search(&matcher).into_stream();
180 ///
181 /// let mut keys = vec![];
182 /// while let Some(key) = stream.next() {
183 /// keys.push(String::from_utf8(key.to_vec())?);
184 /// }
185 /// assert_eq!(keys, vec![
186 /// "a foo bar", "foobar",
187 /// ]);
188 ///
189 /// Ok(())
190 /// }
191 /// ```
192 pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<'_, A> {
193 StreamBuilder(self.0.search(aut))
194 }
195
196 /// Executes an automaton on the values of this set and yields matching
197 /// values along with the corresponding matching states in the given
198 /// automaton.
199 ///
200 /// Note that this returns a `StreamWithStateBuilder`, which can be used to
201 /// add a range query to the search (see the `range` method).
202 ///
203 /// Memory requirements are the same as described on `Map::stream`.
204 ///
205 #[cfg_attr(
206 feature = "levenshtein",
207 doc = r##"
208# Example
209
210An implementation of fuzzy search using Levenshtein automata can be used
211to search sets:
212
213```rust
214use fst::automaton::Levenshtein;
215use fst::{IntoStreamer, Streamer, Set};
216
217# fn main() { example().unwrap(); }
218fn example() -> Result<(), Box<dyn std::error::Error>> {
219 let set = Set::from_iter(vec![
220 "foo",
221 "foob",
222 "foobar",
223 "fozb",
224 ]).unwrap();
225
226 let query = Levenshtein::new("foo", 2)?;
227 let mut stream = set.search_with_state(&query).into_stream();
228
229 let mut vs = vec![];
230 while let Some((v, s)) = stream.next() {
231 vs.push((String::from_utf8(v.to_vec())?, s));
232 }
233 // Currently, there isn't much interesting that you can do with the states.
234 assert_eq!(vs, vec![
235 ("foo".to_string(), Some(183)),
236 ("foob".to_string(), Some(123)),
237 ("fozb".to_string(), Some(83)),
238 ]);
239
240 Ok(())
241}
242```
243"##
244 )]
245 pub fn search_with_state<A: Automaton>(
246 &self,
247 aut: A,
248 ) -> StreamWithStateBuilder<'_, A> {
249 StreamWithStateBuilder(self.0.search_with_state(aut))
250 }
251
252 /// Returns the number of elements in this set.
253 #[inline]
254 pub fn len(&self) -> usize {
255 self.0.len()
256 }
257
258 /// Returns true if and only if this set is empty.
259 #[inline]
260 pub fn is_empty(&self) -> bool {
261 self.0.is_empty()
262 }
263
264 /// Creates a new set operation with this set added to it.
265 ///
266 /// The `OpBuilder` type can be used to add additional set streams
267 /// and perform set operations like union, intersection, difference and
268 /// symmetric difference.
269 ///
270 /// # Example
271 ///
272 /// ```rust
273 /// use fst::{IntoStreamer, Streamer, Set};
274 ///
275 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
276 /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
277 ///
278 /// let mut union = set1.op().add(&set2).union();
279 ///
280 /// let mut keys = vec![];
281 /// while let Some(key) = union.next() {
282 /// keys.push(key.to_vec());
283 /// }
284 /// assert_eq!(keys, vec![b"a", b"b", b"c", b"y", b"z"]);
285 /// ```
286 #[inline]
287 pub fn op(&self) -> OpBuilder<'_> {
288 OpBuilder::new().add(self)
289 }
290
291 /// Returns true if and only if the `self` set is disjoint with the set
292 /// `stream`.
293 ///
294 /// `stream` must be a lexicographically ordered sequence of byte strings.
295 ///
296 /// # Example
297 ///
298 /// ```rust
299 /// use fst::{IntoStreamer, Streamer, Set};
300 ///
301 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
302 /// let set2 = Set::from_iter(&["x", "y", "z"]).unwrap();
303 ///
304 /// assert_eq!(set1.is_disjoint(&set2), true);
305 ///
306 /// let set3 = Set::from_iter(&["a", "c"]).unwrap();
307 ///
308 /// assert_eq!(set1.is_disjoint(&set3), false);
309 /// ```
310 pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool
311 where
312 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
313 S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
314 {
315 self.0.is_disjoint(StreamZeroOutput(stream.into_stream()))
316 }
317
318 /// Returns true if and only if the `self` set is a subset of `stream`.
319 ///
320 /// `stream` must be a lexicographically ordered sequence of byte strings.
321 ///
322 /// # Example
323 ///
324 /// ```rust
325 /// use fst::Set;
326 ///
327 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
328 /// let set2 = Set::from_iter(&["x", "y", "z"]).unwrap();
329 ///
330 /// assert_eq!(set1.is_subset(&set2), false);
331 ///
332 /// let set3 = Set::from_iter(&["a", "c"]).unwrap();
333 ///
334 /// assert_eq!(set1.is_subset(&set3), false);
335 /// assert_eq!(set3.is_subset(&set1), true);
336 /// ```
337 pub fn is_subset<'f, I, S>(&self, stream: I) -> bool
338 where
339 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
340 S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
341 {
342 self.0.is_subset(StreamZeroOutput(stream.into_stream()))
343 }
344
345 /// Returns true if and only if the `self` set is a superset of `stream`.
346 ///
347 /// `stream` must be a lexicographically ordered sequence of byte strings.
348 ///
349 /// # Example
350 ///
351 /// ```rust
352 /// use fst::Set;
353 ///
354 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
355 /// let set2 = Set::from_iter(&["x", "y", "z"]).unwrap();
356 ///
357 /// assert_eq!(set1.is_superset(&set2), false);
358 ///
359 /// let set3 = Set::from_iter(&["a", "c"]).unwrap();
360 ///
361 /// assert_eq!(set1.is_superset(&set3), true);
362 /// assert_eq!(set3.is_superset(&set1), false);
363 /// ```
364 pub fn is_superset<'f, I, S>(&self, stream: I) -> bool
365 where
366 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
367 S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
368 {
369 self.0.is_superset(StreamZeroOutput(stream.into_stream()))
370 }
371
372 /// Returns a reference to the underlying raw finite state transducer.
373 #[inline]
374 pub fn as_fst(&self) -> &raw::Fst<D> {
375 &self.0
376 }
377
378 /// Returns the underlying raw finite state transducer.
379 #[inline]
380 pub fn into_fst(self) -> raw::Fst<D> {
381 self.0
382 }
383
384 /// Maps the underlying data of the fst Set to another data type.
385 ///
386 /// # Example
387 ///
388 /// This example shows that you can map an fst Set based on a `Vec<u8>`
389 /// into an fst Set based on a `Cow<[u8]>`, it can also work with a
390 /// reference counted type (e.g. `Arc`, `Rc`).
391 ///
392 /// ```
393 /// use std::borrow::Cow;
394 ///
395 /// use fst::Set;
396 ///
397 /// let set: Set<Vec<u8>> = Set::from_iter(
398 /// &["hello", "world"],
399 /// ).unwrap();
400 ///
401 /// let set_on_cow: Set<Cow<[u8]>> = set.map_data(Cow::Owned).unwrap();
402 /// ```
403 #[inline]
404 pub fn map_data<F, T>(self, f: F) -> Result<Set<T>>
405 where
406 F: FnMut(D) -> T,
407 T: AsRef<[u8]>,
408 {
409 self.into_fst().map_data(f).map(Set::from)
410 }
411}
412
413impl Default for Set<Vec<u8>> {
414 #[inline]
415 fn default() -> Set<Vec<u8>> {
416 Set::from_iter(iter::empty::<&[u8]>()).unwrap()
417 }
418}
419
420impl<D: AsRef<[u8]>> fmt::Debug for Set<D> {
421 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
422 write!(f, "Set([")?;
423 let mut stream = self.stream();
424 let mut first = true;
425 while let Some(key) = stream.next() {
426 if !first {
427 write!(f, ", ")?;
428 }
429 first = false;
430 write!(f, "{}", String::from_utf8_lossy(key))?;
431 }
432 write!(f, "])")
433 }
434}
435
436/// Returns the underlying finite state transducer.
437impl<D: AsRef<[u8]>> AsRef<raw::Fst<D>> for Set<D> {
438 #[inline]
439 fn as_ref(&self) -> &raw::Fst<D> {
440 &self.0
441 }
442}
443
444impl<'s, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'s Set<D> {
445 type Item = &'a [u8];
446 type Into = Stream<'s>;
447
448 #[inline]
449 fn into_stream(self) -> Stream<'s> {
450 Stream(self.0.stream())
451 }
452}
453
454// Construct a set from an Fst object.
455impl<D: AsRef<[u8]>> From<raw::Fst<D>> for Set<D> {
456 #[inline]
457 fn from(fst: raw::Fst<D>) -> Set<D> {
458 Set(fst)
459 }
460}
461
462/// A builder for creating a set.
463///
464/// This is not your average everyday builder. It has two important qualities
465/// that make it a bit unique from what you might expect:
466///
467/// 1. All keys must be added in lexicographic order. Adding a key out of order
468/// will result in an error.
469/// 2. The representation of a set is streamed to *any* `io::Write` as it is
470/// built. For an in memory representation, this can be a `Vec<u8>`.
471///
472/// Point (2) is especially important because it means that a set can be
473/// constructed *without storing the entire set in memory*. Namely, since it
474/// works with any `io::Write`, it can be streamed directly to a file.
475///
476/// With that said, the builder does use memory, but **memory usage is bounded
477/// to a constant size**. The amount of memory used trades off with the
478/// compression ratio. Currently, the implementation hard codes this trade off
479/// which can result in about 5-20MB of heap usage during construction. (N.B.
480/// Guaranteeing a maximal compression ratio requires memory proportional to
481/// the size of the set, which defeats the benefit of streaming it to disk.
482/// In practice, a small bounded amount of memory achieves close-to-minimal
483/// compression ratios.)
484///
485/// The algorithmic complexity of set construction is `O(n)` where `n` is the
486/// number of elements added to the set.
487///
488/// # Example: build in memory
489///
490/// This shows how to use the builder to construct a set in memory. Note that
491/// `Set::from_iter` provides a convenience function that achieves this same
492/// goal without needing to explicitly use `SetBuilder`.
493///
494/// ```rust
495/// use fst::{IntoStreamer, Streamer, Set, SetBuilder};
496///
497/// let mut build = SetBuilder::memory();
498/// build.insert("bruce").unwrap();
499/// build.insert("clarence").unwrap();
500/// build.insert("stevie").unwrap();
501///
502/// // You could also call `finish()` here, but since we're building the set in
503/// // memory, there would be no way to get the `Vec<u8>` back.
504/// let bytes = build.into_inner().unwrap();
505///
506/// // At this point, the set has been constructed, but here's how to read it.
507/// let set = Set::new(bytes).unwrap();
508/// let mut stream = set.into_stream();
509/// let mut keys = vec![];
510/// while let Some(key) = stream.next() {
511/// keys.push(key.to_vec());
512/// }
513/// assert_eq!(keys, vec![
514/// "bruce".as_bytes(), "clarence".as_bytes(), "stevie".as_bytes(),
515/// ]);
516/// ```
517///
518/// # Example: stream to file
519///
520/// This shows how to stream construction of a set to a file.
521///
522/// ```rust,no_run
523/// use std::fs::File;
524/// use std::io;
525///
526/// use fst::{IntoStreamer, Streamer, Set, SetBuilder};
527///
528/// let mut wtr = io::BufWriter::new(File::create("set.fst").unwrap());
529/// let mut build = SetBuilder::new(wtr).unwrap();
530/// build.insert("bruce").unwrap();
531/// build.insert("clarence").unwrap();
532/// build.insert("stevie").unwrap();
533///
534/// // If you want the writer back, then call `into_inner`. Otherwise, this
535/// // will finish construction and call `flush`.
536/// build.finish().unwrap();
537///
538/// // At this point, the set has been constructed, but here's how to read it.
539/// // NOTE: Normally, one would memory map a file instead of reading its
540/// // entire contents on to the heap.
541/// let set = Set::new(std::fs::read("set.fst").unwrap()).unwrap();
542/// let mut stream = set.into_stream();
543/// let mut keys = vec![];
544/// while let Some(key) = stream.next() {
545/// keys.push(key.to_vec());
546/// }
547/// assert_eq!(keys, vec![
548/// "bruce".as_bytes(), "clarence".as_bytes(), "stevie".as_bytes(),
549/// ]);
550/// ```
551pub struct SetBuilder<W>(raw::Builder<W>);
552
553impl SetBuilder<Vec<u8>> {
554 /// Create a builder that builds a set in memory.
555 #[inline]
556 pub fn memory() -> SetBuilder<Vec<u8>> {
557 SetBuilder(raw::Builder::memory())
558 }
559
560 /// Finishes the construction of the set and returns it.
561 #[inline]
562 pub fn into_set(self) -> Set<Vec<u8>> {
563 Set(self.0.into_fst())
564 }
565}
566
567impl<W: io::Write> SetBuilder<W> {
568 /// Create a builder that builds a set by writing it to `wtr` in a
569 /// streaming fashion.
570 pub fn new(wtr: W) -> Result<SetBuilder<W>> {
571 raw::Builder::new_type(wtr, 0).map(SetBuilder)
572 }
573
574 /// Insert a new key into the set.
575 ///
576 /// If a key is inserted that is less than any previous key added, then
577 /// an error is returned. Similarly, if there was a problem writing to
578 /// the underlying writer, an error is returned.
579 pub fn insert<K: AsRef<[u8]>>(&mut self, key: K) -> Result<()> {
580 self.0.add(key)
581 }
582
583 /// Calls insert on each item in the iterator.
584 ///
585 /// If an error occurred while adding an element, processing is stopped
586 /// and the error is returned.
587 pub fn extend_iter<T, I>(&mut self, iter: I) -> Result<()>
588 where
589 T: AsRef<[u8]>,
590 I: IntoIterator<Item = T>,
591 {
592 for key in iter {
593 self.0.add(key)?;
594 }
595 Ok(())
596 }
597
598 /// Calls insert on each item in the stream.
599 ///
600 /// Note that unlike `extend_iter`, this is not generic on the items in
601 /// the stream.
602 pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
603 where
604 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
605 S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
606 {
607 self.0.extend_stream(StreamZeroOutput(stream.into_stream()))
608 }
609
610 /// Finishes the construction of the set and flushes the underlying
611 /// writer. After completion, the data written to `W` may be read using
612 /// one of `Set`'s constructor methods.
613 pub fn finish(self) -> Result<()> {
614 self.0.finish()
615 }
616
617 /// Just like `finish`, except it returns the underlying writer after
618 /// flushing it.
619 pub fn into_inner(self) -> Result<W> {
620 self.0.into_inner()
621 }
622
623 /// Gets a reference to the underlying writer.
624 pub fn get_ref(&self) -> &W {
625 self.0.get_ref()
626 }
627
628 /// Returns the number of bytes written to the underlying writer
629 pub fn bytes_written(&self) -> u64 {
630 self.0.bytes_written()
631 }
632}
633
634/// A lexicographically ordered stream of keys from a set.
635///
636/// The `A` type parameter corresponds to an optional automaton to filter
637/// the stream. By default, no filtering is done.
638///
639/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
640pub struct Stream<'s, A = AlwaysMatch>(raw::Stream<'s, A>)
641where
642 A: Automaton;
643
644impl<'s, A: Automaton> Stream<'s, A> {
645 /// Convert this stream into a vector of Unicode strings.
646 ///
647 /// If any key is not valid UTF-8, then iteration on the stream is stopped
648 /// and a UTF-8 decoding error is returned.
649 ///
650 /// Note that this creates a new allocation for every key in the stream.
651 pub fn into_strs(self) -> Result<Vec<String>> {
652 self.0.into_str_keys()
653 }
654
655 /// Convert this stream into a vector of byte strings.
656 ///
657 /// Note that this creates a new allocation for every key in the stream.
658 pub fn into_bytes(self) -> Vec<Vec<u8>> {
659 self.0.into_byte_keys()
660 }
661}
662
663impl<'a, 's, A: Automaton> Streamer<'a> for Stream<'s, A> {
664 type Item = &'a [u8];
665
666 fn next(&'a mut self) -> Option<&'a [u8]> {
667 self.0.next().map(|(key, _)| key)
668 }
669}
670
671/// A lexicographically ordered stream of key-state pairs from a set and
672/// an automaton.
673///
674/// The keys are from the set while the states are from the automaton.
675///
676/// The `A` type parameter corresponds to an optional automaton to filter
677/// the stream. By default, no filtering is done.
678///
679/// The `'m` lifetime parameter refers to the lifetime of the underlying set.
680pub struct StreamWithState<'m, A = AlwaysMatch>(raw::StreamWithState<'m, A>)
681where
682 A: Automaton;
683
684impl<'a, 'm, A: 'a + Automaton> Streamer<'a> for StreamWithState<'m, A>
685where
686 A::State: Clone,
687{
688 type Item = (&'a [u8], A::State);
689
690 fn next(&'a mut self) -> Option<(&'a [u8], A::State)> {
691 self.0.next().map(|(key, _, state)| (key, state))
692 }
693}
694
695/// A builder for constructing range queries on streams.
696///
697/// Once all bounds are set, one should call `into_stream` to get a
698/// `Stream`.
699///
700/// Bounds are not additive. That is, if `ge` is called twice on the same
701/// builder, then the second setting wins.
702///
703/// The `A` type parameter corresponds to an optional automaton to filter
704/// the stream. By default, no filtering is done.
705///
706/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
707pub struct StreamBuilder<'s, A = AlwaysMatch>(raw::StreamBuilder<'s, A>);
708
709impl<'s, A: Automaton> StreamBuilder<'s, A> {
710 /// Specify a greater-than-or-equal-to bound.
711 pub fn ge<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
712 StreamBuilder(self.0.ge(bound))
713 }
714
715 /// Specify a greater-than bound.
716 pub fn gt<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
717 StreamBuilder(self.0.gt(bound))
718 }
719
720 /// Specify a less-than-or-equal-to bound.
721 pub fn le<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
722 StreamBuilder(self.0.le(bound))
723 }
724
725 /// Specify a less-than bound.
726 pub fn lt<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
727 StreamBuilder(self.0.lt(bound))
728 }
729}
730
731impl<'s, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'s, A> {
732 type Item = &'a [u8];
733 type Into = Stream<'s, A>;
734
735 fn into_stream(self) -> Stream<'s, A> {
736 Stream(self.0.into_stream())
737 }
738}
739
740/// A builder for constructing range queries on streams that include automaton
741/// states.
742///
743/// In general, one should use `StreamBuilder` unless you have a specific need
744/// for accessing the states of the underlying automaton that is being used to
745/// filter this stream.
746///
747/// Once all bounds are set, one should call `into_stream` to get a
748/// `Stream`.
749///
750/// Bounds are not additive. That is, if `ge` is called twice on the same
751/// builder, then the second setting wins.
752///
753/// The `A` type parameter corresponds to an optional automaton to filter
754/// the stream. By default, no filtering is done.
755///
756/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
757pub struct StreamWithStateBuilder<'s, A = AlwaysMatch>(
758 raw::StreamWithStateBuilder<'s, A>,
759);
760
761impl<'s, A: Automaton> StreamWithStateBuilder<'s, A> {
762 /// Specify a greater-than-or-equal-to bound.
763 pub fn ge<T: AsRef<[u8]>>(
764 self,
765 bound: T,
766 ) -> StreamWithStateBuilder<'s, A> {
767 StreamWithStateBuilder(self.0.ge(bound))
768 }
769
770 /// Specify a greater-than bound.
771 pub fn gt<T: AsRef<[u8]>>(
772 self,
773 bound: T,
774 ) -> StreamWithStateBuilder<'s, A> {
775 StreamWithStateBuilder(self.0.gt(bound))
776 }
777
778 /// Specify a less-than-or-equal-to bound.
779 pub fn le<T: AsRef<[u8]>>(
780 self,
781 bound: T,
782 ) -> StreamWithStateBuilder<'s, A> {
783 StreamWithStateBuilder(self.0.le(bound))
784 }
785
786 /// Specify a less-than bound.
787 pub fn lt<T: AsRef<[u8]>>(
788 self,
789 bound: T,
790 ) -> StreamWithStateBuilder<'s, A> {
791 StreamWithStateBuilder(self.0.lt(bound))
792 }
793}
794
795impl<'s, 'a, A: 'a + Automaton> IntoStreamer<'a>
796 for StreamWithStateBuilder<'s, A>
797where
798 A::State: Clone,
799{
800 type Item = (&'a [u8], A::State);
801 type Into = StreamWithState<'s, A>;
802
803 fn into_stream(self) -> StreamWithState<'s, A> {
804 StreamWithState(self.0.into_stream())
805 }
806}
807
808/// A builder for collecting set streams on which to perform set operations.
809///
810/// Set operations include intersection, union, difference and symmetric
811/// difference. The result of each set operation is itself a stream that emits
812/// keys in lexicographic order.
813///
814/// All set operations work efficiently on an arbitrary number of
815/// streams with memory proportional to the number of streams.
816///
817/// The algorithmic complexity of all set operations is `O(n1 + n2 + n3 + ...)`
818/// where `n1, n2, n3, ...` correspond to the number of elements in each
819/// stream.
820///
821/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
822pub struct OpBuilder<'s>(raw::OpBuilder<'s>);
823
824impl<'s> OpBuilder<'s> {
825 /// Create a new set operation builder.
826 #[inline]
827 pub fn new() -> OpBuilder<'s> {
828 OpBuilder(raw::OpBuilder::new())
829 }
830
831 /// Add a stream to this set operation.
832 ///
833 /// This is useful for a chaining style pattern, e.g.,
834 /// `builder.add(stream1).add(stream2).union()`.
835 ///
836 /// The stream must emit a lexicographically ordered sequence of byte
837 /// strings.
838 pub fn add<I, S>(mut self, streamable: I) -> OpBuilder<'s>
839 where
840 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
841 S: 's + for<'a> Streamer<'a, Item = &'a [u8]>,
842 {
843 self.push(streamable);
844 self
845 }
846
847 /// Add a stream to this set operation.
848 ///
849 /// The stream must emit a lexicographically ordered sequence of byte
850 /// strings.
851 pub fn push<I, S>(&mut self, streamable: I)
852 where
853 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
854 S: 's + for<'a> Streamer<'a, Item = &'a [u8]>,
855 {
856 self.0.push(StreamZeroOutput(streamable.into_stream()));
857 }
858
859 /// Performs a union operation on all streams that have been added.
860 ///
861 /// # Example
862 ///
863 /// ```rust
864 /// use fst::{IntoStreamer, Streamer, Set};
865 ///
866 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
867 /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
868 ///
869 /// let mut union = set1.op().add(&set2).union();
870 ///
871 /// let mut keys = vec![];
872 /// while let Some(key) = union.next() {
873 /// keys.push(key.to_vec());
874 /// }
875 /// assert_eq!(keys, vec![b"a", b"b", b"c", b"y", b"z"]);
876 /// ```
877 #[inline]
878 pub fn union(self) -> Union<'s> {
879 Union(self.0.union())
880 }
881
882 /// Performs an intersection operation on all streams that have been added.
883 ///
884 /// # Example
885 ///
886 /// ```rust
887 /// use fst::{IntoStreamer, Streamer, Set};
888 ///
889 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
890 /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
891 ///
892 /// let mut intersection = set1.op().add(&set2).intersection();
893 ///
894 /// let mut keys = vec![];
895 /// while let Some(key) = intersection.next() {
896 /// keys.push(key.to_vec());
897 /// }
898 /// assert_eq!(keys, vec![b"a"]);
899 /// ```
900 #[inline]
901 pub fn intersection(self) -> Intersection<'s> {
902 Intersection(self.0.intersection())
903 }
904
905 /// Performs a difference operation with respect to the first stream added.
906 /// That is, this returns a stream of all elements in the first stream
907 /// that don't exist in any other stream that has been added.
908 ///
909 /// # Example
910 ///
911 /// ```rust
912 /// use fst::{IntoStreamer, Streamer, Set};
913 ///
914 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
915 /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
916 ///
917 /// let mut difference = set1.op().add(&set2).difference();
918 ///
919 /// let mut keys = vec![];
920 /// while let Some(key) = difference.next() {
921 /// keys.push(key.to_vec());
922 /// }
923 /// assert_eq!(keys, vec![b"b", b"c"]);
924 /// ```
925 #[inline]
926 pub fn difference(self) -> Difference<'s> {
927 Difference(self.0.difference())
928 }
929
930 /// Performs a symmetric difference operation on all of the streams that
931 /// have been added.
932 ///
933 /// When there are only two streams, then the keys returned correspond to
934 /// keys that are in either stream but *not* in both streams.
935 ///
936 /// More generally, for any number of streams, keys that occur in an odd
937 /// number of streams are returned.
938 ///
939 /// # Example
940 ///
941 /// ```rust
942 /// use fst::{IntoStreamer, Streamer, Set};
943 ///
944 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
945 /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
946 ///
947 /// let mut sym_difference = set1.op().add(&set2).symmetric_difference();
948 ///
949 /// let mut keys = vec![];
950 /// while let Some(key) = sym_difference.next() {
951 /// keys.push(key.to_vec());
952 /// }
953 /// assert_eq!(keys, vec![b"b", b"c", b"y", b"z"]);
954 /// ```
955 #[inline]
956 pub fn symmetric_difference(self) -> SymmetricDifference<'s> {
957 SymmetricDifference(self.0.symmetric_difference())
958 }
959}
960
961impl<'f, I, S> Extend<I> for OpBuilder<'f>
962where
963 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
964 S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
965{
966 fn extend<T>(&mut self, it: T)
967 where
968 T: IntoIterator<Item = I>,
969 {
970 for stream in it {
971 self.push(stream);
972 }
973 }
974}
975
976impl<'f, I, S> FromIterator<I> for OpBuilder<'f>
977where
978 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
979 S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
980{
981 fn from_iter<T>(it: T) -> OpBuilder<'f>
982 where
983 T: IntoIterator<Item = I>,
984 {
985 let mut op = OpBuilder::new();
986 op.extend(it);
987 op
988 }
989}
990
991/// A stream of set union over multiple streams in lexicographic order.
992///
993/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
994pub struct Union<'s>(raw::Union<'s>);
995
996impl<'a, 's> Streamer<'a> for Union<'s> {
997 type Item = &'a [u8];
998
999 #[inline]
1000 fn next(&'a mut self) -> Option<&'a [u8]> {
1001 self.0.next().map(|(key, _)| key)
1002 }
1003}
1004
1005/// A stream of set intersection over multiple streams in lexicographic order.
1006///
1007/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
1008pub struct Intersection<'s>(raw::Intersection<'s>);
1009
1010impl<'a, 's> Streamer<'a> for Intersection<'s> {
1011 type Item = &'a [u8];
1012
1013 #[inline]
1014 fn next(&'a mut self) -> Option<&'a [u8]> {
1015 self.0.next().map(|(key, _)| key)
1016 }
1017}
1018
1019/// A stream of set difference over multiple streams in lexicographic order.
1020///
1021/// The difference operation is taken with respect to the first stream and the
1022/// rest of the streams. i.e., All elements in the first stream that do not
1023/// appear in any other streams.
1024///
1025/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
1026pub struct Difference<'s>(raw::Difference<'s>);
1027
1028impl<'a, 's> Streamer<'a> for Difference<'s> {
1029 type Item = &'a [u8];
1030
1031 #[inline]
1032 fn next(&'a mut self) -> Option<&'a [u8]> {
1033 self.0.next().map(|(key, _)| key)
1034 }
1035}
1036
1037/// A stream of set symmetric difference over multiple streams in lexicographic
1038/// order.
1039///
1040/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
1041pub struct SymmetricDifference<'s>(raw::SymmetricDifference<'s>);
1042
1043impl<'a, 's> Streamer<'a> for SymmetricDifference<'s> {
1044 type Item = &'a [u8];
1045
1046 #[inline]
1047 fn next(&'a mut self) -> Option<&'a [u8]> {
1048 self.0.next().map(|(key, _)| key)
1049 }
1050}
1051
1052/// A specialized stream for mapping set streams (`&[u8]`) to streams used
1053/// by raw fsts (`(&[u8], Output)`).
1054///
1055/// If this were iterators, we could use `iter::Map`, but doing this on streams
1056/// requires HKT, so we need to write out the monomorphization ourselves.
1057struct StreamZeroOutput<S>(S);
1058
1059impl<'a, S: Streamer<'a>> Streamer<'a> for StreamZeroOutput<S> {
1060 type Item = (S::Item, raw::Output);
1061
1062 fn next(&'a mut self) -> Option<(S::Item, raw::Output)> {
1063 self.0.next().map(|key| (key, raw::Output::zero()))
1064 }
1065}
1066
1067#[cfg(test)]
1068mod tests {
1069 use super::OpBuilder;
1070 use crate::Streamer;
1071
1072 #[test]
1073 fn no_fsts() {
1074 struct Iter<'a> {
1075 i: usize,
1076 xs: Vec<&'a [u8]>,
1077 }
1078
1079 impl<'a> Iter<'a> {
1080 fn new(xs: Vec<&'a [u8]>) -> Iter<'a> {
1081 Iter { i: 0, xs }
1082 }
1083 }
1084
1085 impl<'a, 's> Streamer<'a> for Iter<'s> {
1086 type Item = &'a [u8];
1087 fn next(&'a mut self) -> Option<&'a [u8]> {
1088 if self.i >= self.xs.len() {
1089 None
1090 } else {
1091 let i = self.i;
1092 self.i += 1;
1093 Some(self.xs[i])
1094 }
1095 }
1096 }
1097
1098 let mut stream = OpBuilder::new()
1099 .add(Iter::new(vec![
1100 &b"bar"[..],
1101 &b"baz"[..],
1102 &b"foo"[..],
1103 &b"fubar"[..],
1104 &b"quux"[..],
1105 ]))
1106 .add(Iter::new(vec![&b"bar"[..], &b"foofoo"[..], &b"fubar"[..]]))
1107 .intersection();
1108
1109 let mut got = vec![];
1110 while let Some(x) = stream.next() {
1111 got.push(x.to_vec());
1112 }
1113 assert_eq!(got, vec![&b"bar"[..], &b"fubar"[..]]);
1114 }
1115}