fst/map.rs
1use std::fmt;
2use std::io;
3use std::iter::{self, FromIterator};
4
5use crate::automaton::{AlwaysMatch, Automaton};
6use crate::raw;
7pub use crate::raw::IndexedValue;
8use crate::stream::{IntoStreamer, Streamer};
9use crate::Result;
10
11/// Map is a lexicographically ordered map from byte strings to integers.
12///
13/// A `Map` is constructed with the `MapBuilder` type. Alternatively, a `Map`
14/// can be constructed in memory from a lexicographically ordered iterator
15/// of key-value pairs (`Map::from_iter`).
16///
17/// A key feature of `Map` is that it can be serialized to disk compactly. Its
18/// underlying representation is built such that the `Map` can be memory mapped
19/// and searched without necessarily loading the entire map into memory.
20///
21/// It supports most common operations associated with maps, such as key
22/// lookup and search. It also supports set operations on its keys along with
23/// the ability to specify how conflicting values are merged together. Maps
24/// also support range queries and automata based searches (e.g. a regular
25/// expression).
26///
27/// Maps are represented by a finite state transducer where inputs are the keys
28/// and outputs are the values. As such, maps have the following invariants:
29///
30/// 1. Once constructed, a `Map` can never be modified.
31/// 2. Maps must be constructed with lexicographically ordered byte sequences.
32/// There is no restricting on the ordering of values.
33///
34/// # Differences with sets
35///
36/// Maps and sets are represented by the same underlying data structure: the
37/// finite state transducer. The principal difference between them is that
38/// sets always have their output values set to `0`. This has an impact on the
39/// representation size and is reflected in the type system for convenience.
40/// A secondary but subtle difference is that duplicate keys can be added
41/// to a set, but it is an error to do so with maps. That is, a set can have
42/// the same key added sequentially, but a map can't.
43///
44/// # The future
45///
46/// It is regrettable that the output value is fixed to `u64`. Indeed, it is
47/// not necessary, but it was a major simplification in the implementation.
48/// In the future, the value type may become generic to an extent (outputs must
49/// satisfy a basic algebra).
50///
51/// Keys will always be byte strings; however, we may grow more conveniences
52/// around dealing with them (such as a serialization/deserialization step,
53/// although it isn't clear where exactly this should live).
54#[derive(Clone)]
55pub struct Map<D>(raw::Fst<D>);
56
57impl Map<Vec<u8>> {
58 /// Create a `Map` from an iterator of lexicographically ordered byte
59 /// strings and associated values.
60 ///
61 /// If the iterator does not yield unique keys in lexicographic order, then
62 /// an error is returned.
63 ///
64 /// Note that this is a convenience function to build a map in memory.
65 /// To build a map that streams to an arbitrary `io::Write`, use
66 /// `MapBuilder`.
67 pub fn from_iter<K, I>(iter: I) -> Result<Map<Vec<u8>>>
68 where
69 K: AsRef<[u8]>,
70 I: IntoIterator<Item = (K, u64)>,
71 {
72 let mut builder = MapBuilder::memory();
73 builder.extend_iter(iter)?;
74 Map::new(builder.into_inner()?)
75 }
76}
77
78impl<D: AsRef<[u8]>> Map<D> {
79 /// Creates a map from its representation as a raw byte sequence.
80 ///
81 /// This accepts anything that can be cheaply converted to a `&[u8]`. The
82 /// caller is responsible for guaranteeing that the given bytes refer to
83 /// a valid FST. While memory safety will not be violated by invalid input,
84 /// a panic could occur while reading the FST at any point.
85 ///
86 /// # Example
87 ///
88 /// ```no_run
89 /// use fst::Map;
90 ///
91 /// // File written from a build script using MapBuilder.
92 /// # const IGNORE: &str = stringify! {
93 /// static FST: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/map.fst"));
94 /// # };
95 /// # static FST: &[u8] = &[];
96 ///
97 /// let map = Map::new(FST).unwrap();
98 /// ```
99 pub fn new(data: D) -> Result<Map<D>> {
100 raw::Fst::new(data).map(Map)
101 }
102
103 /// Tests the membership of a single key.
104 ///
105 /// # Example
106 ///
107 /// ```rust
108 /// use fst::Map;
109 ///
110 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
111 ///
112 /// assert_eq!(map.contains_key("b"), true);
113 /// assert_eq!(map.contains_key("z"), false);
114 /// ```
115 pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool {
116 self.0.contains_key(key)
117 }
118
119 /// Retrieves the value associated with a key.
120 ///
121 /// If the key does not exist, then `None` is returned.
122 ///
123 /// # Example
124 ///
125 /// ```rust
126 /// use fst::Map;
127 ///
128 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
129 ///
130 /// assert_eq!(map.get("b"), Some(2));
131 /// assert_eq!(map.get("z"), None);
132 /// ```
133 pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<u64> {
134 self.0.get(key).map(|output| output.value())
135 }
136
137 /// Return a lexicographically ordered stream of all key-value pairs in
138 /// this map.
139 ///
140 /// While this is a stream, it does require heap space proportional to the
141 /// longest key in the map.
142 ///
143 /// If the map is memory mapped, then no further heap space is needed.
144 /// Note though that your operating system may fill your page cache
145 /// (which will cause the resident memory usage of the process to go up
146 /// correspondingly).
147 ///
148 /// # Example
149 ///
150 /// Since streams are not iterators, the traditional `for` loop cannot be
151 /// used. `while let` is useful instead:
152 ///
153 /// ```rust
154 /// use fst::{IntoStreamer, Streamer, Map};
155 ///
156 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
157 /// let mut stream = map.stream();
158 ///
159 /// let mut kvs = vec![];
160 /// while let Some((k, v)) = stream.next() {
161 /// kvs.push((k.to_vec(), v));
162 /// }
163 /// assert_eq!(kvs, vec![
164 /// (b"a".to_vec(), 1),
165 /// (b"b".to_vec(), 2),
166 /// (b"c".to_vec(), 3),
167 /// ]);
168 /// ```
169 #[inline]
170 pub fn stream(&self) -> Stream<'_> {
171 Stream(self.0.stream())
172 }
173
174 /// Return a lexicographically ordered stream of all keys in this map.
175 ///
176 /// Memory requirements are the same as described on `Map::stream`.
177 ///
178 /// # Example
179 ///
180 /// ```rust
181 /// use fst::{IntoStreamer, Streamer, Map};
182 ///
183 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
184 /// let mut stream = map.keys();
185 ///
186 /// let mut keys = vec![];
187 /// while let Some(k) = stream.next() {
188 /// keys.push(k.to_vec());
189 /// }
190 /// assert_eq!(keys, vec![b"a", b"b", b"c"]);
191 /// ```
192 #[inline]
193 pub fn keys(&self) -> Keys<'_> {
194 Keys(self.0.stream())
195 }
196
197 /// Return a stream of all values in this map ordered lexicographically
198 /// by each value's corresponding key.
199 ///
200 /// Memory requirements are the same as described on `Map::stream`.
201 ///
202 /// # Example
203 ///
204 /// ```rust
205 /// use fst::{IntoStreamer, Streamer, Map};
206 ///
207 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
208 /// let mut stream = map.values();
209 ///
210 /// let mut values = vec![];
211 /// while let Some(v) = stream.next() {
212 /// values.push(v);
213 /// }
214 /// assert_eq!(values, vec![1, 2, 3]);
215 /// ```
216 #[inline]
217 pub fn values(&self) -> Values<'_> {
218 Values(self.0.stream())
219 }
220
221 /// Return a builder for range queries.
222 ///
223 /// A range query returns a subset of key-value pairs in this map in a
224 /// range given in lexicographic order.
225 ///
226 /// Memory requirements are the same as described on `Map::stream`.
227 /// Notably, only the keys in the range are read; keys outside the range
228 /// are not.
229 ///
230 /// # Example
231 ///
232 /// Returns only the key-value pairs in the range given.
233 ///
234 /// ```rust
235 /// use fst::{IntoStreamer, Streamer, Map};
236 ///
237 /// let map = Map::from_iter(vec![
238 /// ("a", 1), ("b", 2), ("c", 3), ("d", 4), ("e", 5),
239 /// ]).unwrap();
240 /// let mut stream = map.range().ge("b").lt("e").into_stream();
241 ///
242 /// let mut kvs = vec![];
243 /// while let Some((k, v)) = stream.next() {
244 /// kvs.push((k.to_vec(), v));
245 /// }
246 /// assert_eq!(kvs, vec![
247 /// (b"b".to_vec(), 2),
248 /// (b"c".to_vec(), 3),
249 /// (b"d".to_vec(), 4),
250 /// ]);
251 /// ```
252 #[inline]
253 pub fn range(&self) -> StreamBuilder<'_> {
254 StreamBuilder(self.0.range())
255 }
256
257 /// Executes an automaton on the keys of this map.
258 ///
259 /// Note that this returns a `StreamBuilder`, which can be used to
260 /// add a range query to the search (see the `range` method).
261 ///
262 /// Memory requirements are the same as described on `Map::stream`.
263 ///
264 /// # Example
265 ///
266 /// An implementation of regular expressions for `Automaton` is available
267 /// in the `regex-automata` crate with the `fst1` feature enabled, which
268 /// can be used to search maps.
269 ///
270 /// # Example
271 ///
272 /// An implementation of subsequence search for `Automaton` can be used
273 /// to search maps:
274 ///
275 /// ```rust
276 /// use fst::automaton::Subsequence;
277 /// use fst::{IntoStreamer, Streamer, Map};
278 ///
279 /// # fn main() { example().unwrap(); }
280 /// fn example() -> Result<(), Box<dyn std::error::Error>> {
281 /// let map = Map::from_iter(vec![
282 /// ("a foo bar", 1),
283 /// ("foo", 2),
284 /// ("foo1", 3),
285 /// ("foo2", 4),
286 /// ("foo3", 5),
287 /// ("foobar", 6),
288 /// ]).unwrap();
289 ///
290 /// let matcher = Subsequence::new("for");
291 /// let mut stream = map.search(&matcher).into_stream();
292 ///
293 /// let mut kvs = vec![];
294 /// while let Some((k, v)) = stream.next() {
295 /// kvs.push((String::from_utf8(k.to_vec())?, v));
296 /// }
297 /// assert_eq!(kvs, vec![
298 /// ("a foo bar".to_string(), 1), ("foobar".to_string(), 6),
299 /// ]);
300 ///
301 /// Ok(())
302 /// }
303 /// ```
304 pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<'_, A> {
305 StreamBuilder(self.0.search(aut))
306 }
307
308 /// Executes an automaton on the keys of this map and yields matching
309 /// keys along with the corresponding matching states in the given
310 /// automaton.
311 ///
312 /// Note that this returns a `StreamWithStateBuilder`, which can be used to
313 /// add a range query to the search (see the `range` method).
314 ///
315 /// Memory requirements are the same as described on `Map::stream`.
316 ///
317 #[cfg_attr(
318 feature = "levenshtein",
319 doc = r##"
320# Example
321
322An implementation of fuzzy search using Levenshtein automata can be used
323to search maps:
324
325```rust
326use fst::automaton::Levenshtein;
327use fst::{IntoStreamer, Streamer, Map};
328
329# fn main() { example().unwrap(); }
330fn example() -> Result<(), Box<dyn std::error::Error>> {
331 let map = Map::from_iter(vec![
332 ("foo", 1),
333 ("foob", 2),
334 ("foobar", 3),
335 ("fozb", 4),
336 ]).unwrap();
337
338 let query = Levenshtein::new("foo", 2)?;
339 let mut stream = map.search_with_state(&query).into_stream();
340
341 let mut kvs = vec![];
342 while let Some((k, v, s)) = stream.next() {
343 kvs.push((String::from_utf8(k.to_vec())?, v, s));
344 }
345 // Currently, there isn't much interesting that you can do with the states.
346 assert_eq!(kvs, vec![
347 ("foo".to_string(), 1, Some(183)),
348 ("foob".to_string(), 2, Some(123)),
349 ("fozb".to_string(), 4, Some(83)),
350 ]);
351
352 Ok(())
353}
354```
355"##
356 )]
357 pub fn search_with_state<A: Automaton>(
358 &self,
359 aut: A,
360 ) -> StreamWithStateBuilder<'_, A> {
361 StreamWithStateBuilder(self.0.search_with_state(aut))
362 }
363
364 /// Returns the number of elements in this map.
365 #[inline]
366 pub fn len(&self) -> usize {
367 self.0.len()
368 }
369
370 /// Returns true if and only if this map is empty.
371 #[inline]
372 pub fn is_empty(&self) -> bool {
373 self.0.is_empty()
374 }
375
376 /// Creates a new map operation with this map added to it.
377 ///
378 /// The `OpBuilder` type can be used to add additional map streams
379 /// and perform set operations like union, intersection, difference and
380 /// symmetric difference on the keys of the map. These set operations also
381 /// allow one to specify how conflicting values are merged in the stream.
382 ///
383 /// # Example
384 ///
385 /// This example demonstrates a union on multiple map streams. Notice that
386 /// the stream returned from the union is not a sequence of key-value
387 /// pairs, but rather a sequence of keys associated with one or more
388 /// values. Namely, a key is associated with each value associated with
389 /// that same key in the all of the streams.
390 ///
391 /// ```rust
392 /// use fst::{Streamer, Map};
393 /// use fst::map::IndexedValue;
394 ///
395 /// let map1 = Map::from_iter(vec![
396 /// ("a", 1), ("b", 2), ("c", 3),
397 /// ]).unwrap();
398 /// let map2 = Map::from_iter(vec![
399 /// ("a", 10), ("y", 11), ("z", 12),
400 /// ]).unwrap();
401 ///
402 /// let mut union = map1.op().add(&map2).union();
403 ///
404 /// let mut kvs = vec![];
405 /// while let Some((k, vs)) = union.next() {
406 /// kvs.push((k.to_vec(), vs.to_vec()));
407 /// }
408 /// assert_eq!(kvs, vec![
409 /// (b"a".to_vec(), vec![
410 /// IndexedValue { index: 0, value: 1 },
411 /// IndexedValue { index: 1, value: 10 },
412 /// ]),
413 /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
414 /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
415 /// (b"y".to_vec(), vec![IndexedValue { index: 1, value: 11 }]),
416 /// (b"z".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
417 /// ]);
418 /// ```
419 #[inline]
420 pub fn op(&self) -> OpBuilder<'_> {
421 OpBuilder::new().add(self)
422 }
423
424 /// Returns a reference to the underlying raw finite state transducer.
425 #[inline]
426 pub fn as_fst(&self) -> &raw::Fst<D> {
427 &self.0
428 }
429
430 /// Returns the underlying raw finite state transducer.
431 #[inline]
432 pub fn into_fst(self) -> raw::Fst<D> {
433 self.0
434 }
435
436 /// Maps the underlying data of the fst Map to another data type.
437 ///
438 /// # Example
439 ///
440 /// This example shows that you can map an fst Map based on a `Vec<u8>`
441 /// into an fst Map based on a `Cow<[u8]>`, it can also work with a
442 /// reference counted type (e.g. `Arc`, `Rc`).
443 ///
444 /// ```
445 /// use std::borrow::Cow;
446 ///
447 /// use fst::Map;
448 ///
449 /// let map: Map<Vec<u8>> = Map::from_iter(
450 /// [("hello", 12), ("world", 42)].iter().cloned(),
451 /// ).unwrap();
452 ///
453 /// let map_on_cow: Map<Cow<[u8]>> = map.map_data(Cow::Owned).unwrap();
454 /// ```
455 #[inline]
456 pub fn map_data<F, T>(self, f: F) -> Result<Map<T>>
457 where
458 F: FnMut(D) -> T,
459 T: AsRef<[u8]>,
460 {
461 self.into_fst().map_data(f).map(Map::from)
462 }
463}
464
465impl Default for Map<Vec<u8>> {
466 #[inline]
467 fn default() -> Map<Vec<u8>> {
468 Map::from_iter(iter::empty::<(&[u8], u64)>()).unwrap()
469 }
470}
471
472impl<D: AsRef<[u8]>> fmt::Debug for Map<D> {
473 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
474 write!(f, "Map([")?;
475 let mut stream = self.stream();
476 let mut first = true;
477 while let Some((k, v)) = stream.next() {
478 if !first {
479 write!(f, ", ")?;
480 }
481 first = false;
482 write!(f, "({}, {})", String::from_utf8_lossy(k), v)?;
483 }
484 write!(f, "])")
485 }
486}
487
488// Construct a map from an Fst object.
489impl<D: AsRef<[u8]>> From<raw::Fst<D>> for Map<D> {
490 #[inline]
491 fn from(fst: raw::Fst<D>) -> Map<D> {
492 Map(fst)
493 }
494}
495
496/// Returns the underlying finite state transducer.
497impl<D: AsRef<[u8]>> AsRef<raw::Fst<D>> for Map<D> {
498 #[inline]
499 fn as_ref(&self) -> &raw::Fst<D> {
500 &self.0
501 }
502}
503
504impl<'m, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'m Map<D> {
505 type Item = (&'a [u8], u64);
506 type Into = Stream<'m>;
507
508 #[inline]
509 fn into_stream(self) -> Stream<'m> {
510 Stream(self.0.stream())
511 }
512}
513
514/// A builder for creating a map.
515///
516/// This is not your average everyday builder. It has two important qualities
517/// that make it a bit unique from what you might expect:
518///
519/// 1. All keys must be added in lexicographic order. Adding a key out of order
520/// will result in an error. Additionally, adding a duplicate key will also
521/// result in an error. That is, once a key is associated with a value,
522/// that association can never be modified or deleted.
523/// 2. The representation of a map is streamed to *any* `io::Write` as it is
524/// built. For an in memory representation, this can be a `Vec<u8>`.
525///
526/// Point (2) is especially important because it means that a map can be
527/// constructed *without storing the entire map in memory*. Namely, since it
528/// works with any `io::Write`, it can be streamed directly to a file.
529///
530/// With that said, the builder does use memory, but **memory usage is bounded
531/// to a constant size**. The amount of memory used trades off with the
532/// compression ratio. Currently, the implementation hard codes this trade off
533/// which can result in about 5-20MB of heap usage during construction. (N.B.
534/// Guaranteeing a maximal compression ratio requires memory proportional to
535/// the size of the map, which defeats some of the benefit of streaming
536/// it to disk. In practice, a small bounded amount of memory achieves
537/// close-to-minimal compression ratios.)
538///
539/// The algorithmic complexity of map construction is `O(n)` where `n` is the
540/// number of elements added to the map.
541///
542/// # Example: build in memory
543///
544/// This shows how to use the builder to construct a map in memory. Note that
545/// `Map::from_iter` provides a convenience function that achieves this same
546/// goal without needing to explicitly use `MapBuilder`.
547///
548/// ```rust
549/// use fst::{IntoStreamer, Streamer, Map, MapBuilder};
550///
551/// let mut build = MapBuilder::memory();
552/// build.insert("bruce", 1).unwrap();
553/// build.insert("clarence", 2).unwrap();
554/// build.insert("stevie", 3).unwrap();
555///
556/// // You could also call `finish()` here, but since we're building the map in
557/// // memory, there would be no way to get the `Vec<u8>` back.
558/// let bytes = build.into_inner().unwrap();
559///
560/// // At this point, the map has been constructed, but here's how to read it.
561/// let map = Map::new(bytes).unwrap();
562/// let mut stream = map.into_stream();
563/// let mut kvs = vec![];
564/// while let Some((k, v)) = stream.next() {
565/// kvs.push((k.to_vec(), v));
566/// }
567/// assert_eq!(kvs, vec![
568/// (b"bruce".to_vec(), 1),
569/// (b"clarence".to_vec(), 2),
570/// (b"stevie".to_vec(), 3),
571/// ]);
572/// ```
573///
574/// # Example: stream to file
575///
576/// This shows how to do stream construction of a map to a file.
577///
578/// ```rust,no_run
579/// use std::fs::File;
580/// use std::io;
581///
582/// use fst::{IntoStreamer, Streamer, Map, MapBuilder};
583///
584/// let mut wtr = io::BufWriter::new(File::create("map.fst").unwrap());
585/// let mut build = MapBuilder::new(wtr).unwrap();
586/// build.insert("bruce", 1).unwrap();
587/// build.insert("clarence", 2).unwrap();
588/// build.insert("stevie", 3).unwrap();
589///
590/// // If you want the writer back, then call `into_inner`. Otherwise, this
591/// // will finish construction and call `flush`.
592/// build.finish().unwrap();
593///
594/// // At this point, the map has been constructed, but here's how to read it.
595/// // NOTE: Normally, one would memory map a file instead of reading its
596/// // entire contents on to the heap.
597/// let map = Map::new(std::fs::read("map.fst").unwrap()).unwrap();
598/// let mut stream = map.into_stream();
599/// let mut kvs = vec![];
600/// while let Some((k, v)) = stream.next() {
601/// kvs.push((k.to_vec(), v));
602/// }
603/// assert_eq!(kvs, vec![
604/// (b"bruce".to_vec(), 1),
605/// (b"clarence".to_vec(), 2),
606/// (b"stevie".to_vec(), 3),
607/// ]);
608/// ```
609pub struct MapBuilder<W>(raw::Builder<W>);
610
611impl MapBuilder<Vec<u8>> {
612 /// Create a builder that builds a map in memory.
613 #[inline]
614 pub fn memory() -> MapBuilder<Vec<u8>> {
615 MapBuilder(raw::Builder::memory())
616 }
617
618 /// Finishes the construction of the map and returns it.
619 #[inline]
620 pub fn into_map(self) -> Map<Vec<u8>> {
621 Map(self.0.into_fst())
622 }
623}
624
625impl<W: io::Write> MapBuilder<W> {
626 /// Create a builder that builds a map by writing it to `wtr` in a
627 /// streaming fashion.
628 pub fn new(wtr: W) -> Result<MapBuilder<W>> {
629 raw::Builder::new_type(wtr, 0).map(MapBuilder)
630 }
631
632 /// Insert a new key-value pair into the map.
633 ///
634 /// Keys must be convertible to byte strings. Values must be a `u64`, which
635 /// is a restriction of the current implementation of finite state
636 /// transducers. (Values may one day be expanded to other types.)
637 ///
638 /// If a key is inserted that is less than or equal to any previous key
639 /// added, then an error is returned. Similarly, if there was a problem
640 /// writing to the underlying writer, an error is returned.
641 pub fn insert<K: AsRef<[u8]>>(&mut self, key: K, val: u64) -> Result<()> {
642 self.0.insert(key, val)
643 }
644
645 /// Calls insert on each item in the iterator.
646 ///
647 /// If an error occurred while adding an element, processing is stopped
648 /// and the error is returned.
649 ///
650 /// If a key is inserted that is less than or equal to any previous key
651 /// added, then an error is returned. Similarly, if there was a problem
652 /// writing to the underlying writer, an error is returned.
653 pub fn extend_iter<K, I>(&mut self, iter: I) -> Result<()>
654 where
655 K: AsRef<[u8]>,
656 I: IntoIterator<Item = (K, u64)>,
657 {
658 self.0.extend_iter(
659 iter.into_iter().map(|(k, v)| (k, raw::Output::new(v))),
660 )
661 }
662
663 /// Calls insert on each item in the stream.
664 ///
665 /// Note that unlike `extend_iter`, this is not generic on the items in
666 /// the stream.
667 ///
668 /// If a key is inserted that is less than or equal to any previous key
669 /// added, then an error is returned. Similarly, if there was a problem
670 /// writing to the underlying writer, an error is returned.
671 pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
672 where
673 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
674 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
675 {
676 self.0.extend_stream(StreamOutput(stream.into_stream()))
677 }
678
679 /// Finishes the construction of the map and flushes the underlying
680 /// writer. After completion, the data written to `W` may be read using
681 /// one of `Map`'s constructor methods.
682 pub fn finish(self) -> Result<()> {
683 self.0.finish()
684 }
685
686 /// Just like `finish`, except it returns the underlying writer after
687 /// flushing it.
688 pub fn into_inner(self) -> Result<W> {
689 self.0.into_inner()
690 }
691
692 /// Gets a reference to the underlying writer.
693 pub fn get_ref(&self) -> &W {
694 self.0.get_ref()
695 }
696
697 /// Returns the number of bytes written to the underlying writer
698 pub fn bytes_written(&self) -> u64 {
699 self.0.bytes_written()
700 }
701}
702
703/// A lexicographically ordered stream of key-value pairs from a map.
704///
705/// The `A` type parameter corresponds to an optional automaton to filter
706/// the stream. By default, no filtering is done.
707///
708/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
709pub struct Stream<'m, A = AlwaysMatch>(raw::Stream<'m, A>)
710where
711 A: Automaton;
712
713impl<'a, 'm, A: Automaton> Streamer<'a> for Stream<'m, A> {
714 type Item = (&'a [u8], u64);
715
716 fn next(&'a mut self) -> Option<(&'a [u8], u64)> {
717 self.0.next().map(|(key, out)| (key, out.value()))
718 }
719}
720
721impl<'m, A: Automaton> Stream<'m, A> {
722 /// Convert this stream into a vector of byte strings and outputs.
723 ///
724 /// Note that this creates a new allocation for every key in the stream.
725 pub fn into_byte_vec(self) -> Vec<(Vec<u8>, u64)> {
726 self.0.into_byte_vec()
727 }
728
729 /// Convert this stream into a vector of Unicode strings and outputs.
730 ///
731 /// If any key is not valid UTF-8, then iteration on the stream is stopped
732 /// and a UTF-8 decoding error is returned.
733 ///
734 /// Note that this creates a new allocation for every key in the stream.
735 pub fn into_str_vec(self) -> Result<Vec<(String, u64)>> {
736 self.0.into_str_vec()
737 }
738
739 /// Convert this stream into a vector of byte strings.
740 ///
741 /// Note that this creates a new allocation for every key in the stream.
742 pub fn into_byte_keys(self) -> Vec<Vec<u8>> {
743 self.0.into_byte_keys()
744 }
745
746 /// Convert this stream into a vector of Unicode strings.
747 ///
748 /// If any key is not valid UTF-8, then iteration on the stream is stopped
749 /// and a UTF-8 decoding error is returned.
750 ///
751 /// Note that this creates a new allocation for every key in the stream.
752 pub fn into_str_keys(self) -> Result<Vec<String>> {
753 self.0.into_str_keys()
754 }
755
756 /// Convert this stream into a vector of outputs.
757 pub fn into_values(self) -> Vec<u64> {
758 self.0.into_values()
759 }
760}
761
762/// A lexicographically ordered stream of key-value-state triples from a map
763/// and an automaton.
764///
765/// The key-values are from the map while the states are from the automaton.
766///
767/// The `A` type parameter corresponds to an optional automaton to filter
768/// the stream. By default, no filtering is done.
769///
770/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
771pub struct StreamWithState<'m, A = AlwaysMatch>(raw::StreamWithState<'m, A>)
772where
773 A: Automaton;
774
775impl<'a, 'm, A: 'a + Automaton> Streamer<'a> for StreamWithState<'m, A>
776where
777 A::State: Clone,
778{
779 type Item = (&'a [u8], u64, A::State);
780
781 fn next(&'a mut self) -> Option<(&'a [u8], u64, A::State)> {
782 self.0.next().map(|(key, out, state)| (key, out.value(), state))
783 }
784}
785
786/// A lexicographically ordered stream of keys from a map.
787///
788/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
789pub struct Keys<'m>(raw::Stream<'m>);
790
791impl<'a, 'm> Streamer<'a> for Keys<'m> {
792 type Item = &'a [u8];
793
794 #[inline]
795 fn next(&'a mut self) -> Option<&'a [u8]> {
796 self.0.next().map(|(key, _)| key)
797 }
798}
799
800/// A stream of values from a map, lexicographically ordered by each value's
801/// corresponding key.
802///
803/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
804pub struct Values<'m>(raw::Stream<'m>);
805
806impl<'a, 'm> Streamer<'a> for Values<'m> {
807 type Item = u64;
808
809 #[inline]
810 fn next(&'a mut self) -> Option<u64> {
811 self.0.next().map(|(_, out)| out.value())
812 }
813}
814
815/// A builder for constructing range queries on streams.
816///
817/// Once all bounds are set, one should call `into_stream` to get a
818/// `Stream`.
819///
820/// Bounds are not additive. That is, if `ge` is called twice on the same
821/// builder, then the second setting wins.
822///
823/// The `A` type parameter corresponds to an optional automaton to filter
824/// the stream. By default, no filtering is done.
825///
826/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
827pub struct StreamBuilder<'m, A = AlwaysMatch>(raw::StreamBuilder<'m, A>);
828
829impl<'m, A: Automaton> StreamBuilder<'m, A> {
830 /// Specify a greater-than-or-equal-to bound.
831 pub fn ge<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'m, A> {
832 StreamBuilder(self.0.ge(bound))
833 }
834
835 /// Specify a greater-than bound.
836 pub fn gt<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'m, A> {
837 StreamBuilder(self.0.gt(bound))
838 }
839
840 /// Specify a less-than-or-equal-to bound.
841 pub fn le<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'m, A> {
842 StreamBuilder(self.0.le(bound))
843 }
844
845 /// Specify a less-than bound.
846 pub fn lt<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'m, A> {
847 StreamBuilder(self.0.lt(bound))
848 }
849}
850
851impl<'m, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'m, A> {
852 type Item = (&'a [u8], u64);
853 type Into = Stream<'m, A>;
854
855 fn into_stream(self) -> Stream<'m, A> {
856 Stream(self.0.into_stream())
857 }
858}
859
860/// A builder for constructing range queries on streams that include automaton
861/// states.
862///
863/// In general, one should use `StreamBuilder` unless you have a specific need
864/// for accessing the states of the underlying automaton that is being used to
865/// filter this stream.
866///
867/// Once all bounds are set, one should call `into_stream` to get a
868/// `Stream`.
869///
870/// Bounds are not additive. That is, if `ge` is called twice on the same
871/// builder, then the second setting wins.
872///
873/// The `A` type parameter corresponds to an optional automaton to filter
874/// the stream. By default, no filtering is done.
875///
876/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
877pub struct StreamWithStateBuilder<'m, A = AlwaysMatch>(
878 raw::StreamWithStateBuilder<'m, A>,
879);
880
881impl<'m, A: Automaton> StreamWithStateBuilder<'m, A> {
882 /// Specify a greater-than-or-equal-to bound.
883 pub fn ge<T: AsRef<[u8]>>(
884 self,
885 bound: T,
886 ) -> StreamWithStateBuilder<'m, A> {
887 StreamWithStateBuilder(self.0.ge(bound))
888 }
889
890 /// Specify a greater-than bound.
891 pub fn gt<T: AsRef<[u8]>>(
892 self,
893 bound: T,
894 ) -> StreamWithStateBuilder<'m, A> {
895 StreamWithStateBuilder(self.0.gt(bound))
896 }
897
898 /// Specify a less-than-or-equal-to bound.
899 pub fn le<T: AsRef<[u8]>>(
900 self,
901 bound: T,
902 ) -> StreamWithStateBuilder<'m, A> {
903 StreamWithStateBuilder(self.0.le(bound))
904 }
905
906 /// Specify a less-than bound.
907 pub fn lt<T: AsRef<[u8]>>(
908 self,
909 bound: T,
910 ) -> StreamWithStateBuilder<'m, A> {
911 StreamWithStateBuilder(self.0.lt(bound))
912 }
913}
914
915impl<'m, 'a, A: 'a + Automaton> IntoStreamer<'a>
916 for StreamWithStateBuilder<'m, A>
917where
918 A::State: Clone,
919{
920 type Item = (&'a [u8], u64, A::State);
921 type Into = StreamWithState<'m, A>;
922
923 fn into_stream(self) -> StreamWithState<'m, A> {
924 StreamWithState(self.0.into_stream())
925 }
926}
927
928/// A builder for collecting map streams on which to perform set operations
929/// on the keys of maps.
930///
931/// Set operations include intersection, union, difference and symmetric
932/// difference. The result of each set operation is itself a stream that emits
933/// pairs of keys and a sequence of each occurrence of that key in the
934/// participating streams. This information allows one to perform set
935/// operations on maps and customize how conflicting output values are handled.
936///
937/// All set operations work efficiently on an arbitrary number of
938/// streams with memory proportional to the number of streams.
939///
940/// The algorithmic complexity of all set operations is `O(n1 + n2 + n3 + ...)`
941/// where `n1, n2, n3, ...` correspond to the number of elements in each
942/// stream.
943///
944/// The `'m` lifetime parameter refers to the lifetime of the underlying set.
945pub struct OpBuilder<'m>(raw::OpBuilder<'m>);
946
947impl<'m> OpBuilder<'m> {
948 /// Create a new set operation builder.
949 #[inline]
950 pub fn new() -> OpBuilder<'m> {
951 OpBuilder(raw::OpBuilder::new())
952 }
953
954 /// Add a stream to this set operation.
955 ///
956 /// This is useful for a chaining style pattern, e.g.,
957 /// `builder.add(stream1).add(stream2).union()`.
958 ///
959 /// The stream must emit a lexicographically ordered sequence of key-value
960 /// pairs.
961 pub fn add<I, S>(mut self, streamable: I) -> OpBuilder<'m>
962 where
963 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
964 S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
965 {
966 self.push(streamable);
967 self
968 }
969
970 /// Add a stream to this set operation.
971 ///
972 /// The stream must emit a lexicographically ordered sequence of key-value
973 /// pairs.
974 pub fn push<I, S>(&mut self, streamable: I)
975 where
976 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
977 S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
978 {
979 self.0.push(StreamOutput(streamable.into_stream()));
980 }
981
982 /// Performs a union operation on all streams that have been added.
983 ///
984 /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
985 /// first element of the tuple is the byte string key. The second element
986 /// of the tuple is a list of all occurrences of that key in participating
987 /// streams. The `IndexedValue` contains an index and the value associated
988 /// with that key in that stream. The index uniquely identifies each
989 /// stream, which is an integer that is auto-incremented when a stream
990 /// is added to this operation (starting at `0`).
991 ///
992 /// # Example
993 ///
994 /// ```rust
995 /// use fst::{IntoStreamer, Streamer, Map};
996 /// use fst::map::IndexedValue;
997 ///
998 /// let map1 = Map::from_iter(vec![
999 /// ("a", 1), ("b", 2), ("c", 3),
1000 /// ]).unwrap();
1001 /// let map2 = Map::from_iter(vec![
1002 /// ("a", 11), ("y", 12), ("z", 13),
1003 /// ]).unwrap();
1004 ///
1005 /// let mut union = map1.op().add(&map2).union();
1006 ///
1007 /// let mut kvs = vec![];
1008 /// while let Some((k, vs)) = union.next() {
1009 /// kvs.push((k.to_vec(), vs.to_vec()));
1010 /// }
1011 /// assert_eq!(kvs, vec![
1012 /// (b"a".to_vec(), vec![
1013 /// IndexedValue { index: 0, value: 1 },
1014 /// IndexedValue { index: 1, value: 11 },
1015 /// ]),
1016 /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
1017 /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
1018 /// (b"y".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
1019 /// (b"z".to_vec(), vec![IndexedValue { index: 1, value: 13 }]),
1020 /// ]);
1021 /// ```
1022 #[inline]
1023 pub fn union(self) -> Union<'m> {
1024 Union(self.0.union())
1025 }
1026
1027 /// Performs an intersection operation on all streams that have been added.
1028 ///
1029 /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
1030 /// first element of the tuple is the byte string key. The second element
1031 /// of the tuple is a list of all occurrences of that key in participating
1032 /// streams. The `IndexedValue` contains an index and the value associated
1033 /// with that key in that stream. The index uniquely identifies each
1034 /// stream, which is an integer that is auto-incremented when a stream
1035 /// is added to this operation (starting at `0`).
1036 ///
1037 /// # Example
1038 ///
1039 /// ```rust
1040 /// use fst::{IntoStreamer, Streamer, Map};
1041 /// use fst::map::IndexedValue;
1042 ///
1043 /// let map1 = Map::from_iter(vec![
1044 /// ("a", 1), ("b", 2), ("c", 3),
1045 /// ]).unwrap();
1046 /// let map2 = Map::from_iter(vec![
1047 /// ("a", 11), ("y", 12), ("z", 13),
1048 /// ]).unwrap();
1049 ///
1050 /// let mut intersection = map1.op().add(&map2).intersection();
1051 ///
1052 /// let mut kvs = vec![];
1053 /// while let Some((k, vs)) = intersection.next() {
1054 /// kvs.push((k.to_vec(), vs.to_vec()));
1055 /// }
1056 /// assert_eq!(kvs, vec![
1057 /// (b"a".to_vec(), vec![
1058 /// IndexedValue { index: 0, value: 1 },
1059 /// IndexedValue { index: 1, value: 11 },
1060 /// ]),
1061 /// ]);
1062 /// ```
1063 #[inline]
1064 pub fn intersection(self) -> Intersection<'m> {
1065 Intersection(self.0.intersection())
1066 }
1067
1068 /// Performs a difference operation with respect to the first stream added.
1069 /// That is, this returns a stream of all elements in the first stream
1070 /// that don't exist in any other stream that has been added.
1071 ///
1072 /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
1073 /// first element of the tuple is the byte string key. The second element
1074 /// of the tuple is a list of all occurrences of that key in participating
1075 /// streams. The `IndexedValue` contains an index and the value associated
1076 /// with that key in that stream. The index uniquely identifies each
1077 /// stream, which is an integer that is auto-incremented when a stream
1078 /// is added to this operation (starting at `0`).
1079 ///
1080 /// While the interface is the same for all the operations combining multiple
1081 /// maps, due to the nature of `difference` there's exactly one `IndexValue`
1082 /// for each yielded value.
1083 ///
1084 /// # Example
1085 ///
1086 /// ```rust
1087 /// use fst::{Streamer, Map};
1088 /// use fst::map::IndexedValue;
1089 ///
1090 /// let map1 = Map::from_iter(vec![
1091 /// ("a", 1), ("b", 2), ("c", 3),
1092 /// ]).unwrap();
1093 /// let map2 = Map::from_iter(vec![
1094 /// ("a", 11), ("y", 12), ("z", 13),
1095 /// ]).unwrap();
1096 ///
1097 /// let mut difference = map1.op().add(&map2).difference();
1098 ///
1099 /// let mut kvs = vec![];
1100 /// while let Some((k, vs)) = difference.next() {
1101 /// kvs.push((k.to_vec(), vs.to_vec()));
1102 /// }
1103 /// assert_eq!(kvs, vec![
1104 /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
1105 /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
1106 /// ]);
1107 /// ```
1108 #[inline]
1109 pub fn difference(self) -> Difference<'m> {
1110 Difference(self.0.difference())
1111 }
1112
1113 /// Performs a symmetric difference operation on all of the streams that
1114 /// have been added.
1115 ///
1116 /// When there are only two streams, then the keys returned correspond to
1117 /// keys that are in either stream but *not* in both streams.
1118 ///
1119 /// More generally, for any number of streams, keys that occur in an odd
1120 /// number of streams are returned.
1121 ///
1122 /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
1123 /// first element of the tuple is the byte string key. The second element
1124 /// of the tuple is a list of all occurrences of that key in participating
1125 /// streams. The `IndexedValue` contains an index and the value associated
1126 /// with that key in that stream. The index uniquely identifies each
1127 /// stream, which is an integer that is auto-incremented when a stream
1128 /// is added to this operation (starting at `0`).
1129 ///
1130 /// # Example
1131 ///
1132 /// ```rust
1133 /// use fst::{IntoStreamer, Streamer, Map};
1134 /// use fst::map::IndexedValue;
1135 ///
1136 /// let map1 = Map::from_iter(vec![
1137 /// ("a", 1), ("b", 2), ("c", 3),
1138 /// ]).unwrap();
1139 /// let map2 = Map::from_iter(vec![
1140 /// ("a", 11), ("y", 12), ("z", 13),
1141 /// ]).unwrap();
1142 ///
1143 /// let mut sym_difference = map1.op().add(&map2).symmetric_difference();
1144 ///
1145 /// let mut kvs = vec![];
1146 /// while let Some((k, vs)) = sym_difference.next() {
1147 /// kvs.push((k.to_vec(), vs.to_vec()));
1148 /// }
1149 /// assert_eq!(kvs, vec![
1150 /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
1151 /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
1152 /// (b"y".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
1153 /// (b"z".to_vec(), vec![IndexedValue { index: 1, value: 13 }]),
1154 /// ]);
1155 /// ```
1156 #[inline]
1157 pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
1158 SymmetricDifference(self.0.symmetric_difference())
1159 }
1160}
1161
1162impl<'f, I, S> Extend<I> for OpBuilder<'f>
1163where
1164 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
1165 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
1166{
1167 fn extend<T>(&mut self, it: T)
1168 where
1169 T: IntoIterator<Item = I>,
1170 {
1171 for stream in it {
1172 self.push(stream);
1173 }
1174 }
1175}
1176
1177impl<'f, I, S> FromIterator<I> for OpBuilder<'f>
1178where
1179 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
1180 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
1181{
1182 fn from_iter<T>(it: T) -> OpBuilder<'f>
1183 where
1184 T: IntoIterator<Item = I>,
1185 {
1186 let mut op = OpBuilder::new();
1187 op.extend(it);
1188 op
1189 }
1190}
1191
1192/// A stream of set union over multiple map streams in lexicographic order.
1193///
1194/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1195pub struct Union<'m>(raw::Union<'m>);
1196
1197impl<'a, 'm> Streamer<'a> for Union<'m> {
1198 type Item = (&'a [u8], &'a [IndexedValue]);
1199
1200 #[inline]
1201 fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> {
1202 self.0.next()
1203 }
1204}
1205
1206/// A stream of set intersection over multiple map streams in lexicographic
1207/// order.
1208///
1209/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1210pub struct Intersection<'m>(raw::Intersection<'m>);
1211
1212impl<'a, 'm> Streamer<'a> for Intersection<'m> {
1213 type Item = (&'a [u8], &'a [IndexedValue]);
1214
1215 #[inline]
1216 fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> {
1217 self.0.next()
1218 }
1219}
1220
1221/// A stream of set difference over multiple map streams in lexicographic
1222/// order.
1223///
1224/// The difference operation is taken with respect to the first stream and the
1225/// rest of the streams. i.e., All elements in the first stream that do not
1226/// appear in any other streams.
1227///
1228/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1229pub struct Difference<'m>(raw::Difference<'m>);
1230
1231impl<'a, 'm> Streamer<'a> for Difference<'m> {
1232 type Item = (&'a [u8], &'a [IndexedValue]);
1233
1234 #[inline]
1235 fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> {
1236 self.0.next()
1237 }
1238}
1239
1240/// A stream of set symmetric difference over multiple map streams in
1241/// lexicographic order.
1242///
1243/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1244pub struct SymmetricDifference<'m>(raw::SymmetricDifference<'m>);
1245
1246impl<'a, 'm> Streamer<'a> for SymmetricDifference<'m> {
1247 type Item = (&'a [u8], &'a [IndexedValue]);
1248
1249 #[inline]
1250 fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> {
1251 self.0.next()
1252 }
1253}
1254
1255/// A specialized stream for mapping map streams (`(&[u8], u64)`) to streams
1256/// used by raw fsts (`(&[u8], Output)`).
1257///
1258/// If this were iterators, we could use `iter::Map`, but doing this on streams
1259/// requires HKT, so we need to write out the monomorphization ourselves.
1260struct StreamOutput<S>(S);
1261
1262impl<'a, S> Streamer<'a> for StreamOutput<S>
1263where
1264 S: Streamer<'a, Item = (&'a [u8], u64)>,
1265{
1266 type Item = (&'a [u8], raw::Output);
1267
1268 fn next(&'a mut self) -> Option<(&'a [u8], raw::Output)> {
1269 self.0.next().map(|(k, v)| (k, raw::Output::new(v)))
1270 }
1271}