fst/raw/mod.rs
1/*!
2Operations on raw finite state transducers.
3
4This sub-module exposes the guts of a finite state transducer. Many parts of
5it, such as construction and traversal, are mirrored in the `set` and `map`
6sub-modules. Other parts of it, such as direct access to nodes and transitions
7in the transducer, do not have any analog.
8
9# Overview of types
10
11`Fst` is a read only interface to pre-constructed finite state transducers.
12`Node` is a read only interface to a single node in a transducer. `Builder` is
13used to create new finite state transducers. (Once a transducer is created, it
14can never be modified.) `Stream` is a stream of all inputs and outputs in a
15transducer. `StreamBuilder` builds range queries. `OpBuilder` collects streams
16and executes set operations like `union` or `intersection` on them with the
17option of specifying a merge strategy for output values.
18
19Most of the rest of the types are streams from set operations.
20*/
21use std::cmp;
22use std::fmt;
23
24use crate::automaton::{AlwaysMatch, Automaton};
25use crate::bytes;
26use crate::error::Result;
27use crate::stream::{IntoStreamer, Streamer};
28
29pub use crate::raw::build::Builder;
30pub use crate::raw::error::Error;
31pub use crate::raw::node::{Node, Transitions};
32pub use crate::raw::ops::{
33 Difference, IndexedValue, Intersection, OpBuilder, SymmetricDifference,
34 Union,
35};
36
37mod build;
38mod common_inputs;
39mod counting_writer;
40mod crc32;
41mod crc32_table;
42mod error;
43mod node;
44mod ops;
45mod registry;
46mod registry_minimal;
47#[cfg(test)]
48mod tests;
49
50/// The API version of this crate.
51///
52/// This version number is written to every finite state transducer created by
53/// this crate. When a finite state transducer is read, its version number is
54/// checked against this value.
55///
56/// Currently, any version mismatch results in an error. Fixing this requires
57/// regenerating the finite state transducer or switching to a version of this
58/// crate that is compatible with the serialized transducer. This particular
59/// behavior may be relaxed in future versions.
60pub const VERSION: u64 = 3;
61
62/// A sentinel value used to indicate an empty final state.
63const EMPTY_ADDRESS: CompiledAddr = 0;
64
65/// A sentinel value used to indicate an invalid state.
66///
67/// This is never the address of a node in a serialized transducer.
68const NONE_ADDRESS: CompiledAddr = 1;
69
70/// FstType is a convention used to indicate the type of the underlying
71/// transducer.
72///
73/// This crate reserves the range 0-255 (inclusive) but currently leaves the
74/// meaning of 0-255 unspecified.
75pub type FstType = u64;
76
77/// CompiledAddr is the type used to address nodes in a finite state
78/// transducer.
79///
80/// It is most useful as a pointer to nodes. It can be used in the `Fst::node`
81/// method to resolve the pointer.
82pub type CompiledAddr = usize;
83
84/// An acyclic deterministic finite state transducer.
85///
86/// # How does it work?
87///
88/// The short answer: it's just like a prefix trie, which compresses keys
89/// based only on their prefixes, except that a automaton/transducer also
90/// compresses suffixes.
91///
92/// The longer answer is that keys in an automaton are stored only in the
93/// transitions from one state to another. A key can be acquired by tracing
94/// a path from the root of the automaton to any match state. The inputs along
95/// each transition are concatenated. Once a match state is reached, the
96/// concatenation of inputs up until that point corresponds to a single key.
97///
98/// But why is it called a transducer instead of an automaton? A finite state
99/// transducer is just like a finite state automaton, except that it has output
100/// transitions in addition to input transitions. Namely, the value associated
101/// with any particular key is determined by summing the outputs along every
102/// input transition that leads to the key's corresponding match state.
103///
104/// This is best demonstrated with a couple images. First, let's ignore the
105/// "transducer" aspect and focus on a plain automaton.
106///
107/// Consider that your keys are abbreviations of some of the months in the
108/// Gregorian calendar:
109///
110/// ```plain
111/// jan
112/// feb
113/// mar
114/// may
115/// jun
116/// jul
117/// ```
118///
119/// The corresponding automaton that stores all of these as keys looks like
120/// this:
121///
122/// 
123///
124/// Notice here how the prefix and suffix of `jan` and `jun` are shared.
125/// Similarly, the prefixes of `jun` and `jul` are shared and the prefixes
126/// of `mar` and `may` are shared.
127///
128/// All of the keys from this automaton can be enumerated in lexicographic
129/// order by following every transition from each node in lexicographic
130/// order. Since it is acyclic, the procedure will terminate.
131///
132/// A key can be found by tracing it through the transitions in the automaton.
133/// For example, the key `aug` is known not to be in the automaton by only
134/// visiting the root state (because there is no `a` transition). For another
135/// example, the key `jax` is known not to be in the set only after moving
136/// through the transitions for `j` and `a`. Namely, after those transitions
137/// are followed, there are no transitions for `x`.
138///
139/// Notice here that looking up a key is proportional the length of the key
140/// itself. Namely, lookup time is not affected by the number of keys in the
141/// automaton!
142///
143/// Additionally, notice that the automaton exploits the fact that many keys
144/// share common prefixes and suffixes. For example, `jun` and `jul` are
145/// represented with no more states than would be required to represent either
146/// one on its own. Instead, the only change is a single extra transition. This
147/// is a form of compression and is key to how the automatons produced by this
148/// crate are so small.
149///
150/// Let's move on to finite state transducers. Consider the same set of keys
151/// as above, but let's assign their numeric month values:
152///
153/// ```plain
154/// jan,1
155/// feb,2
156/// mar,3
157/// may,5
158/// jun,6
159/// jul,7
160/// ```
161///
162/// The corresponding transducer looks very similar to the automaton above,
163/// except outputs have been added to some of the transitions:
164///
165/// 
166///
167/// All of the operations with a transducer are the same as described above
168/// for automatons. Additionally, the same compression techniques are used:
169/// common prefixes and suffixes in keys are exploited.
170///
171/// The key difference is that some transitions have been given an output.
172/// As one follows input transitions, one must sum the outputs as they
173/// are seen. (A transition with no output represents the additive identity,
174/// or `0` in this case.) For example, when looking up `feb`, the transition
175/// `f` has output `2`, the transition `e` has output `0`, and the transition
176/// `b` also has output `0`. The sum of these is `2`, which is exactly the
177/// value we associated with `feb`.
178///
179/// For another more interesting example, consider `jul`. The `j` transition
180/// has output `1`, the `u` transition has output `5` and the `l` transition
181/// has output `1`. Summing these together gets us `7`, which is again the
182/// correct value associated with `jul`. Notice that if we instead looked up
183/// the `jun` key, then the `n` transition would be followed instead of the
184/// `l` transition, which has no output. Therefore, the `jun` key equals
185/// `1+5+0=6`.
186///
187/// The trick to transducers is that there exists a unique path through the
188/// transducer for every key, and its outputs are stored appropriately along
189/// this path such that the correct value is returned when they are all summed
190/// together. This process also enables the data that makes up each value to be
191/// shared across many values in the transducer in exactly the same way that
192/// keys are shared. This is yet another form of compression!
193///
194/// # Bonus: a billion strings
195///
196/// The amount of compression one can get from automata can be absolutely
197/// ridiuclous. Consider the particular case of storing all billion strings
198/// in the range `0000000001-1000000000`, e.g.,
199///
200/// ```plain
201/// 0000000001
202/// 0000000002
203/// ...
204/// 0000000100
205/// 0000000101
206/// ...
207/// 0999999999
208/// 1000000000
209/// ```
210///
211/// The corresponding automaton looks like this:
212///
213/// 
214///
215/// Indeed, the on disk size of this automaton is a mere **251 bytes**.
216///
217/// Of course, this is a bit of a pathological best case, but it does serve
218/// to show how good compression can be in the optimal case.
219///
220/// Also, check out the
221/// [corresponding transducer](https://burntsushi.net/stuff/one-billion-map.svg)
222/// that maps each string to its integer value. It's a bit bigger, but still
223/// only takes up **896 bytes** of space on disk. This demonstrates that
224/// output values are also compressible.
225///
226/// # Does this crate produce minimal transducers?
227///
228/// For any non-trivial sized set of keys, it is unlikely that this crate will
229/// produce a minimal transducer. As far as this author knows, guaranteeing a
230/// minimal transducer requires working memory proportional to the number of
231/// states. This can be quite costly and is anathema to the main design goal of
232/// this crate: provide the ability to work with gigantic sets of strings with
233/// constant memory overhead.
234///
235/// Instead, construction of a finite state transducer uses a cache of
236/// states. More frequently used states are cached and reused, which provides
237/// reasonably good compression ratios. (No comprehensive benchmarks exist to
238/// back up this claim.)
239///
240/// It is possible that this crate may expose a way to guarantee minimal
241/// construction of transducers at the expense of exorbitant memory
242/// requirements.
243///
244/// # Bibliography
245///
246/// I initially got the idea to use finite state tranducers to represent
247/// ordered sets/maps from
248/// [Michael
249/// McCandless'](http://blog.mikemccandless.com/2010/12/using-finite-state-transducers-in.html)
250/// work on incorporating transducers in Lucene.
251///
252/// However, my work would also not have been possible without the hard work
253/// of many academics, especially
254/// [Jan Daciuk](http://galaxy.eti.pg.gda.pl/katedry/kiw/pracownicy/Jan.Daciuk/personal/).
255///
256/// * [Incremental construction of minimal acyclic finite-state automata](https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601)
257/// (Section 3 provides a decent overview of the algorithm used to construct
258/// transducers in this crate, assuming all outputs are `0`.)
259/// * [Direct Construction of Minimal Acyclic Subsequential Transducers](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.3698&rep=rep1&type=pdf)
260/// (The whole thing. The proof is dense but illuminating. The algorithm at
261/// the end is the money shot, namely, it incorporates output values.)
262/// * [Experiments with Automata Compression](https://www.researchgate.net/profile/Jiri-Dvorsky/publication/221568039_Word_Random_Access_Compression/links/0c96052c095630d5b3000000/Word-Random-Access-Compression.pdf#page=116), [Smaller Representation of Finite State Automata](https://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf)
263/// (various compression techniques for representing states/transitions)
264/// * [Jan Daciuk's dissertation](http://www.pg.gda.pl/~jandac/thesis.ps.gz)
265/// (excellent for in depth overview)
266/// * [Comparison of Construction Algorithms for Minimal, Acyclic, Deterministic, Finite-State Automata from Sets of Strings](https://www.cs.mun.ca/~harold/Courses/Old/CS4750/Diary/q3p2qx4lv71m5vew.pdf)
267/// (excellent for surface level overview)
268#[derive(Clone)]
269pub struct Fst<D> {
270 meta: Meta,
271 data: D,
272}
273
274#[derive(Debug, Clone)]
275struct Meta {
276 version: u64,
277 root_addr: CompiledAddr,
278 ty: FstType,
279 len: usize,
280 /// A checksum is missing when the FST version is <= 2. (Checksums were
281 /// added in version 3.)
282 checksum: Option<u32>,
283}
284
285impl Fst<Vec<u8>> {
286 /// Create a new FST from an iterator of lexicographically ordered byte
287 /// strings. Every key's value is set to `0`.
288 ///
289 /// If the iterator does not yield values in lexicographic order, then an
290 /// error is returned.
291 ///
292 /// Note that this is a convenience function to build an FST in memory.
293 /// To build an FST that streams to an arbitrary `io::Write`, use
294 /// `raw::Builder`.
295 pub fn from_iter_set<K, I>(iter: I) -> Result<Fst<Vec<u8>>>
296 where
297 K: AsRef<[u8]>,
298 I: IntoIterator<Item = K>,
299 {
300 let mut builder = Builder::memory();
301 for k in iter {
302 builder.add(k)?;
303 }
304 Ok(builder.into_fst())
305 }
306
307 /// Create a new FST from an iterator of lexicographically ordered byte
308 /// strings. The iterator should consist of tuples, where the first element
309 /// is the byte string and the second element is its corresponding value.
310 ///
311 /// If the iterator does not yield unique keys in lexicographic order, then
312 /// an error is returned.
313 ///
314 /// Note that this is a convenience function to build an FST in memory.
315 /// To build an FST that streams to an arbitrary `io::Write`, use
316 /// `raw::Builder`.
317 pub fn from_iter_map<K, I>(iter: I) -> Result<Fst<Vec<u8>>>
318 where
319 K: AsRef<[u8]>,
320 I: IntoIterator<Item = (K, u64)>,
321 {
322 let mut builder = Builder::memory();
323 for (k, v) in iter {
324 builder.insert(k, v)?;
325 }
326 Ok(builder.into_fst())
327 }
328}
329
330impl<D: AsRef<[u8]>> Fst<D> {
331 /// Creates a transducer from its representation as a raw byte sequence.
332 ///
333 /// This operation is intentionally very cheap (no allocations and no
334 /// copies). In particular, no verification on the integrity of the
335 /// FST is performed. Callers may opt into integrity checks via the
336 /// [`Fst::verify`](struct.Fst.html#method.verify) method.
337 ///
338 /// The fst must have been written with a compatible finite state
339 /// transducer builder (`Builder` qualifies). If the format is invalid or
340 /// if there is a mismatch between the API version of this library and the
341 /// fst, then an error is returned.
342 #[inline]
343 pub fn new(data: D) -> Result<Fst<D>> {
344 let bytes = data.as_ref();
345 if bytes.len() < 36 {
346 return Err(Error::Format { size: bytes.len() }.into());
347 }
348 // The read_u64 unwraps below are OK because they can never fail.
349 // They can only fail when there is an IO error or if there is an
350 // unexpected EOF. However, we are reading from a byte slice (no
351 // IO errors possible) and we've confirmed the byte slice is at least
352 // N bytes (no unexpected EOF).
353 let version = bytes::read_u64_le(&bytes);
354 if version == 0 || version > VERSION {
355 return Err(
356 Error::Version { expected: VERSION, got: version }.into()
357 );
358 }
359 let ty = bytes::read_u64_le(&bytes[8..]);
360
361 let (end, checksum) = if version <= 2 {
362 (bytes.len(), None)
363 } else {
364 let checksum = bytes::read_u32_le(&bytes[bytes.len() - 4..]);
365 (bytes.len() - 4, Some(checksum))
366 };
367 let root_addr = {
368 let last = &bytes[end - 8..];
369 u64_to_usize(bytes::read_u64_le(last))
370 };
371 let len = {
372 let last2 = &bytes[end - 16..];
373 u64_to_usize(bytes::read_u64_le(last2))
374 };
375 // The root node is always the last node written, so its address should
376 // be near the end. After the root node is written, we still have to
377 // write the root *address* and the number of keys in the FST, along
378 // with the checksum. That's 20 bytes. The extra byte used below (21
379 // and not 20) comes from the fact that the root address points to
380 // the last byte in the root node, rather than the byte immediately
381 // following the root node.
382 //
383 // If this check passes, it is still possible that the FST is invalid
384 // but probably unlikely. If this check reports a false positive, then
385 // the program will probably panic. In the worst case, the FST will
386 // operate but be subtly wrong. (This would require the bytes to be in
387 // a format expected by an FST, which is incredibly unlikely.)
388 //
389 // The special check for EMPTY_ADDRESS is needed since an empty FST
390 // has a root node that is empty and final, which means it has the
391 // special address `0`. In that case, the FST is the smallest it can
392 // be: the version, type, root address and number of nodes. That's
393 // 36 bytes (8 byte u64 each).
394 //
395 // And finally, our calculation changes somewhat based on version.
396 // If the FST version is less than 3, then it does not have a checksum.
397 let (empty_total, addr_offset) =
398 if version <= 2 { (32, 17) } else { (36, 21) };
399 if (root_addr == EMPTY_ADDRESS && bytes.len() != empty_total)
400 && root_addr + addr_offset != bytes.len()
401 {
402 return Err(Error::Format { size: bytes.len() }.into());
403 }
404 let meta = Meta { version, root_addr, ty, len, checksum };
405 Ok(Fst { meta, data })
406 }
407
408 /// Retrieves the value associated with a key.
409 ///
410 /// If the key does not exist, then `None` is returned.
411 #[inline]
412 pub fn get<B: AsRef<[u8]>>(&self, key: B) -> Option<Output> {
413 self.as_ref().get(key.as_ref())
414 }
415
416 /// Returns true if and only if the given key is in this FST.
417 #[inline]
418 pub fn contains_key<B: AsRef<[u8]>>(&self, key: B) -> bool {
419 self.as_ref().contains_key(key.as_ref())
420 }
421
422 /// Retrieves the key associated with the given value.
423 ///
424 /// This is like `get_key_into`, but will return the key itself without
425 /// allowing the caller to reuse an allocation.
426 ///
427 /// If the given value does not exist, then `None` is returned.
428 ///
429 /// The values in this FST are not monotonically increasing when sorted
430 /// lexicographically by key, then this routine has unspecified behavior.
431 #[inline]
432 pub fn get_key(&self, value: u64) -> Option<Vec<u8>> {
433 let mut key = vec![];
434 if self.get_key_into(value, &mut key) {
435 Some(key)
436 } else {
437 None
438 }
439 }
440
441 /// Retrieves the key associated with the given value.
442 ///
443 /// If the given value does not exist, then `false` is returned. In this
444 /// case, the contents of `key` are unspecified.
445 ///
446 /// The given buffer is not clearer before the key is written to it.
447 ///
448 /// The values in this FST are not monotonically increasing when sorted
449 /// lexicographically by key, then this routine has unspecified behavior.
450 #[inline]
451 pub fn get_key_into(&self, value: u64, key: &mut Vec<u8>) -> bool {
452 self.as_ref().get_key_into(value, key)
453 }
454
455 /// Return a lexicographically ordered stream of all key-value pairs in
456 /// this fst.
457 #[inline]
458 pub fn stream(&self) -> Stream<'_> {
459 StreamBuilder::new(self.as_ref(), AlwaysMatch).into_stream()
460 }
461
462 /// Return a builder for range queries.
463 ///
464 /// A range query returns a subset of key-value pairs in this fst in a
465 /// range given in lexicographic order.
466 #[inline]
467 pub fn range(&self) -> StreamBuilder<'_> {
468 StreamBuilder::new(self.as_ref(), AlwaysMatch)
469 }
470
471 /// Executes an automaton on the keys of this FST.
472 #[inline]
473 pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<'_, A> {
474 StreamBuilder::new(self.as_ref(), aut)
475 }
476
477 /// Executes an automaton on the keys of this FST and yields matching
478 /// keys along with the corresponding matching states in the given
479 /// automaton.
480 #[inline]
481 pub fn search_with_state<A: Automaton>(
482 &self,
483 aut: A,
484 ) -> StreamWithStateBuilder<'_, A> {
485 StreamWithStateBuilder::new(self.as_ref(), aut)
486 }
487
488 /// Returns the number of keys in this fst.
489 #[inline]
490 pub fn len(&self) -> usize {
491 self.as_ref().len()
492 }
493
494 /// Returns true if and only if this fst has no keys.
495 #[inline]
496 pub fn is_empty(&self) -> bool {
497 self.as_ref().is_empty()
498 }
499
500 /// Returns the number of bytes used by this fst.
501 #[inline]
502 pub fn size(&self) -> usize {
503 self.as_ref().size()
504 }
505
506 /// Attempts to verify this FST by computing its checksum.
507 ///
508 /// This will scan over all of the bytes in the underlying FST, so this
509 /// may be an expensive operation depending on the size of the FST.
510 ///
511 /// This returns an error in two cases:
512 ///
513 /// 1. When a checksum does not exist, which is the case for FSTs that were
514 /// produced by the `fst` crate before version `0.4`.
515 /// 2. When the checksum in the FST does not match the computed checksum
516 /// performed by this procedure.
517 #[inline]
518 pub fn verify(&self) -> Result<()> {
519 use crate::raw::crc32::CheckSummer;
520
521 let expected = match self.as_ref().meta.checksum {
522 None => return Err(Error::ChecksumMissing.into()),
523 Some(expected) => expected,
524 };
525 let mut summer = CheckSummer::new();
526 summer.update(&self.as_bytes()[..self.as_bytes().len() - 4]);
527 let got = summer.masked();
528 if expected == got {
529 return Ok(());
530 }
531 Err(Error::ChecksumMismatch { expected, got }.into())
532 }
533
534 /// Creates a new fst operation with this fst added to it.
535 ///
536 /// The `OpBuilder` type can be used to add additional fst streams
537 /// and perform set operations like union, intersection, difference and
538 /// symmetric difference on the keys of the fst. These set operations also
539 /// allow one to specify how conflicting values are merged in the stream.
540 #[inline]
541 pub fn op(&self) -> OpBuilder<'_> {
542 OpBuilder::new().add(self)
543 }
544
545 /// Returns true if and only if the `self` fst is disjoint with the fst
546 /// `stream`.
547 ///
548 /// `stream` must be a lexicographically ordered sequence of byte strings
549 /// with associated values.
550 #[inline]
551 pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool
552 where
553 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
554 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
555 {
556 self.op().add(stream).intersection().next().is_none()
557 }
558
559 /// Returns true if and only if the `self` fst is a subset of the fst
560 /// `stream`.
561 ///
562 /// `stream` must be a lexicographically ordered sequence of byte strings
563 /// with associated values.
564 #[inline]
565 pub fn is_subset<'f, I, S>(&self, stream: I) -> bool
566 where
567 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
568 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
569 {
570 let mut op = self.op().add(stream).intersection();
571 let mut count = 0;
572 while let Some(_) = op.next() {
573 count += 1;
574 }
575 count == self.len()
576 }
577
578 /// Returns true if and only if the `self` fst is a superset of the fst
579 /// `stream`.
580 ///
581 /// `stream` must be a lexicographically ordered sequence of byte strings
582 /// with associated values.
583 #[inline]
584 pub fn is_superset<'f, I, S>(&self, stream: I) -> bool
585 where
586 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
587 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
588 {
589 let mut op = self.op().add(stream).union();
590 let mut count = 0;
591 while let Some(_) = op.next() {
592 count += 1;
593 }
594 count == self.len()
595 }
596
597 /// Returns the underlying type of this fst.
598 ///
599 /// FstType is a convention used to indicate the type of the underlying
600 /// transducer.
601 ///
602 /// This crate reserves the range 0-255 (inclusive) but currently leaves
603 /// the meaning of 0-255 unspecified.
604 #[inline]
605 pub fn fst_type(&self) -> FstType {
606 self.as_ref().fst_type()
607 }
608
609 /// Returns the root node of this fst.
610 #[inline]
611 pub fn root(&self) -> Node<'_> {
612 self.as_ref().root()
613 }
614
615 /// Returns the node at the given address.
616 ///
617 /// Node addresses can be obtained by reading transitions on `Node` values.
618 #[inline]
619 pub fn node(&self, addr: CompiledAddr) -> Node<'_> {
620 self.as_ref().node(addr)
621 }
622
623 /// Returns a copy of the binary contents of this FST.
624 #[inline]
625 pub fn to_vec(&self) -> Vec<u8> {
626 self.as_ref().to_vec()
627 }
628
629 /// Returns the binary contents of this FST.
630 #[inline]
631 pub fn as_bytes(&self) -> &[u8] {
632 self.as_ref().as_bytes()
633 }
634
635 #[inline]
636 fn as_ref(&self) -> FstRef {
637 FstRef { meta: &self.meta, data: self.data.as_ref() }
638 }
639}
640
641impl<D> Fst<D> {
642 /// Returns the underlying data which constitutes the FST itself.
643 #[inline]
644 pub fn into_inner(self) -> D {
645 self.data
646 }
647
648 /// Returns a borrow to the underlying data which constitutes the FST itself.
649 #[inline]
650 pub fn as_inner(&self) -> &D {
651 &self.data
652 }
653
654 /// Maps the underlying data of the fst to another data type.
655 #[inline]
656 pub fn map_data<F, T>(self, mut f: F) -> Result<Fst<T>>
657 where
658 F: FnMut(D) -> T,
659 T: AsRef<[u8]>,
660 {
661 Fst::new(f(self.into_inner()))
662 }
663}
664
665impl<'a, 'f, D: AsRef<[u8]>> IntoStreamer<'a> for &'f Fst<D> {
666 type Item = (&'a [u8], Output);
667 type Into = Stream<'f>;
668
669 #[inline]
670 fn into_stream(self) -> Stream<'f> {
671 StreamBuilder::new(self.as_ref(), AlwaysMatch).into_stream()
672 }
673}
674
675struct FstRef<'f> {
676 meta: &'f Meta,
677 data: &'f [u8],
678}
679
680impl<'f> FstRef<'f> {
681 #[inline]
682 fn get(&self, key: &[u8]) -> Option<Output> {
683 let mut node = self.root();
684 let mut out = Output::zero();
685 for &b in key {
686 node = match node.find_input(b) {
687 None => return None,
688 Some(i) => {
689 let t = node.transition(i);
690 out = out.cat(t.out);
691 self.node(t.addr)
692 }
693 }
694 }
695 if !node.is_final() {
696 None
697 } else {
698 Some(out.cat(node.final_output()))
699 }
700 }
701
702 #[inline]
703 fn contains_key(&self, key: &[u8]) -> bool {
704 let mut node = self.root();
705 for &b in key {
706 node = match node.find_input(b) {
707 None => return false,
708 Some(i) => self.node(node.transition_addr(i)),
709 }
710 }
711 node.is_final()
712 }
713
714 #[inline]
715 fn get_key_into(&self, mut value: u64, key: &mut Vec<u8>) -> bool {
716 let mut node = self.root();
717 while value != 0 || !node.is_final() {
718 let trans = node
719 .transitions()
720 .take_while(|t| t.out.value() <= value)
721 .last();
722 node = match trans {
723 None => return false,
724 Some(t) => {
725 value -= t.out.value();
726 key.push(t.inp);
727 self.node(t.addr)
728 }
729 };
730 }
731 true
732 }
733
734 #[inline]
735 fn len(&self) -> usize {
736 self.meta.len
737 }
738
739 #[inline]
740 fn is_empty(&self) -> bool {
741 self.meta.len == 0
742 }
743
744 #[inline]
745 fn size(&self) -> usize {
746 self.as_bytes().len()
747 }
748
749 #[inline]
750 fn fst_type(&self) -> FstType {
751 self.meta.ty
752 }
753
754 #[inline]
755 fn root_addr(&self) -> CompiledAddr {
756 self.meta.root_addr
757 }
758
759 #[inline]
760 fn root(&self) -> Node<'f> {
761 self.node(self.root_addr())
762 }
763
764 #[inline]
765 fn node(&self, addr: CompiledAddr) -> Node<'f> {
766 Node::new(self.meta.version, addr, self.as_bytes())
767 }
768
769 #[inline]
770 fn to_vec(&self) -> Vec<u8> {
771 self.as_bytes().to_vec()
772 }
773
774 #[inline]
775 fn as_bytes(&self) -> &'f [u8] {
776 self.data
777 }
778
779 #[inline]
780 fn empty_final_output(&self) -> Option<Output> {
781 let root = self.root();
782 if root.is_final() {
783 Some(root.final_output())
784 } else {
785 None
786 }
787 }
788}
789
790/// A builder for constructing range queries on streams.
791///
792/// Once all bounds are set, one should call `into_stream` to get a
793/// `Stream`.
794///
795/// Bounds are not additive. That is, if `ge` is called twice on the same
796/// builder, then the second setting wins.
797///
798/// The `A` type parameter corresponds to an optional automaton to filter
799/// the stream. By default, no filtering is done.
800///
801/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
802pub struct StreamBuilder<'f, A = AlwaysMatch> {
803 fst: FstRef<'f>,
804 aut: A,
805 min: Bound,
806 max: Bound,
807}
808
809impl<'f, A: Automaton> StreamBuilder<'f, A> {
810 fn new(fst: FstRef<'f>, aut: A) -> StreamBuilder<'f, A> {
811 StreamBuilder {
812 fst,
813 aut,
814 min: Bound::Unbounded,
815 max: Bound::Unbounded,
816 }
817 }
818
819 /// Specify a greater-than-or-equal-to bound.
820 pub fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> {
821 self.min = Bound::Included(bound.as_ref().to_owned());
822 self
823 }
824
825 /// Specify a greater-than bound.
826 pub fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> {
827 self.min = Bound::Excluded(bound.as_ref().to_owned());
828 self
829 }
830
831 /// Specify a less-than-or-equal-to bound.
832 pub fn le<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> {
833 self.max = Bound::Included(bound.as_ref().to_owned());
834 self
835 }
836
837 /// Specify a less-than bound.
838 pub fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> {
839 self.max = Bound::Excluded(bound.as_ref().to_owned());
840 self
841 }
842}
843
844impl<'a, 'f, A: Automaton> IntoStreamer<'a> for StreamBuilder<'f, A> {
845 type Item = (&'a [u8], Output);
846 type Into = Stream<'f, A>;
847
848 fn into_stream(self) -> Stream<'f, A> {
849 Stream::new(self.fst, self.aut, self.min, self.max)
850 }
851}
852
853/// A builder for constructing range queries on streams that include automaton
854/// states.
855///
856/// In general, one should use `StreamBuilder` unless you have a specific need
857/// for accessing the states of the underlying automaton that is being used to
858/// filter this stream.
859///
860/// Once all bounds are set, one should call `into_stream` to get a
861/// `Stream`.
862///
863/// Bounds are not additive. That is, if `ge` is called twice on the same
864/// builder, then the second setting wins.
865///
866/// The `A` type parameter corresponds to an optional automaton to filter
867/// the stream. By default, no filtering is done.
868///
869/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
870pub struct StreamWithStateBuilder<'f, A = AlwaysMatch> {
871 fst: FstRef<'f>,
872 aut: A,
873 min: Bound,
874 max: Bound,
875}
876
877impl<'f, A: Automaton> StreamWithStateBuilder<'f, A> {
878 fn new(fst: FstRef<'f>, aut: A) -> StreamWithStateBuilder<'f, A> {
879 StreamWithStateBuilder {
880 fst,
881 aut,
882 min: Bound::Unbounded,
883 max: Bound::Unbounded,
884 }
885 }
886
887 /// Specify a greater-than-or-equal-to bound.
888 pub fn ge<T: AsRef<[u8]>>(
889 mut self,
890 bound: T,
891 ) -> StreamWithStateBuilder<'f, A> {
892 self.min = Bound::Included(bound.as_ref().to_owned());
893 self
894 }
895
896 /// Specify a greater-than bound.
897 pub fn gt<T: AsRef<[u8]>>(
898 mut self,
899 bound: T,
900 ) -> StreamWithStateBuilder<'f, A> {
901 self.min = Bound::Excluded(bound.as_ref().to_owned());
902 self
903 }
904
905 /// Specify a less-than-or-equal-to bound.
906 pub fn le<T: AsRef<[u8]>>(
907 mut self,
908 bound: T,
909 ) -> StreamWithStateBuilder<'f, A> {
910 self.max = Bound::Included(bound.as_ref().to_owned());
911 self
912 }
913
914 /// Specify a less-than bound.
915 pub fn lt<T: AsRef<[u8]>>(
916 mut self,
917 bound: T,
918 ) -> StreamWithStateBuilder<'f, A> {
919 self.max = Bound::Excluded(bound.as_ref().to_owned());
920 self
921 }
922}
923
924impl<'a, 'f, A: 'a + Automaton> IntoStreamer<'a>
925 for StreamWithStateBuilder<'f, A>
926where
927 A::State: Clone,
928{
929 type Item = (&'a [u8], Output, A::State);
930 type Into = StreamWithState<'f, A>;
931
932 fn into_stream(self) -> StreamWithState<'f, A> {
933 StreamWithState::new(self.fst, self.aut, self.min, self.max)
934 }
935}
936
937#[derive(Debug)]
938enum Bound {
939 Included(Vec<u8>),
940 Excluded(Vec<u8>),
941 Unbounded,
942}
943
944impl Bound {
945 #[inline]
946 fn exceeded_by(&self, inp: &[u8]) -> bool {
947 match *self {
948 Bound::Included(ref v) => inp > v,
949 Bound::Excluded(ref v) => inp >= v,
950 Bound::Unbounded => false,
951 }
952 }
953
954 #[inline]
955 fn is_empty(&self) -> bool {
956 match *self {
957 Bound::Included(ref v) => v.is_empty(),
958 Bound::Excluded(ref v) => v.is_empty(),
959 Bound::Unbounded => true,
960 }
961 }
962
963 #[inline]
964 fn is_inclusive(&self) -> bool {
965 match *self {
966 Bound::Excluded(_) => false,
967 _ => true,
968 }
969 }
970}
971
972/// A lexicographically ordered stream of key-value pairs from an fst.
973///
974/// The `A` type parameter corresponds to an optional automaton to filter
975/// the stream. By default, no filtering is done.
976///
977/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
978pub struct Stream<'f, A: Automaton = AlwaysMatch>(StreamWithState<'f, A>);
979
980impl<'f, A: Automaton> Stream<'f, A> {
981 fn new(fst: FstRef<'f>, aut: A, min: Bound, max: Bound) -> Stream<'f, A> {
982 Stream(StreamWithState::new(fst, aut, min, max))
983 }
984
985 /// Convert this stream into a vector of byte strings and outputs.
986 ///
987 /// Note that this creates a new allocation for every key in the stream.
988 pub fn into_byte_vec(mut self) -> Vec<(Vec<u8>, u64)> {
989 let mut vs = vec![];
990 while let Some((k, v)) = self.next() {
991 vs.push((k.to_vec(), v.value()));
992 }
993 vs
994 }
995
996 /// Convert this stream into a vector of Unicode strings and outputs.
997 ///
998 /// If any key is not valid UTF-8, then iteration on the stream is stopped
999 /// and a UTF-8 decoding error is returned.
1000 ///
1001 /// Note that this creates a new allocation for every key in the stream.
1002 pub fn into_str_vec(mut self) -> Result<Vec<(String, u64)>> {
1003 let mut vs = vec![];
1004 while let Some((k, v)) = self.next() {
1005 let k = String::from_utf8(k.to_vec()).map_err(Error::from)?;
1006 vs.push((k, v.value()));
1007 }
1008 Ok(vs)
1009 }
1010
1011 /// Convert this stream into a vector of byte strings.
1012 ///
1013 /// Note that this creates a new allocation for every key in the stream.
1014 pub fn into_byte_keys(mut self) -> Vec<Vec<u8>> {
1015 let mut vs = vec![];
1016 while let Some((k, _)) = self.next() {
1017 vs.push(k.to_vec());
1018 }
1019 vs
1020 }
1021
1022 /// Convert this stream into a vector of Unicode strings.
1023 ///
1024 /// If any key is not valid UTF-8, then iteration on the stream is stopped
1025 /// and a UTF-8 decoding error is returned.
1026 ///
1027 /// Note that this creates a new allocation for every key in the stream.
1028 pub fn into_str_keys(mut self) -> Result<Vec<String>> {
1029 let mut vs = vec![];
1030 while let Some((k, _)) = self.next() {
1031 let k = String::from_utf8(k.to_vec()).map_err(Error::from)?;
1032 vs.push(k);
1033 }
1034 Ok(vs)
1035 }
1036
1037 /// Convert this stream into a vector of outputs.
1038 pub fn into_values(mut self) -> Vec<u64> {
1039 let mut vs = vec![];
1040 while let Some((_, v)) = self.next() {
1041 vs.push(v.value());
1042 }
1043 vs
1044 }
1045}
1046
1047impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> {
1048 type Item = (&'a [u8], Output);
1049
1050 fn next(&'a mut self) -> Option<(&'a [u8], Output)> {
1051 self.0.next_with(|_| ()).map(|(key, out, _)| (key, out))
1052 }
1053}
1054
1055/// A lexicographically ordered stream of key-value-state triples from an fst
1056/// and an automaton.
1057///
1058/// The key-values are from the underyling FSTP while the states are from the
1059/// automaton.
1060///
1061/// The `A` type parameter corresponds to an optional automaton to filter
1062/// the stream. By default, no filtering is done.
1063///
1064/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1065pub struct StreamWithState<'f, A = AlwaysMatch>
1066where
1067 A: Automaton,
1068{
1069 fst: FstRef<'f>,
1070 aut: A,
1071 inp: Vec<u8>,
1072 empty_output: Option<Output>,
1073 stack: Vec<StreamState<'f, A::State>>,
1074 end_at: Bound,
1075}
1076
1077#[derive(Clone, Debug)]
1078struct StreamState<'f, S> {
1079 node: Node<'f>,
1080 trans: usize,
1081 out: Output,
1082 aut_state: S,
1083}
1084
1085impl<'f, A: Automaton> StreamWithState<'f, A> {
1086 fn new(
1087 fst: FstRef<'f>,
1088 aut: A,
1089 min: Bound,
1090 max: Bound,
1091 ) -> StreamWithState<'f, A> {
1092 let mut rdr = StreamWithState {
1093 fst,
1094 aut,
1095 inp: Vec::with_capacity(16),
1096 empty_output: None,
1097 stack: vec![],
1098 end_at: max,
1099 };
1100 rdr.seek_min(min);
1101 rdr
1102 }
1103
1104 /// Seeks the underlying stream such that the next key to be read is the
1105 /// smallest key in the underlying fst that satisfies the given minimum
1106 /// bound.
1107 ///
1108 /// This theoretically should be straight-forward, but we need to make
1109 /// sure our stack is correct, which includes accounting for automaton
1110 /// states.
1111 fn seek_min(&mut self, min: Bound) {
1112 if min.is_empty() {
1113 if min.is_inclusive() {
1114 self.empty_output = self.fst.empty_final_output();
1115 }
1116 self.stack = vec![StreamState {
1117 node: self.fst.root(),
1118 trans: 0,
1119 out: Output::zero(),
1120 aut_state: self.aut.start(),
1121 }];
1122 return;
1123 }
1124 let (key, inclusive) = match min {
1125 Bound::Excluded(ref min) => (min, false),
1126 Bound::Included(ref min) => (min, true),
1127 Bound::Unbounded => unreachable!(),
1128 };
1129 // At this point, we need to find the starting location of `min` in
1130 // the FST. However, as we search, we need to maintain a stack of
1131 // reader states so that the reader can pick up where we left off.
1132 // N.B. We do not necessarily need to stop in a final state, unlike
1133 // the one-off `find` method. For the example, the given bound might
1134 // not actually exist in the FST.
1135 let mut node = self.fst.root();
1136 let mut out = Output::zero();
1137 let mut aut_state = self.aut.start();
1138 for &b in key {
1139 match node.find_input(b) {
1140 Some(i) => {
1141 let t = node.transition(i);
1142 let prev_state = aut_state;
1143 aut_state = self.aut.accept(&prev_state, b);
1144 self.inp.push(b);
1145 self.stack.push(StreamState {
1146 node,
1147 trans: i + 1,
1148 out,
1149 aut_state: prev_state,
1150 });
1151 out = out.cat(t.out);
1152 node = self.fst.node(t.addr);
1153 }
1154 None => {
1155 // This is a little tricky. We're in this case if the
1156 // given bound is not a prefix of any key in the FST.
1157 // Since this is a minimum bound, we need to find the
1158 // first transition in this node that proceeds the current
1159 // input byte.
1160 self.stack.push(StreamState {
1161 node,
1162 trans: node
1163 .transitions()
1164 .position(|t| t.inp > b)
1165 .unwrap_or(node.len()),
1166 out,
1167 aut_state,
1168 });
1169 return;
1170 }
1171 }
1172 }
1173 if !self.stack.is_empty() {
1174 let last = self.stack.len() - 1;
1175 if inclusive {
1176 self.stack[last].trans -= 1;
1177 self.inp.pop();
1178 } else {
1179 let node = self.stack[last].node;
1180 let trans = self.stack[last].trans;
1181 self.stack.push(StreamState {
1182 node: self.fst.node(node.transition(trans - 1).addr),
1183 trans: 0,
1184 out,
1185 aut_state,
1186 });
1187 }
1188 }
1189 }
1190
1191 fn next_with<T>(
1192 &mut self,
1193 mut map: impl FnMut(&A::State) -> T,
1194 ) -> Option<(&[u8], Output, T)> {
1195 if let Some(out) = self.empty_output.take() {
1196 if self.end_at.exceeded_by(&[]) {
1197 self.stack.clear();
1198 return None;
1199 }
1200
1201 let start = self.aut.start();
1202 if self.aut.is_match(&start) {
1203 return Some((&[], out, map(&start)));
1204 }
1205 }
1206 while let Some(state) = self.stack.pop() {
1207 if state.trans >= state.node.len()
1208 || !self.aut.can_match(&state.aut_state)
1209 {
1210 if state.node.addr() != self.fst.root_addr() {
1211 self.inp.pop().unwrap();
1212 }
1213 continue;
1214 }
1215 let trans = state.node.transition(state.trans);
1216 let out = state.out.cat(trans.out);
1217 let next_state = self.aut.accept(&state.aut_state, trans.inp);
1218 let t = map(&next_state);
1219 let mut is_match = self.aut.is_match(&next_state);
1220 let next_node = self.fst.node(trans.addr);
1221 self.inp.push(trans.inp);
1222 if next_node.is_final() {
1223 if let Some(eof_state) = self.aut.accept_eof(&next_state) {
1224 is_match = self.aut.is_match(&eof_state);
1225 }
1226 }
1227 self.stack.push(StreamState { trans: state.trans + 1, ..state });
1228 self.stack.push(StreamState {
1229 node: next_node,
1230 trans: 0,
1231 out,
1232 aut_state: next_state,
1233 });
1234 if self.end_at.exceeded_by(&self.inp) {
1235 // We are done, forever.
1236 self.stack.clear();
1237 return None;
1238 }
1239 if next_node.is_final() && is_match {
1240 return Some((
1241 &self.inp,
1242 out.cat(next_node.final_output()),
1243 t,
1244 ));
1245 }
1246 }
1247 None
1248 }
1249}
1250
1251impl<'a, 'f, A: 'a + Automaton> Streamer<'a> for StreamWithState<'f, A>
1252where
1253 A::State: Clone,
1254{
1255 type Item = (&'a [u8], Output, A::State);
1256
1257 fn next(&'a mut self) -> Option<(&'a [u8], Output, A::State)> {
1258 self.next_with(|state| state.clone())
1259 }
1260}
1261
1262/// An output is a value that is associated with a key in a finite state
1263/// transducer.
1264///
1265/// Note that outputs must satisfy an algebra. Namely, it must have an additive
1266/// identity and the following binary operations defined: `prefix`,
1267/// `concatenation` and `subtraction`. `prefix` and `concatenation` are
1268/// commutative while `subtraction` is not. `subtraction` is only defined on
1269/// pairs of operands where the first operand is greater than or equal to the
1270/// second operand.
1271///
1272/// Currently, output values must be `u64`. However, in theory, an output value
1273/// can be anything that satisfies the above algebra. Future versions of this
1274/// crate may make outputs generic on this algebra.
1275#[derive(Copy, Clone, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)]
1276pub struct Output(u64);
1277
1278impl Output {
1279 /// Create a new output from a `u64`.
1280 #[inline]
1281 pub fn new(v: u64) -> Output {
1282 Output(v)
1283 }
1284
1285 /// Create a zero output.
1286 #[inline]
1287 pub fn zero() -> Output {
1288 Output(0)
1289 }
1290
1291 /// Retrieve the value inside this output.
1292 #[inline]
1293 pub fn value(self) -> u64 {
1294 self.0
1295 }
1296
1297 /// Returns true if this is a zero output.
1298 #[inline]
1299 pub fn is_zero(self) -> bool {
1300 self.0 == 0
1301 }
1302
1303 /// Returns the prefix of this output and `o`.
1304 #[inline]
1305 pub fn prefix(self, o: Output) -> Output {
1306 Output(cmp::min(self.0, o.0))
1307 }
1308
1309 /// Returns the concatenation of this output and `o`.
1310 #[inline]
1311 pub fn cat(self, o: Output) -> Output {
1312 Output(self.0 + o.0)
1313 }
1314
1315 /// Returns the subtraction of `o` from this output.
1316 ///
1317 /// This function panics if `self < o`.
1318 #[inline]
1319 pub fn sub(self, o: Output) -> Output {
1320 Output(
1321 self.0
1322 .checked_sub(o.0)
1323 .expect("BUG: underflow subtraction not allowed"),
1324 )
1325 }
1326}
1327
1328/// A transition from one note to another.
1329#[derive(Copy, Clone, Hash, Eq, PartialEq)]
1330pub struct Transition {
1331 /// The byte input associated with this transition.
1332 pub inp: u8,
1333 /// The output associated with this transition.
1334 pub out: Output,
1335 /// The address of the node that this transition points to.
1336 pub addr: CompiledAddr,
1337}
1338
1339impl Default for Transition {
1340 #[inline]
1341 fn default() -> Transition {
1342 Transition { inp: 0, out: Output::zero(), addr: NONE_ADDRESS }
1343 }
1344}
1345
1346impl fmt::Debug for Transition {
1347 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1348 if self.out.is_zero() {
1349 write!(f, "{} -> {}", self.inp as char, self.addr)
1350 } else {
1351 write!(
1352 f,
1353 "({}, {}) -> {}",
1354 self.inp as char,
1355 self.out.value(),
1356 self.addr
1357 )
1358 }
1359 }
1360}
1361
1362#[inline]
1363#[cfg(target_pointer_width = "64")]
1364fn u64_to_usize(n: u64) -> usize {
1365 n as usize
1366}
1367
1368#[inline]
1369#[cfg(not(target_pointer_width = "64"))]
1370fn u64_to_usize(n: u64) -> usize {
1371 if n > std::usize::MAX as u64 {
1372 panic!(
1373 "\
1374Cannot convert node address {} to a pointer sized variable. If this FST
1375is very large and was generated on a system with a larger pointer size
1376than this system, then it is not possible to read this FST on this
1377system.",
1378 n
1379 );
1380 }
1381 n as usize
1382}