zerotrie/
cursor.rs

Help
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! Types for walking stepwise through a trie.
//!
//! For examples, see the `.cursor()` functions
//! and the `Cursor` types in this module.

use crate::reader;
use crate::ZeroAsciiIgnoreCaseTrie;
use crate::ZeroTrieSimpleAscii;

use core::fmt;

impl<Store> ZeroTrieSimpleAscii<Store>
where
    Store: AsRef<[u8]> + ?Sized,
{
    /// Gets a cursor into the current trie.
    ///
    /// Useful to query a trie with data that is not a slice.
    ///
    /// This is currently supported only on [`ZeroTrieSimpleAscii`]
    /// and [`ZeroAsciiIgnoreCaseTrie`].
    ///
    /// # Examples
    ///
    /// Get a value out of a trie by [writing](fmt::Write) it to the cursor:
    ///
    /// ```
    /// use core::fmt::Write;
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// // Get out the value for "abc"
    /// let mut cursor = trie.cursor();
    /// write!(&mut cursor, "abc");
    /// assert_eq!(cursor.take_value(), Some(0));
    /// ```
    ///
    /// Find the longest prefix match:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// // Find the longest prefix of the string "abcdxy":
    /// let query = b"abcdxy";
    /// let mut longest_prefix = 0;
    /// let mut cursor = trie.cursor();
    /// for (i, b) in query.iter().enumerate() {
    ///     // Checking is_empty() is not required, but it is
    ///     // good for efficiency
    ///     if cursor.is_empty() {
    ///         break;
    ///     }
    ///     if cursor.take_value().is_some() {
    ///         longest_prefix = i;
    ///     }
    ///     cursor.step(*b);
    /// }
    ///
    /// // The longest prefix is "abc" which is length 3:
    /// assert_eq!(longest_prefix, 3);
    /// ```
    #[inline]
    pub fn cursor(&self) -> ZeroTrieSimpleAsciiCursor {
        ZeroTrieSimpleAsciiCursor {
            trie: self.as_borrowed_slice(),
        }
    }
}

impl<Store> ZeroAsciiIgnoreCaseTrie<Store>
where
    Store: AsRef<[u8]> + ?Sized,
{
    /// Gets a cursor into the current trie.
    ///
    /// Useful to query a trie with data that is not a slice.
    ///
    /// This is currently supported only on [`ZeroTrieSimpleAscii`]
    /// and [`ZeroAsciiIgnoreCaseTrie`].
    ///
    /// # Examples
    ///
    /// Get a value out of a trie by [writing](fmt::Write) it to the cursor:
    ///
    /// ```
    /// use core::fmt::Write;
    /// use zerotrie::ZeroAsciiIgnoreCaseTrie;
    ///
    /// // A trie with two values: "aBc" and "aBcdEf"
    /// let trie = ZeroAsciiIgnoreCaseTrie::from_bytes(b"aBc\x80dEf\x81");
    ///
    /// // Get out the value for "abc" (case-insensitive!)
    /// let mut cursor = trie.cursor();
    /// write!(&mut cursor, "abc");
    /// assert_eq!(cursor.take_value(), Some(0));
    /// ```
    ///
    /// For more examples, see [`ZeroTrieSimpleAscii::cursor`].
    #[inline]
    pub fn cursor(&self) -> ZeroAsciiIgnoreCaseTrieCursor {
        ZeroAsciiIgnoreCaseTrieCursor {
            trie: self.as_borrowed_slice(),
        }
    }
}

impl<'a> ZeroTrieSimpleAscii<&'a [u8]> {
    /// Same as [`ZeroTrieSimpleAscii::cursor()`] but moves self to avoid
    /// having to doubly anchor the trie to the stack.
    #[inline]
    pub fn into_cursor(self) -> ZeroTrieSimpleAsciiCursor<'a> {
        ZeroTrieSimpleAsciiCursor { trie: self }
    }
}

impl<'a> ZeroAsciiIgnoreCaseTrie<&'a [u8]> {
    /// Same as [`ZeroAsciiIgnoreCaseTrie::cursor()`] but moves self to avoid
    /// having to doubly anchor the trie to the stack.
    #[inline]
    pub fn into_cursor(self) -> ZeroAsciiIgnoreCaseTrieCursor<'a> {
        ZeroAsciiIgnoreCaseTrieCursor { trie: self }
    }
}

/// A cursor into a [`ZeroTrieSimpleAscii`], useful for stepwise lookup.
///
/// For examples, see [`ZeroTrieSimpleAscii::cursor()`].
// Clone but not Copy: <https://stackoverflow.com/q/32324251/1407170>
#[derive(Debug, Clone)]
pub struct ZeroTrieSimpleAsciiCursor<'a> {
    trie: ZeroTrieSimpleAscii<&'a [u8]>,
}

/// A cursor into a [`ZeroAsciiIgnoreCaseTrie`], useful for stepwise lookup.
///
/// For examples, see [`ZeroAsciiIgnoreCaseTrie::cursor()`].
// Clone but not Copy: <https://stackoverflow.com/q/32324251/1407170>
#[derive(Debug, Clone)]
pub struct ZeroAsciiIgnoreCaseTrieCursor<'a> {
    trie: ZeroAsciiIgnoreCaseTrie<&'a [u8]>,
}

/// Information about a probed edge.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[non_exhaustive] // no need to destructure or construct this in userland
pub struct AsciiProbeResult {
    /// The character's byte value between this node and its parent.
    pub byte: u8,
    /// The number of siblings of this node, _including itself_.
    pub total_siblings: u8,
}

impl<'a> ZeroTrieSimpleAsciiCursor<'a> {
    /// Steps the cursor one character into the trie based on the character's byte value.
    ///
    /// # Examples
    ///
    /// Unrolled loop checking for string presence at every step:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// // Search the trie for the string "abcdxy"
    /// let mut cursor = trie.cursor();
    /// assert_eq!(cursor.take_value(), None); // ""
    /// cursor.step(b'a');
    /// assert_eq!(cursor.take_value(), None); // "a"
    /// cursor.step(b'b');
    /// assert_eq!(cursor.take_value(), None); // "ab"
    /// cursor.step(b'c');
    /// assert_eq!(cursor.take_value(), Some(0)); // "abc"
    /// cursor.step(b'd');
    /// assert_eq!(cursor.take_value(), None); // "abcd"
    /// assert!(!cursor.is_empty());
    /// cursor.step(b'x'); // no strings have the prefix "abcdx"
    /// assert!(cursor.is_empty());
    /// assert_eq!(cursor.take_value(), None); // "abcdx"
    /// cursor.step(b'y');
    /// assert_eq!(cursor.take_value(), None); // "abcdxy"
    /// ```
    ///
    /// If the byte is not ASCII, the cursor will become empty:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// let mut cursor = trie.cursor();
    /// assert_eq!(cursor.take_value(), None); // ""
    /// cursor.step(b'a');
    /// assert_eq!(cursor.take_value(), None); // "a"
    /// cursor.step(b'b');
    /// assert_eq!(cursor.take_value(), None); // "ab"
    /// cursor.step(b'\xFF');
    /// assert!(cursor.is_empty());
    /// assert_eq!(cursor.take_value(), None);
    /// ```
    #[inline]
    pub fn step(&mut self, byte: u8) {
        reader::step_parameterized::<ZeroTrieSimpleAscii<[u8]>>(&mut self.trie.store, byte);
    }

    /// Takes the value at the current position.
    ///
    /// Calling this function on a new cursor is equivalent to calling `.get()`
    /// with the empty string (except that it can only be called once).
    ///
    /// # Examples
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "" and "abc"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81");
    ///
    /// assert_eq!(Some(0), trie.get(""));
    /// let mut cursor = trie.cursor();
    /// assert_eq!(Some(0), cursor.take_value());
    /// assert_eq!(None, cursor.take_value());
    /// ```
    #[inline]
    pub fn take_value(&mut self) -> Option<usize> {
        reader::take_value(&mut self.trie.store)
    }

    /// Steps the cursor one character into the trie based on an edge index,
    /// returning the corresponding character as a byte.
    ///
    /// This function is similar to [`Self::step()`], but it takes an index instead of a char.
    /// This enables stepwise iteration over the contents of the trie.
    ///
    /// If there are multiple possibilities for the next byte, the `index` argument allows
    /// visiting them in order. Since this function steps the cursor, the cursor must be
    /// cloned (a cheap operation) in order to visit multiple children.
    ///
    /// # Examples
    ///
    /// Continually query index 0 to extract the first item from a trie:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// let data: &[(String, usize)] = &[
    ///     ("ab".to_string(), 111),
    ///     ("abcxyz".to_string(), 22),
    ///     ("abde".to_string(), 333),
    ///     ("afg".to_string(), 44),
    /// ];
    ///
    /// let trie: ZeroTrieSimpleAscii<Vec<u8>> =
    ///     data.iter().map(|(s, v)| (s.as_str(), *v)).collect();
    ///
    /// let mut cursor = trie.cursor();
    /// let mut key = String::new();
    /// let value = loop {
    ///     if let Some(value) = cursor.take_value() {
    ///         break value;
    ///     }
    ///     let probe_result = cursor.probe(0).unwrap();
    ///     key.push(char::from(probe_result.byte));
    /// };
    ///
    /// assert_eq!(key, "ab");
    /// assert_eq!(value, 111);
    /// ```
    ///
    /// Stepwise iterate over all entries in the trie:
    ///
    /// ```
    /// # use zerotrie::ZeroTrieSimpleAscii;
    /// # let data: &[(String, usize)] = &[
    /// #     ("ab".to_string(), 111),
    /// #     ("abcxyz".to_string(), 22),
    /// #     ("abde".to_string(), 333),
    /// #     ("afg".to_string(), 44)
    /// # ];
    /// # let trie: ZeroTrieSimpleAscii<Vec<u8>> = data
    /// #     .iter()
    /// #     .map(|(s, v)| (s.as_str(), *v))
    /// #     .collect();
    /// // (trie built as in previous example)
    ///
    /// // Initialize the iteration at the first child of the trie.
    /// let mut stack = Vec::from([(trie.cursor(), 0, 0)]);
    /// let mut key = Vec::new();
    /// let mut results = Vec::new();
    /// loop {
    ///     let Some((mut cursor, index, suffix_len)) = stack.pop() else {
    ///         // Nothing left in the trie.
    ///         break;
    ///     };
    ///     // Check to see if there is a value at the current node.
    ///     if let Some(value) = cursor.take_value() {
    ///         results.push((String::from_utf8(key.clone()).unwrap(), value));
    ///     }
    ///     // Now check for children of the current node.
    ///     let mut sub_cursor = cursor.clone();
    ///     if let Some(probe_result) = sub_cursor.probe(index) {
    ///         // Found a child. Add the current byte edge to the key.
    ///         key.push(probe_result.byte);
    ///         // Add the child to the stack, and also add back the current
    ///         // node if there are more siblings to visit.
    ///         if index + 1 < probe_result.total_siblings as usize {
    ///             stack.push((cursor, index + 1, suffix_len));
    ///             stack.push((sub_cursor, 0, 1));
    ///         } else {
    ///             stack.push((sub_cursor, 0, suffix_len + 1));
    ///         }
    ///     } else {
    ///         // No more children. Pop this node's bytes from the key.
    ///         for _ in 0..suffix_len {
    ///             key.pop();
    ///         }
    ///     }
    /// }
    ///
    /// assert_eq!(&results, data);
    /// ```
    pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> {
        reader::probe_parameterized::<ZeroTrieSimpleAscii<[u8]>>(&mut self.trie.store, index)
    }

    /// Checks whether the cursor points to an empty trie.
    ///
    /// Use this to determine when to stop iterating.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.trie.is_empty()
    }
}

impl<'a> ZeroAsciiIgnoreCaseTrieCursor<'a> {
    /// Steps the cursor one byte into the trie.
    ///
    /// Returns the byte if matched, which may be a different case than the input byte.
    /// If this function returns `None`, any lookup loops can be terminated.
    ///
    /// # Examples
    ///
    /// Normalize the case of a value by stepping through an ignore-case trie:
    ///
    /// ```
    /// use std::borrow::Cow;
    /// use zerotrie::ZeroAsciiIgnoreCaseTrie;
    ///
    /// // A trie with two values: "aBc" and "aBcdEf"
    /// let trie = ZeroAsciiIgnoreCaseTrie::from_bytes(b"aBc\x80dEf\x81");
    ///
    /// // Get out the value for "abc" and normalize the key string
    /// let mut cursor = trie.cursor();
    /// let mut key_str = Cow::Borrowed("abc".as_bytes());
    /// let mut i = 0;
    /// let value = loop {
    ///     let Some(&input_byte) = key_str.get(i) else {
    ///         break cursor.take_value();
    ///     };
    ///     let Some(matched_byte) = cursor.step(input_byte) else {
    ///         break None;
    ///     };
    ///     if matched_byte != input_byte {
    ///         key_str.to_mut()[i] = matched_byte;
    ///     }
    ///     i += 1;
    /// };
    ///
    /// assert_eq!(value, Some(0));
    /// assert_eq!(&*key_str, "aBc".as_bytes());
    /// ```
    ///
    /// For more examples, see [`ZeroTrieSimpleAsciiCursor::step`].
    #[inline]
    pub fn step(&mut self, byte: u8) -> Option<u8> {
        reader::step_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, byte)
    }

    /// Takes the value at the current position.
    ///
    /// For more details, see [`ZeroTrieSimpleAsciiCursor::take_value`].
    #[inline]
    pub fn take_value(&mut self) -> Option<usize> {
        reader::take_value(&mut self.trie.store)
    }

    /// Probes the next byte in the cursor.
    ///
    /// For more details, see [`ZeroTrieSimpleAsciiCursor::probe`].
    pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> {
        reader::probe_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, index)
    }

    /// Checks whether the cursor points to an empty trie.
    ///
    /// For more details, see [`ZeroTrieSimpleAsciiCursor::is_empty`].
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.trie.is_empty()
    }
}

impl<'a> fmt::Write for ZeroTrieSimpleAsciiCursor<'a> {
    /// Steps the cursor through each ASCII byte of the string.
    ///
    /// If the string contains non-ASCII chars, an error is returned.
    ///
    /// # Examples
    ///
    /// ```
    /// use core::fmt::Write;
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// let mut cursor = trie.cursor();
    /// cursor.write_str("abcdxy").expect("all ASCII");
    /// cursor.write_str("🚂").expect_err("non-ASCII");
    /// ```
    fn write_str(&mut self, s: &str) -> fmt::Result {
        for b in s.bytes() {
            if !b.is_ascii() {
                return Err(fmt::Error);
            }
            self.step(b);
        }
        Ok(())
    }

    /// Equivalent to [`ZeroTrieSimpleAsciiCursor::step()`], except returns
    /// an error if the char is non-ASCII.
    ///
    /// # Examples
    ///
    /// ```
    /// use core::fmt::Write;
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// let mut cursor = trie.cursor();
    /// cursor.write_char('a').expect("ASCII");
    /// cursor.write_char('x').expect("ASCII");
    /// cursor.write_char('🚂').expect_err("non-ASCII");
    /// ```
    fn write_char(&mut self, c: char) -> fmt::Result {
        if !c.is_ascii() {
            return Err(fmt::Error);
        }
        self.step(c as u8);
        Ok(())
    }
}

impl<'a> fmt::Write for ZeroAsciiIgnoreCaseTrieCursor<'a> {
    /// Steps the cursor through each ASCII byte of the string.
    ///
    /// If the string contains non-ASCII chars, an error is returned.
    fn write_str(&mut self, s: &str) -> fmt::Result {
        for b in s.bytes() {
            if !b.is_ascii() {
                return Err(fmt::Error);
            }
            self.step(b);
        }
        Ok(())
    }

    /// Equivalent to [`ZeroAsciiIgnoreCaseTrieCursor::step()`], except returns
    /// an error if the char is non-ASCII.
    fn write_char(&mut self, c: char) -> fmt::Result {
        if !c.is_ascii() {
            return Err(fmt::Error);
        }
        self.step(c as u8);
        Ok(())
    }
}
zerotrie/cursor.rs

zerotrie/
cursor.rs