zerotrie/byte_phf/
mod.rs

Help
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

#![allow(rustdoc::private_intra_doc_links)] // doc(hidden) module

//! # Byte Perfect Hash Function Internals
//!
//! This module contains a perfect hash function (PHF) designed for a fast, compact perfect
//! hash over 1 to 256 nodes (bytes).
//!
//! The PHF uses the following variables:
//!
//! 1. A single parameter `p`, which is 0 in about 98% of cases.
//! 2. A list of `N` parameters `q_t`, one per _bucket_
//! 3. The `N` keys in an arbitrary order determined by the PHF
//!
//! Reading a `key` from the PHF uses the following algorithm:
//!
//! 1. Let `t`, the bucket index, be `f1(key, p)`.
//! 2. Let `i`, the key index, be `f2(key, q_t)`.
//! 3. If `key == k_i`, return `Some(i)`; else return `None`.
//!
//! The functions [`f1`] and [`f2`] are internal to the PHF but should remain stable across
//! serialization versions of `ZeroTrie`. They are very fast, constant-time operations as long
//! as `p` <= [`P_FAST_MAX`] and `q` <= [`Q_FAST_MAX`]. In practice, nearly 100% of parameter
//! values are in the fast range.
//!
//! ```
//! use zerotrie::_internal::PerfectByteHashMap;
//!
//! let phf_example_bytes = [
//!     // `p` parameter
//!     1, // `q` parameters, one for each of the N buckets
//!     0, 0, 1, 1, // Exact keys to be compared with the input
//!     b'e', b'a', b'c', b'g',
//! ];
//!
//! let phf = PerfectByteHashMap::from_bytes(&phf_example_bytes);
//!
//! // The PHF returns the index of the key or `None` if not found.
//! assert_eq!(phf.get(b'a'), Some(1));
//! assert_eq!(phf.get(b'b'), None);
//! assert_eq!(phf.get(b'c'), Some(2));
//! assert_eq!(phf.get(b'd'), None);
//! assert_eq!(phf.get(b'e'), Some(0));
//! assert_eq!(phf.get(b'f'), None);
//! assert_eq!(phf.get(b'g'), Some(3));
//! ```

use crate::helpers::*;

#[cfg(feature = "alloc")]
mod builder;
#[cfg(feature = "alloc")]
mod cached_owned;

#[cfg(feature = "alloc")]
pub use cached_owned::PerfectByteHashMapCacheOwned;

/// The cutoff for the fast version of [`f1`].
#[cfg(feature = "alloc")] // used in the builder code
const P_FAST_MAX: u8 = 95;

/// The cutoff for the fast version of [`f2`].
const Q_FAST_MAX: u8 = 95;

/// The maximum allowable value of `p`. This could be raised if found to be necessary.
/// Values exceeding P_FAST_MAX could use a different `p` algorithm by modifying [`f1`].
#[cfg(feature = "alloc")] // used in the builder code
const P_REAL_MAX: u8 = P_FAST_MAX;

/// The maximum allowable value of `q`. This could be raised if found to be necessary.
#[cfg(feature = "alloc")] // used in the builder code
const Q_REAL_MAX: u8 = 127;

/// Calculates the function `f1` for the PHF. For the exact formula, please read the code.
///
/// When `p == 0`, the operation is a simple modulus.
///
/// The argument `n` is used only for taking the modulus so that the return value is
/// in the range `[0, n)`.
///
/// Invariant: n > 0
///
/// # Examples
///
/// ```
/// use zerotrie::_internal::f1;
/// const N: usize = 10;
///
/// // With p = 0:
/// assert_eq!(0, f1(0, 0, N));
/// assert_eq!(1, f1(1, 0, N));
/// assert_eq!(2, f1(2, 0, N));
/// assert_eq!(9, f1(9, 0, N));
/// assert_eq!(0, f1(10, 0, N));
/// assert_eq!(1, f1(11, 0, N));
/// assert_eq!(2, f1(12, 0, N));
/// assert_eq!(9, f1(19, 0, N));
///
/// // With p = 1:
/// assert_eq!(1, f1(0, 1, N));
/// assert_eq!(0, f1(1, 1, N));
/// assert_eq!(2, f1(2, 1, N));
/// assert_eq!(2, f1(9, 1, N));
/// assert_eq!(4, f1(10, 1, N));
/// assert_eq!(5, f1(11, 1, N));
/// assert_eq!(1, f1(12, 1, N));
/// assert_eq!(7, f1(19, 1, N));
/// ```
#[inline]
pub fn f1(byte: u8, p: u8, n: usize) -> usize {
    let n = if n > 0 {
        n
    } else {
        debug_assert!(false, "unreachable by invariant");
        1
    };
    if p == 0 {
        byte as usize % n
    } else {
        // `p` always uses the below constant-time operation. If needed, we
        // could add some other operation here with `p > P_FAST_MAX` to solve
        // difficult cases if the need arises.
        let result = byte ^ p ^ byte.wrapping_shr(p as u32);
        result as usize % n
    }
}

/// Calculates the function `f2` for the PHF. For the exact formula, please read the code.
///
/// When `q == 0`, the operation is a simple modulus.
///
/// The argument `n` is used only for taking the modulus so that the return value is
/// in the range `[0, n)`.
///
/// Invariant: n > 0
///
/// # Examples
///
/// ```
/// use zerotrie::_internal::f2;
/// const N: usize = 10;
///
/// // With q = 0:
/// assert_eq!(0, f2(0, 0, N));
/// assert_eq!(1, f2(1, 0, N));
/// assert_eq!(2, f2(2, 0, N));
/// assert_eq!(9, f2(9, 0, N));
/// assert_eq!(0, f2(10, 0, N));
/// assert_eq!(1, f2(11, 0, N));
/// assert_eq!(2, f2(12, 0, N));
/// assert_eq!(9, f2(19, 0, N));
///
/// // With q = 1:
/// assert_eq!(1, f2(0, 1, N));
/// assert_eq!(0, f2(1, 1, N));
/// assert_eq!(3, f2(2, 1, N));
/// assert_eq!(8, f2(9, 1, N));
/// assert_eq!(1, f2(10, 1, N));
/// assert_eq!(0, f2(11, 1, N));
/// assert_eq!(3, f2(12, 1, N));
/// assert_eq!(8, f2(19, 1, N));
/// ```
#[inline]
pub fn f2(byte: u8, q: u8, n: usize) -> usize {
    let n = if n > 0 {
        n
    } else {
        debug_assert!(false, "unreachable by invariant");
        1
    };
    let mut result = byte ^ q;
    // In almost all cases, the PHF works with the above constant-time operation.
    // However, to crack a few difficult cases, we fall back to the linear-time
    // operation shown below.
    for _ in Q_FAST_MAX..q {
        result = result ^ (result << 1) ^ (result >> 1);
    }
    result as usize % n
}

/// A constant-time map from bytes to unique indices.
///
/// Uses a perfect hash function (see module-level documentation). Does not support mutation.
///
/// Standard layout: P, N bytes of Q, N bytes of expected keys
#[derive(Debug, PartialEq, Eq)]
#[repr(transparent)]
pub struct PerfectByteHashMap<Store: ?Sized>(Store);

impl<Store> PerfectByteHashMap<Store> {
    /// Creates an instance from a pre-existing store. See [`Self::as_bytes`].
    #[inline]
    pub fn from_store(store: Store) -> Self {
        Self(store)
    }
}

impl<Store> PerfectByteHashMap<Store>
where
    Store: AsRef<[u8]> + ?Sized,
{
    /// Gets the usize for the given byte, or `None` if it is not in the map.
    pub fn get(&self, key: u8) -> Option<usize> {
        let (p, buffer) = self.0.as_ref().split_first()?;
        // Note: there are N buckets followed by N keys
        let n = buffer.len() / 2;
        if n == 0 {
            return None;
        }
        let (qq, eks) = buffer.debug_split_at(n);
        debug_assert_eq!(qq.len(), eks.len());
        let q = debug_unwrap!(qq.get(f1(key, *p, n)), return None);
        let l2 = f2(key, *q, n);
        let ek = debug_unwrap!(eks.get(l2), return None);
        if *ek == key {
            Some(l2)
        } else {
            None
        }
    }
    /// This is called `num_items` because `len` is ambiguous: it could refer
    /// to the number of items or the number of bytes.
    pub fn num_items(&self) -> usize {
        self.0.as_ref().len() / 2
    }
    /// Get an iterator over the keys in the order in which they are stored in the map.
    pub fn keys(&self) -> &[u8] {
        let n = self.num_items();
        self.0.as_ref().debug_split_at(1 + n).1
    }
    /// Diagnostic function that returns `p` and the maximum value of `q`
    #[cfg(test)]
    pub fn p_qmax(&self) -> Option<(u8, u8)> {
        let (p, buffer) = self.0.as_ref().split_first()?;
        let n = buffer.len() / 2;
        if n == 0 {
            return None;
        }
        let (qq, _) = buffer.debug_split_at(n);
        Some((*p, *qq.iter().max().unwrap()))
    }
    /// Returns the map as bytes. The map can be recovered with [`Self::from_store`]
    /// or [`Self::from_bytes`].
    pub fn as_bytes(&self) -> &[u8] {
        self.0.as_ref()
    }
    #[cfg(all(feature = "alloc", test))]
    pub fn check(&self) -> Result<(), (&'static str, u8)> {
        use alloc::vec;
        let len = self.num_items();
        let mut seen = vec![false; len];
        for b in 0..=255u8 {
            let get_result = self.get(b);
            if self.keys().contains(&b) {
                let i = get_result.ok_or(("expected to find", b))?;
                if seen[i] {
                    return Err(("seen", b));
                }
                seen[i] = true;
            } else if get_result.is_some() {
                return Err(("did not expect to find", b));
            }
        }
        Ok(())
    }
}

impl PerfectByteHashMap<[u8]> {
    /// Creates an instance from pre-existing bytes. See [`Self::as_bytes`].
    #[inline]
    pub fn from_bytes(bytes: &[u8]) -> &Self {
        // Safety: Self is repr(transparent) over [u8]
        unsafe { core::mem::transmute(bytes) }
    }
}

impl<Store> PerfectByteHashMap<Store>
where
    Store: AsRef<[u8]> + ?Sized,
{
    /// Converts from `PerfectByteHashMap<AsRef<[u8]>>` to `&PerfectByteHashMap<[u8]>`
    #[inline]
    pub fn as_borrowed(&self) -> &PerfectByteHashMap<[u8]> {
        PerfectByteHashMap::from_bytes(self.0.as_ref())
    }
}

#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::*;
    use alloc::vec::Vec;
    extern crate std;

    fn random_alphanums(seed: u64, len: usize) -> Vec<u8> {
        use rand::seq::SliceRandom;
        use rand::SeedableRng;
        const BYTES: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
        let mut rng = rand_pcg::Lcg64Xsh32::seed_from_u64(seed);
        BYTES.choose_multiple(&mut rng, len).copied().collect()
    }

    #[test]
    fn test_smaller() {
        let mut count_by_p = [0; 256];
        let mut count_by_qmax = [0; 256];
        for len in 1..16 {
            for seed in 0..150 {
                let keys = random_alphanums(seed, len);
                let keys_str = core::str::from_utf8(&keys).unwrap();
                let computed = PerfectByteHashMap::try_new(&keys).expect(keys_str);
                computed
                    .check()
                    .unwrap_or_else(|_| panic!("{}", std::str::from_utf8(&keys).expect(keys_str)));
                let (p, qmax) = computed.p_qmax().unwrap();
                count_by_p[p as usize] += 1;
                count_by_qmax[qmax as usize] += 1;
            }
        }
        std::println!("count_by_p (smaller): {count_by_p:?}");
        std::println!("count_by_qmax (smaller): {count_by_qmax:?}");
        let count_fastq = count_by_qmax[0..=Q_FAST_MAX as usize].iter().sum::<usize>();
        let count_slowq = count_by_qmax[Q_FAST_MAX as usize + 1..]
            .iter()
            .sum::<usize>();
        std::println!("fastq/slowq: {count_fastq}/{count_slowq}");
        // Assert that 99% of cases resolve to the fast hash
        assert!(count_fastq >= count_slowq * 100);
    }

    #[test]
    fn test_larger() {
        let mut count_by_p = [0; 256];
        let mut count_by_qmax = [0; 256];
        for len in 16..60 {
            for seed in 0..75 {
                let keys = random_alphanums(seed, len);
                let keys_str = core::str::from_utf8(&keys).unwrap();
                let computed = PerfectByteHashMap::try_new(&keys).expect(keys_str);
                computed
                    .check()
                    .unwrap_or_else(|_| panic!("{}", std::str::from_utf8(&keys).expect(keys_str)));
                let (p, qmax) = computed.p_qmax().unwrap();
                count_by_p[p as usize] += 1;
                count_by_qmax[qmax as usize] += 1;
            }
        }
        std::println!("count_by_p (larger): {count_by_p:?}");
        std::println!("count_by_qmax (larger): {count_by_qmax:?}");
        let count_fastq = count_by_qmax[0..=Q_FAST_MAX as usize].iter().sum::<usize>();
        let count_slowq = count_by_qmax[Q_FAST_MAX as usize + 1..]
            .iter()
            .sum::<usize>();
        std::println!("fastq/slowq: {count_fastq}/{count_slowq}");
        // Assert that 99% of cases resolve to the fast hash
        assert!(count_fastq >= count_slowq * 100);
    }

    #[test]
    fn test_hard_cases() {
        let keys = [
            0u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
            46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
            68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
            90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
            109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
            126, 195, 196,
        ];

        let computed = PerfectByteHashMap::try_new(&keys).unwrap();
        let (p, qmax) = computed.p_qmax().unwrap();
        assert_eq!(p, 69);
        assert_eq!(qmax, 67);
    }

    #[test]
    fn test_build_read_small() {
        #[derive(Debug)]
        struct TestCase<'a> {
            keys: &'a str,
            expected: &'a [u8],
            reordered_keys: &'a str,
        }
        let cases = [
            TestCase {
                keys: "ab",
                expected: &[0, 0, 0, b'b', b'a'],
                reordered_keys: "ba",
            },
            TestCase {
                keys: "abc",
                expected: &[0, 0, 0, 0, b'c', b'a', b'b'],
                reordered_keys: "cab",
            },
            TestCase {
                // Note: splitting "a" and "c" into different buckets requires the heavier hash
                // function because the difference between "a" and "c" is the period (2).
                keys: "ac",
                expected: &[1, 0, 1, b'c', b'a'],
                reordered_keys: "ca",
            },
            TestCase {
                keys: "aceg",
                expected: &[1, 0, 0, 1, 1, b'e', b'a', b'c', b'g'],
                reordered_keys: "eacg",
            },
            TestCase {
                keys: "abd",
                expected: &[0, 0, 1, 3, b'a', b'b', b'd'],
                reordered_keys: "abd",
            },
            TestCase {
                keys: "def",
                expected: &[0, 0, 0, 0, b'f', b'd', b'e'],
                reordered_keys: "fde",
            },
            TestCase {
                keys: "fi",
                expected: &[0, 0, 0, b'f', b'i'],
                reordered_keys: "fi",
            },
            TestCase {
                keys: "gh",
                expected: &[0, 0, 0, b'h', b'g'],
                reordered_keys: "hg",
            },
            TestCase {
                keys: "lm",
                expected: &[0, 0, 0, b'l', b'm'],
                reordered_keys: "lm",
            },
            TestCase {
                // Note: "a" and "q" (0x61 and 0x71) are very hard to split; only a handful of
                // hash function crates can get them into separate buckets.
                keys: "aq",
                expected: &[4, 0, 1, b'a', b'q'],
                reordered_keys: "aq",
            },
            TestCase {
                keys: "xy",
                expected: &[0, 0, 0, b'x', b'y'],
                reordered_keys: "xy",
            },
            TestCase {
                keys: "xyz",
                expected: &[0, 0, 0, 0, b'x', b'y', b'z'],
                reordered_keys: "xyz",
            },
            TestCase {
                keys: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
                expected: &[
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 10, 12, 16, 4, 4, 4, 4, 4, 4, 8, 4, 4, 4, 16,
                    16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
                    2, 0, 7, 104, 105, 106, 107, 108, 109, 110, 111, 112, 117, 118, 119, 68, 69,
                    70, 113, 114, 65, 66, 67, 120, 121, 122, 115, 72, 73, 74, 71, 80, 81, 82, 83,
                    84, 85, 86, 87, 88, 89, 90, 75, 76, 77, 78, 79, 103, 97, 98, 99, 116, 100, 102,
                    101,
                ],
                reordered_keys: "hijklmnopuvwDEFqrABCxyzsHIJGPQRSTUVWXYZKLMNOgabctdfe",
            },
            TestCase {
                keys: "abcdefghij",
                expected: &[
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 101, 102, 103, 104, 105, 106, 97, 98, 99,
                ],
                reordered_keys: "defghijabc",
            },
            TestCase {
                // This is a small case that resolves to the slow hasher
                keys: "Jbej",
                expected: &[2, 0, 0, 102, 0, b'j', b'e', b'b', b'J'],
                reordered_keys: "jebJ",
            },
            TestCase {
                // This is another small case that resolves to the slow hasher
                keys: "JFNv",
                expected: &[1, 98, 0, 2, 0, b'J', b'F', b'N', b'v'],
                reordered_keys: "JFNv",
            },
        ];
        for cas in cases {
            let computed = PerfectByteHashMap::try_new(cas.keys.as_bytes()).expect(cas.keys);
            assert_eq!(computed.as_bytes(), cas.expected, "{:?}", cas);
            assert_eq!(computed.keys(), cas.reordered_keys.as_bytes(), "{:?}", cas);
            computed.check().expect(cas.keys);
        }
    }
}
zerotrie/byte_phf/mod.rs

zerotrie/byte_phf/
mod.rs