selectors/
bloom.rs

1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
4
5//! Counting and non-counting Bloom filters tuned for use as ancestor filters
6//! for selector matching.
7
8use std::fmt::{self, Debug};
9
10// The top 8 bits of the 32-bit hash value are not used by the bloom filter.
11// Consumers may rely on this to pack hashes more efficiently.
12pub const BLOOM_HASH_MASK: u32 = 0x00ffffff;
13const KEY_SIZE: usize = 12;
14
15const ARRAY_SIZE: usize = 1 << KEY_SIZE;
16const KEY_MASK: u32 = (1 << KEY_SIZE) - 1;
17
18/// A counting Bloom filter with 8-bit counters.
19pub type BloomFilter = CountingBloomFilter<BloomStorageU8>;
20
21/// A counting Bloom filter with parameterized storage to handle
22/// counters of different sizes.  For now we assume that having two hash
23/// functions is enough, but we may revisit that decision later.
24///
25/// The filter uses an array with 2**KeySize entries.
26///
27/// Assuming a well-distributed hash function, a Bloom filter with
28/// array size M containing N elements and
29/// using k hash function has expected false positive rate exactly
30///
31/// $  (1 - (1 - 1/M)^{kN})^k  $
32///
33/// because each array slot has a
34///
35/// $  (1 - 1/M)^{kN}  $
36///
37/// chance of being 0, and the expected false positive rate is the
38/// probability that all of the k hash functions will hit a nonzero
39/// slot.
40///
41/// For reasonable assumptions (M large, kN large, which should both
42/// hold if we're worried about false positives) about M and kN this
43/// becomes approximately
44///
45/// $$  (1 - \exp(-kN/M))^k   $$
46///
47/// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
48/// or in other words
49///
50/// $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
51///
52/// where r is the false positive rate.  This can be used to compute
53/// the desired KeySize for a given load N and false positive rate r.
54///
55/// If N/M is assumed small, then the false positive rate can
56/// further be approximated as 4*N^2/M^2.  So increasing KeySize by
57/// 1, which doubles M, reduces the false positive rate by about a
58/// factor of 4, and a false positive rate of 1% corresponds to
59/// about M/N == 20.
60///
61/// What this means in practice is that for a few hundred keys using a
62/// KeySize of 12 gives false positive rates on the order of 0.25-4%.
63///
64/// Similarly, using a KeySize of 10 would lead to a 4% false
65/// positive rate for N == 100 and to quite bad false positive
66/// rates for larger N.
67#[derive(Clone, Default)]
68pub struct CountingBloomFilter<S>
69where
70    S: BloomStorage,
71{
72    storage: S,
73}
74
75impl<S> CountingBloomFilter<S>
76where
77    S: BloomStorage,
78{
79    /// Creates a new bloom filter.
80    #[inline]
81    pub fn new() -> Self {
82        Default::default()
83    }
84
85    #[inline]
86    pub fn clear(&mut self) {
87        self.storage = Default::default();
88    }
89
90    // Slow linear accessor to make sure the bloom filter is zeroed. This should
91    // never be used in release builds.
92    #[cfg(debug_assertions)]
93    pub fn is_zeroed(&self) -> bool {
94        self.storage.is_zeroed()
95    }
96
97    #[cfg(not(debug_assertions))]
98    pub fn is_zeroed(&self) -> bool {
99        unreachable!()
100    }
101
102    /// Inserts an item with a particular hash into the bloom filter.
103    #[inline]
104    pub fn insert_hash(&mut self, hash: u32) {
105        self.storage.adjust_first_slot(hash, true);
106        self.storage.adjust_second_slot(hash, true);
107    }
108
109    /// Removes an item with a particular hash from the bloom filter.
110    #[inline]
111    pub fn remove_hash(&mut self, hash: u32) {
112        self.storage.adjust_first_slot(hash, false);
113        self.storage.adjust_second_slot(hash, false);
114    }
115
116    /// Check whether the filter might contain an item with the given hash.
117    /// This can sometimes return true even if the item is not in the filter,
118    /// but will never return false for items that are actually in the filter.
119    #[inline]
120    pub fn might_contain_hash(&self, hash: u32) -> bool {
121        !self.storage.first_slot_is_empty(hash) && !self.storage.second_slot_is_empty(hash)
122    }
123}
124
125impl<S> Debug for CountingBloomFilter<S>
126where
127    S: BloomStorage,
128{
129    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
130        let mut slots_used = 0;
131        for i in 0..ARRAY_SIZE {
132            if !self.storage.slot_is_empty(i) {
133                slots_used += 1;
134            }
135        }
136        write!(f, "BloomFilter({}/{})", slots_used, ARRAY_SIZE)
137    }
138}
139
140pub trait BloomStorage: Clone + Default {
141    fn slot_is_empty(&self, index: usize) -> bool;
142    fn adjust_slot(&mut self, index: usize, increment: bool);
143    fn is_zeroed(&self) -> bool;
144
145    #[inline]
146    fn first_slot_is_empty(&self, hash: u32) -> bool {
147        self.slot_is_empty(Self::first_slot_index(hash))
148    }
149
150    #[inline]
151    fn second_slot_is_empty(&self, hash: u32) -> bool {
152        self.slot_is_empty(Self::second_slot_index(hash))
153    }
154
155    #[inline]
156    fn adjust_first_slot(&mut self, hash: u32, increment: bool) {
157        self.adjust_slot(Self::first_slot_index(hash), increment)
158    }
159
160    #[inline]
161    fn adjust_second_slot(&mut self, hash: u32, increment: bool) {
162        self.adjust_slot(Self::second_slot_index(hash), increment)
163    }
164
165    #[inline]
166    fn first_slot_index(hash: u32) -> usize {
167        hash1(hash) as usize
168    }
169
170    #[inline]
171    fn second_slot_index(hash: u32) -> usize {
172        hash2(hash) as usize
173    }
174}
175
176/// Storage class for a CountingBloomFilter that has 8-bit counters.
177pub struct BloomStorageU8 {
178    counters: [u8; ARRAY_SIZE],
179}
180
181impl BloomStorage for BloomStorageU8 {
182    #[inline]
183    fn adjust_slot(&mut self, index: usize, increment: bool) {
184        let slot = &mut self.counters[index];
185        if *slot != 0xff {
186            // full
187            if increment {
188                *slot += 1;
189            } else {
190                *slot -= 1;
191            }
192        }
193    }
194
195    #[inline]
196    fn slot_is_empty(&self, index: usize) -> bool {
197        self.counters[index] == 0
198    }
199
200    #[inline]
201    fn is_zeroed(&self) -> bool {
202        self.counters.iter().all(|x| *x == 0)
203    }
204}
205
206impl Default for BloomStorageU8 {
207    fn default() -> Self {
208        BloomStorageU8 {
209            counters: [0; ARRAY_SIZE],
210        }
211    }
212}
213
214impl Clone for BloomStorageU8 {
215    fn clone(&self) -> Self {
216        BloomStorageU8 {
217            counters: self.counters,
218        }
219    }
220}
221
222/// Storage class for a CountingBloomFilter that has 1-bit counters.
223pub struct BloomStorageBool {
224    counters: [u8; ARRAY_SIZE / 8],
225}
226
227impl BloomStorage for BloomStorageBool {
228    #[inline]
229    fn adjust_slot(&mut self, index: usize, increment: bool) {
230        let bit = 1 << (index % 8);
231        let byte = &mut self.counters[index / 8];
232
233        // Since we have only one bit for storage, decrementing it
234        // should never do anything.  Assert against an accidental
235        // decrementing of a bit that was never set.
236        assert!(
237            increment || (*byte & bit) != 0,
238            "should not decrement if slot is already false"
239        );
240
241        if increment {
242            *byte |= bit;
243        }
244    }
245
246    #[inline]
247    fn slot_is_empty(&self, index: usize) -> bool {
248        let bit = 1 << (index % 8);
249        (self.counters[index / 8] & bit) == 0
250    }
251
252    #[inline]
253    fn is_zeroed(&self) -> bool {
254        self.counters.iter().all(|x| *x == 0)
255    }
256}
257
258impl Default for BloomStorageBool {
259    fn default() -> Self {
260        BloomStorageBool {
261            counters: [0; ARRAY_SIZE / 8],
262        }
263    }
264}
265
266impl Clone for BloomStorageBool {
267    fn clone(&self) -> Self {
268        BloomStorageBool {
269            counters: self.counters,
270        }
271    }
272}
273
274#[inline]
275fn hash1(hash: u32) -> u32 {
276    hash & KEY_MASK
277}
278
279#[inline]
280fn hash2(hash: u32) -> u32 {
281    (hash >> KEY_SIZE) & KEY_MASK
282}
283
284#[test]
285fn create_and_insert_some_stuff() {
286    use fxhash::FxHasher;
287    use std::hash::{Hash, Hasher};
288    use std::mem::transmute;
289
290    fn hash_as_str(i: usize) -> u32 {
291        let mut hasher = FxHasher::default();
292        let s = i.to_string();
293        s.hash(&mut hasher);
294        let hash: u64 = hasher.finish();
295        (hash >> 32) as u32 ^ (hash as u32)
296    }
297
298    let mut bf = BloomFilter::new();
299
300    // Statically assert that ARRAY_SIZE is a multiple of 8, which
301    // BloomStorageBool relies on.
302    unsafe {
303        transmute::<[u8; ARRAY_SIZE % 8], [u8; 0]>([]);
304    }
305
306    for i in 0_usize..1000 {
307        bf.insert_hash(hash_as_str(i));
308    }
309
310    for i in 0_usize..1000 {
311        assert!(bf.might_contain_hash(hash_as_str(i)));
312    }
313
314    let false_positives = (1001_usize..2000)
315        .filter(|i| bf.might_contain_hash(hash_as_str(*i)))
316        .count();
317
318    assert!(false_positives < 190, "{} is not < 190", false_positives); // 19%.
319
320    for i in 0_usize..100 {
321        bf.remove_hash(hash_as_str(i));
322    }
323
324    for i in 100_usize..1000 {
325        assert!(bf.might_contain_hash(hash_as_str(i)));
326    }
327
328    let false_positives = (0_usize..100)
329        .filter(|i| bf.might_contain_hash(hash_as_str(*i)))
330        .count();
331
332    assert!(false_positives < 20, "{} is not < 20", false_positives); // 20%.
333
334    bf.clear();
335
336    for i in 0_usize..2000 {
337        assert!(!bf.might_contain_hash(hash_as_str(i)));
338    }
339}
340
341#[cfg(feature = "bench")]
342#[cfg(test)]
343mod bench {
344    extern crate test;
345    use super::BloomFilter;
346
347    #[derive(Default)]
348    struct HashGenerator(u32);
349
350    impl HashGenerator {
351        fn next(&mut self) -> u32 {
352            // Each hash is split into two twelve-bit segments, which are used
353            // as an index into an array. We increment each by 64 so that we
354            // hit the next cache line, and then another 1 so that our wrapping
355            // behavior leads us to different entries each time.
356            //
357            // Trying to simulate cold caches is rather difficult with the cargo
358            // benchmarking setup, so it may all be moot depending on the number
359            // of iterations that end up being run. But we might as well.
360            self.0 += (65) + (65 << super::KEY_SIZE);
361            self.0
362        }
363    }
364
365    #[bench]
366    fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) {
367        b.iter(|| {
368            let mut gen1 = HashGenerator::default();
369            let mut gen2 = HashGenerator::default();
370            let mut bf = BloomFilter::new();
371            for _ in 0_usize..1000 {
372                bf.insert_hash(gen1.next());
373            }
374            for _ in 0_usize..100 {
375                bf.remove_hash(gen2.next());
376            }
377            for _ in 100_usize..200 {
378                test::black_box(bf.might_contain_hash(gen2.next()));
379            }
380        });
381    }
382
383    #[bench]
384    fn might_contain_10(b: &mut test::Bencher) {
385        let bf = BloomFilter::new();
386        let mut gen = HashGenerator::default();
387        b.iter(|| {
388            for _ in 0..10 {
389                test::black_box(bf.might_contain_hash(gen.next()));
390            }
391        });
392    }
393
394    #[bench]
395    fn clear(b: &mut test::Bencher) {
396        let mut bf = Box::new(BloomFilter::new());
397        b.iter(|| test::black_box(&mut bf).clear());
398    }
399
400    #[bench]
401    fn insert_10(b: &mut test::Bencher) {
402        let mut bf = BloomFilter::new();
403        let mut gen = HashGenerator::default();
404        b.iter(|| {
405            for _ in 0..10 {
406                test::black_box(bf.insert_hash(gen.next()));
407            }
408        });
409    }
410
411    #[bench]
412    fn remove_10(b: &mut test::Bencher) {
413        let mut bf = BloomFilter::new();
414        let mut gen = HashGenerator::default();
415        // Note: this will underflow, and that's ok.
416        b.iter(|| {
417            for _ in 0..10 {
418                bf.remove_hash(gen.next())
419            }
420        });
421    }
422}