vello_cpu/
util.rs

1// Copyright 2025 the Vello Authors
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4use crate::peniko::{BlendMode, Compose, ImageQuality, Mix};
5use vello_common::encode::EncodedImage;
6use vello_common::fearless_simd::{Simd, SimdBase, f32x4, u8x32, u16x16, u16x32};
7use vello_common::math::FloatExt;
8
#[allow(
    dead_code,
    reason = "this is not used because the division by 255 is now done with SIMD, but \
              we still keep it around to document its properties."
)]
pub(crate) mod scalar {
    /// Perform an approximate division by 255.
    ///
    /// There are three reasons for having this method.
    /// 1) Divisions are slower than shifting + adding, and the compiler does not seem to replace
    ///    divisions by 255 with an equivalent (this was verified by benchmarking; doing / 255 was
    ///    significantly slower).
    /// 2) Integer divisions are usually not available in SIMD, so this provides a good baseline
    ///    implementation.
    /// 3) There are two options for performing the division: One is to perform the division
    ///    in a way that completely preserves the rounding semantics of an integer division by
    ///    255. This could be achieved using the implementation `(val + 1 + (val >> 8)) >> 8`.
    ///    The second approach (used here) has slightly different rounding behavior to a
    ///    normal division by 255, but is much faster
    ///    (see <https://github.com/linebender/vello/issues/904>) and therefore preferable for
    ///    the high-performance pipeline.
    ///
    /// Four properties worth mentioning:
    /// - This actually calculates the ceiling of `val / 256`.
    /// - Within the allowed range for `val`, rounding errors do not appear for values divisible
    ///   by 255, i.e. any call `div_255(val * 255)` will always yield `val`.
    /// - If there is a discrepancy, this division will always yield a value 1 higher than the
    ///   original.
    /// - This holds for values of `val` up to and including `65279`. You should not call this
    ///   function with higher values.
    ///
    /// # Panics
    ///
    /// In builds with debug assertions enabled, panics if `val` is `65280` or greater.
    #[inline(always)]
    pub(crate) const fn div_255(val: u16) -> u16 {
        debug_assert!(
            val < 65280,
            "the properties of `div_255` do not hold for values of `65280` or greater"
        );
        // Adding 255 before dropping the low 8 bits rounds `val / 256` up.
        (val + 255) >> 8
    }

    #[cfg(test)]
    mod tests {
        use super::div_255;

        /// Exhaustively checks the documented properties over the whole supported input range
        /// (`0..=65279`).
        #[test]
        fn div_255_properties() {
            for i in 0_u16..256 * 255 {
                let expected = i / 255;
                let actual = div_255(i);

                assert!(
                    expected <= actual,
                    "In case of a discrepancy, the division should yield a value higher than the original."
                );

                let diff = expected.abs_diff(actual);
                assert!(diff <= 1, "Rounding error shouldn't be higher than 1.");

                if i % 255 == 0 {
                    assert_eq!(diff, 0, "Division should be accurate for multiples of 255.");
                }
            }
        }
    }
}
69
/// Extension trait for multiplying two values of the same type and rescaling the result
/// by 255.
///
/// NOTE(review): the exact rounding depends on the implementor; the implementation in this
/// file uses the approximate `div_255`.
pub(crate) trait NormalizedMulExt {
    /// Multiplies `self` with `other` and returns the normalized product.
    fn normalized_mul(self, other: Self) -> Self;
}
73
74impl<S: Simd> NormalizedMulExt for u8x32<S> {
75    #[inline(always)]
76    fn normalized_mul(self, other: Self) -> Self {
77        let divided = (self.simd.widen_u8x32(self) * other.simd.widen_u8x32(other)).div_255();
78        self.simd.narrow_u16x32(divided)
79    }
80}
81
/// Extension trait providing a division by 255.
///
/// The implementations in this file use the same approximation as `scalar::div_255`
/// (add 255, then shift right by 8).
pub(crate) trait Div255Ext {
    /// Returns `self` divided by 255.
    fn div_255(self) -> Self;
}
85
86impl<S: Simd> Div255Ext for u16x32<S> {
87    #[inline(always)]
88    fn div_255(self) -> Self {
89        let p1 = Self::splat(self.simd, 255);
90        let p2 = self + p1;
91        p2.shr(8)
92    }
93}
94
95impl<S: Simd> Div255Ext for u16x16<S> {
96    #[inline(always)]
97    fn div_255(self) -> Self {
98        let p1 = Self::splat(self.simd, 255);
99        let p2 = self + p1;
100        p2.shr(8)
101    }
102}
103
104#[inline(always)]
105pub(crate) fn normalized_mul<S: Simd>(a: u8x32<S>, b: u8x32<S>) -> u16x32<S> {
106    (S::widen_u8x32(a.simd, a) * S::widen_u8x32(b.simd, b)).div_255()
107}
108
/// Extension trait for asking whether a blend mode is the default one.
pub(crate) trait BlendModeExt {
    /// Returns `true` if this blend mode should be treated as the default.
    fn is_default(&self) -> bool;
}
112
113impl BlendModeExt for BlendMode {
114    // peniko uses `Clip` instead of `Normal` as the default, hence this override.
115    // TODO: This default has changed, re-evaluate.
116    #[expect(deprecated, reason = "Provided by the user, need to handle correctly.")]
117    fn is_default(&self) -> bool {
118        matches!(self.mix, Mix::Normal | Mix::Clip) && self.compose == Compose::SrcOver
119    }
120}
121
/// Extension trait with convenience queries on an encoded image.
pub(crate) trait EncodedImageExt {
    /// Returns `true` if the image is sampled with a skewing transform.
    fn has_skew(&self) -> bool;
    /// Returns `true` if the image should be sampled with nearest-neighbor filtering.
    fn nearest_neighbor(&self) -> bool;
}
126
127impl EncodedImageExt for EncodedImage {
128    fn has_skew(&self) -> bool {
129        !(self.x_advance.y as f32).is_nearly_zero() || !(self.y_advance.x as f32).is_nearly_zero()
130    }
131
132    fn nearest_neighbor(&self) -> bool {
133        self.sampler.quality == ImageQuality::Low
134    }
135}
136
/// Conversion between premultiplied and unpremultiplied values, given matching alphas.
pub(crate) trait Premultiply {
    /// Returns `self` premultiplied by `alphas`.
    fn premultiply(self, alphas: Self) -> Self;
    /// Returns `self` with the premultiplication by `alphas` undone.
    fn unpremultiply(self, alphas: Self) -> Self;
}
141
142impl<S: Simd> Premultiply for f32x4<S> {
143    #[inline(always)]
144    fn premultiply(self, alphas: Self) -> Self {
145        self * alphas
146    }
147
148    #[inline(always)]
149    fn unpremultiply(self, alphas: Self) -> Self {
150        let zero = Self::splat(alphas.simd, 0.0);
151        let divided = self / alphas;
152
153        self.simd
154            .select_f32x4(self.simd.simd_eq_f32x4(alphas, zero), zero, divided)
155    }
156}
vello_cpu/util.rs

vello_cpu/
util.rs