// vello_cpu/fine/highp/mod.rs
1// Copyright 2025 the Vello Authors
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! High-precision (f32) rendering kernel implementation.
5//!
6//! This module implements the fine rasterization stage using 32-bit floating-point
7//! values for color components. This provides maximum precision and color accuracy,
8//! at the cost of higher memory bandwidth and potentially slower performance compared
9//! to the low-precision u8 kernel.
10//!
11//! The f32 kernel is particularly useful for:
12//! - Scenes requiring high precision (e.g., gradients with subtle color transitions)
13//! - Debugging and reference implementations
14//! - Platforms where SIMD f32 operations are well-optimized
15
16use crate::filter::filter_highp;
17use crate::fine::FineKernel;
18use crate::fine::{COLOR_COMPONENTS, Painter};
19use crate::layer_manager::LayerManager;
20use crate::peniko::BlendMode;
21use crate::region::Region;
22use vello_common::fearless_simd::*;
23use vello_common::filter_effects::Filter;
24use vello_common::kurbo::Affine;
25use vello_common::mask::Mask;
26use vello_common::paint::PremulColor;
27use vello_common::pixmap::Pixmap;
28use vello_common::tile::Tile;
29
30pub(crate) mod blend;
31pub(crate) mod compose;
32
/// The kernel for doing rendering using f32.
///
/// Zero-sized marker type; all behavior lives in its [`FineKernel`]
/// implementation below, which stores each color component as an f32
/// normalized to [0.0, 1.0].
#[derive(Clone, Copy, Debug)]
pub struct F32Kernel;
36
impl<S: Simd> FineKernel<S> for F32Kernel {
    /// One f32 per color component, normalized to [0.0, 1.0].
    type Numeric = f32;
    /// Compositing granularity: 16 f32 lanes, i.e. 4 RGBA pixels at a time.
    type Composite = f32x16<S>;
    /// Same 16-lane layout, used for mask/alpha vectors.
    type NumericVec = f32x16<S>;

    /// Extracts RGBA color components from a premultiplied color as f32 values [0.0, 1.0].
    #[inline(always)]
    fn extract_color(color: PremulColor) -> [Self::Numeric; 4] {
        color.as_premul_f32().components
    }

    /// Copies rendered pixels from the scratch buffer to the output region.
    ///
    /// Converts f32 color values in [0.0, 1.0] range to u8 [0, 255] with rounding,
    /// transforming from column-major scratch buffer to row-major region layout.
    #[inline(always)]
    fn pack(simd: S, region: &mut Region<'_>, blend_buf: &[Self::Numeric]) {
        simd.vectorize(
            #[inline(always)]
            || {
                for y in 0..Tile::HEIGHT {
                    for (x, pixel) in region
                        .row_mut(y)
                        .chunks_exact_mut(COLOR_COMPONENTS)
                        .enumerate()
                    {
                        // The scratch buffer is column-major: each column stores
                        // `Tile::HEIGHT` RGBA pixels contiguously.
                        let idx =
                            COLOR_COMPONENTS * (usize::from(Tile::HEIGHT) * x + usize::from(y));
                        let start = &blend_buf[idx..];
                        // Convert f32 [0.0, 1.0] to u8 [0, 255] with rounding.
                        // Rust's float-to-int `as` cast saturates, so values
                        // outside [0.0, 1.0] clamp to 0/255 rather than wrap.
                        // TODO: Use explicit SIMD for better performance
                        let converted = [
                            (start[0] * 255.0 + 0.5) as u8,
                            (start[1] * 255.0 + 0.5) as u8,
                            (start[2] * 255.0 + 0.5) as u8,
                            (start[3] * 255.0 + 0.5) as u8,
                        ];
                        pixel.copy_from_slice(&converted);
                    }
                }
            },
        );
    }

    /// Copies pixels from the output region to the scratch buffer.
    ///
    /// Converts u8 color values [0, 255] to normalized f32 [0.0, 1.0],
    /// transforming from row-major region layout to column-major scratch buffer.
    /// This is the inverse operation of `pack`.
    #[inline(always)]
    fn unpack(simd: S, region: &mut Region<'_>, blend_buf: &mut [Self::Numeric]) {
        simd.vectorize(
            #[inline(always)]
            || {
                for y in 0..Tile::HEIGHT {
                    for (x, pixel) in region.row_mut(y).chunks_exact(COLOR_COMPONENTS).enumerate() {
                        // Same column-major index as in `pack`.
                        let idx =
                            COLOR_COMPONENTS * (usize::from(Tile::HEIGHT) * x + usize::from(y));
                        let start = &mut blend_buf[idx..];
                        // Convert u8 [0, 255] to normalized f32 [0.0, 1.0] (reverse of pack)
                        start[0] = pixel[0] as f32 / 255.0;
                        start[1] = pixel[1] as f32 / 255.0;
                        start[2] = pixel[2] as f32 / 255.0;
                        start[3] = pixel[3] as f32 / 255.0;
                    }
                }
            },
        );
    }

    /// Applies a filter effect to a rendered layer.
    ///
    /// Delegates to the f32-specific filter implementation.
    fn filter_layer(
        pixmap: &mut Pixmap,
        filter: &Filter,
        layer_manager: &mut LayerManager,
        transform: Affine,
    ) {
        filter_highp(filter, pixmap, layer_manager, transform);
    }

    /// Fills a buffer with a solid color using SIMD operations.
    ///
    /// Efficiently broadcasts a single RGBA color across all pixels in the
    /// destination. `dest.len()` is expected to be a multiple of 16; any
    /// trailing remainder is left untouched by `chunks_exact_mut`.
    #[inline(never)]
    fn copy_solid(simd: S, dest: &mut [Self::Numeric], src: [Self::Numeric; 4]) {
        simd.vectorize(
            #[inline(always)]
            || {
                // Replicate the 4-component color across all 16 lanes.
                let color = f32x16::block_splat(src.simd_into(simd));

                for el in dest.chunks_exact_mut(16) {
                    el.copy_from_slice(color.as_slice());
                }
            },
        );
    }

    /// Applies per-pixel mask values to a buffer by multiplying each component.
    ///
    /// Used for anti-aliasing and clipping effects. Each pixel is multiplied by
    /// its corresponding mask value (already normalized to [0.0, 1.0]).
    /// The `src` iterator must yield at least one vector per 16-element chunk
    /// of `dest` — `unwrap` below relies on that.
    fn apply_mask(
        simd: S,
        dest: &mut [Self::Numeric],
        mut src: impl Iterator<Item = Self::NumericVec>,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                for el in dest.chunks_exact_mut(16) {
                    let loaded = f32x16::from_slice(simd, el);
                    let mulled = loaded * src.next().unwrap();
                    el.copy_from_slice(mulled.as_slice());
                }
            },
        );
    }

    /// Applies a painter's output to the destination buffer.
    ///
    /// Delegates to the painter's f32-specific implementation.
    #[inline(always)]
    fn apply_painter<'a>(_: S, dest: &mut [Self::Numeric], mut painter: impl Painter + 'a) {
        painter.paint_f32(dest);
    }

    /// Composites a solid color onto a buffer using alpha blending.
    ///
    /// Dispatches to either the masked or unmasked implementation based on the
    /// presence of per-pixel alpha masks.
    #[inline(always)]
    fn alpha_composite_solid(
        simd: S,
        dest: &mut [Self::Numeric],
        src: [Self::Numeric; 4],
        alphas: Option<&[u8]>,
    ) {
        if let Some(alphas) = alphas {
            alpha_fill::alpha_composite_solid(
                simd,
                dest,
                src,
                // Regroup the flat alpha slice into [u8; 4] chunks, one mask
                // value per pixel of a 4-pixel tile column.
                bytemuck::cast_slice::<u8, [u8; 4]>(alphas).iter().copied(),
            );
        } else {
            fill::alpha_composite_solid(simd, dest, src);
        }
    }

    /// Composites a source buffer onto a destination buffer using alpha blending.
    ///
    /// Dispatches to either the masked or unmasked implementation based on the
    /// presence of per-pixel alpha masks. Each source pixel's alpha determines
    /// the blending amount.
    fn alpha_composite_buffer(
        simd: S,
        dest: &mut [Self::Numeric],
        src: &[Self::Numeric],
        alphas: Option<&[u8]>,
    ) {
        if let Some(alphas) = alphas {
            alpha_fill::alpha_composite_arbitrary(
                simd,
                dest,
                src.chunks_exact(16).map(|el| f32x16::from_slice(simd, el)),
                bytemuck::cast_slice::<u8, [u8; 4]>(alphas).iter().copied(),
            );
        } else {
            fill::alpha_composite_arbitrary(
                simd,
                dest,
                src.chunks_exact(16).map(|el| f32x16::from_slice(simd, el)),
            );
        }
    }

    /// Applies a blend mode to composite source pixels onto destination.
    ///
    /// Dispatches to either the masked or unmasked blend implementation.
    /// Handles both color mixing (multiply, screen, etc.) and compositing.
    fn blend(
        simd: S,
        dest: &mut [Self::Numeric],
        mut start_x: u16,
        start_y: u16,
        src: impl Iterator<Item = Self::Composite>,
        blend_mode: BlendMode,
        alphas: Option<&[u8]>,
        mask: Option<&Mask>,
    ) {
        // Regroup the flat anti-aliasing alpha slice into columns of 4 values,
        // matching the 4-pixel SIMD granularity.
        let alpha_iter = alphas.map(|a| bytemuck::cast_slice::<u8, [u8; 4]>(a).iter().copied());

        // Lazily sample the clip mask, one tile column (4 vertically adjacent
        // pixels) per `next()` call. `start_x` is moved into the closure and
        // advanced one column per call; `start_y` stays fixed for this strip.
        // NOTE(review): assumes `start_y + 3` does not overflow u16 — verify
        // that tile coordinates stay well below u16::MAX.
        let mask_iter = mask.map(|m| {
            let width = m.width();
            let height = m.height();

            core::iter::from_fn(move || {
                let samples = if start_x < width && start_y + 3 < height {
                    // All four samples in bounds, sample directly.
                    [
                        m.sample(start_x, start_y),
                        m.sample(start_x, start_y + 1),
                        m.sample(start_x, start_y + 2),
                        m.sample(start_x, start_y + 3),
                    ]
                } else {
                    // Fallback: bounds-check each sample individually;
                    // out-of-bounds samples default to 255 (fully opaque).
                    [
                        if start_x < width && start_y < height {
                            m.sample(start_x, start_y)
                        } else {
                            255
                        },
                        if start_x < width && start_y + 1 < height {
                            m.sample(start_x, start_y + 1)
                        } else {
                            255
                        },
                        if start_x < width && start_y + 2 < height {
                            m.sample(start_x, start_y + 2)
                        } else {
                            255
                        },
                        if start_x < width && start_y + 3 < height {
                            m.sample(start_x, start_y + 3)
                        } else {
                            255
                        },
                    ]
                };

                start_x += 1;

                Some(samples)
            })
        });

        match (alpha_iter, mask_iter) {
            (Some(alpha_iter), Some(mut mask_iter)) => {
                // Both masks present: combine them by multiplying per pixel;
                // `u8 * u8 / 255` keeps the product in [0, 255].
                let iter = alpha_iter.map(|a1| {
                    let a2 = mask_iter.next().unwrap();
                    [
                        ((a1[0] as u16 * a2[0] as u16) / 255) as u8,
                        ((a1[1] as u16 * a2[1] as u16) / 255) as u8,
                        ((a1[2] as u16 * a2[2] as u16) / 255) as u8,
                        ((a1[3] as u16 * a2[3] as u16) / 255) as u8,
                    ]
                });
                alpha_fill::blend(simd, dest, src, iter, blend_mode);
            }
            (None, Some(mask_iter)) => alpha_fill::blend(simd, dest, src, mask_iter, blend_mode),
            (Some(alpha_iter), None) => alpha_fill::blend(simd, dest, src, alpha_iter, blend_mode),
            (None, None) => {
                fill::blend(simd, dest, src, blend_mode);
            }
        }
    }
}
297
298mod fill {
299    //! Alpha compositing and blending operations without per-pixel alpha masks.
300    //!
301    //! This module handles the case where we're compositing full opacity pixels,
302    //! using only the source alpha channel for compositing.
303
304    use crate::fine::Splat4thExt;
305    use crate::fine::highp::blend;
306    use crate::fine::highp::compose::ComposeExt;
307    use crate::peniko::BlendMode;
308
309    use vello_common::fearless_simd::*;
310
311    // IMPORTANT: The inlining attributes (#[inline(always)], #[inline(never)]) in this
312    // module have been carefully tuned through benchmarking. Changing them can cause
313    // significant performance regressions.
314
315    /// Composites a solid color onto a buffer using alpha blending.
316    ///
317    /// Uses the "over" operator: `result = src + bg * (1 - src_alpha)`
318    #[inline(always)]
319    pub(super) fn alpha_composite_solid<S: Simd>(s: S, dest: &mut [f32], src: [f32; 4]) {
320        s.vectorize(
321            #[inline(always)]
322            || {
323                let one_minus_alpha = 1.0 - f32x16::block_splat(f32x4::splat(s, src[3]));
324                let src_c = f32x16::block_splat(f32x4::simd_from(src, s));
325
326                for next_dest in dest.chunks_exact_mut(16) {
327                    alpha_composite_inner(s, next_dest, src_c, one_minus_alpha);
328                }
329            },
330        );
331    }
332
333    /// Composites a buffer of colors onto another buffer using alpha blending.
334    ///
335    /// Each source pixel is composited individually based on its alpha channel.
336    #[inline(always)]
337    pub(super) fn alpha_composite_arbitrary<S: Simd, T: Iterator<Item = f32x16<S>>>(
338        simd: S,
339        dest: &mut [f32],
340        src: T,
341    ) {
342        simd.vectorize(
343            #[inline(always)]
344            || {
345                for (next_dest, next_src) in dest.chunks_exact_mut(16).zip(src) {
346                    let one_minus_alpha = 1.0 - next_src.splat_4th();
347                    alpha_composite_inner(simd, next_dest, next_src, one_minus_alpha);
348                }
349            },
350        );
351    }
352
353    /// Applies blend mode compositing to a buffer without per-pixel masks.
354    pub(super) fn blend<S: Simd, T: Iterator<Item = f32x16<S>>>(
355        simd: S,
356        dest: &mut [f32],
357        src: T,
358        blend_mode: BlendMode,
359    ) {
360        for (next_dest, next_src) in dest.chunks_exact_mut(16).zip(src) {
361            let bg_v = f32x16::from_slice(simd, next_dest);
362            let src_c = blend::mix(next_src, bg_v, blend_mode);
363            let res = blend_mode.compose(simd, src_c, bg_v, None);
364            next_dest.copy_from_slice(res.as_slice());
365        }
366    }
367
368    /// Performs the core alpha compositing calculation.
369    ///
370    /// Formula: `result = src + bg * (1 - src_alpha)`
371    /// This implements the Porter-Duff "source over" operator using FMA for efficiency.
372    #[inline(always)]
373    fn alpha_composite_inner<S: Simd>(
374        s: S,
375        dest: &mut [f32],
376        src: f32x16<S>,
377        one_minus_alpha: f32x16<S>,
378    ) {
379        let mut bg_c = f32x16::from_slice(s, dest);
380        bg_c = one_minus_alpha.madd(bg_c, src);
381        dest.copy_from_slice(bg_c.as_slice());
382    }
383}
384
385mod alpha_fill {
386    //! Alpha compositing and blending operations with per-pixel alpha masks.
387    //!
388    //! This module handles compositing when each pixel has an additional mask value
389    //! (e.g., from anti-aliasing or clip masks) that modulates the source alpha.
390
391    use crate::fine::Splat4thExt;
392    use crate::fine::highp::compose::ComposeExt;
393    use crate::fine::highp::{blend, extract_masks};
394    use crate::peniko::BlendMode;
395    use vello_common::fearless_simd::*;
396
397    /// Composites a solid color with per-pixel alpha masks.
398    ///
399    /// Combines source alpha with mask values: `effective_alpha = src_alpha * mask`
400    #[inline(always)]
401    pub(super) fn alpha_composite_solid<S: Simd>(
402        s: S,
403        dest: &mut [f32],
404        src: [f32; 4],
405        alphas: impl Iterator<Item = [u8; 4]>,
406    ) {
407        s.vectorize(
408            #[inline(always)]
409            || {
410                let src_a = f32x16::splat(s, src[3]);
411                let src_c = f32x16::block_splat(src.simd_into(s));
412                let one = f32x16::splat(s, 1.0);
413
414                for (next_dest, next_mask) in dest.chunks_exact_mut(16).zip(alphas) {
415                    alpha_composite_inner(s, next_dest, &next_mask, src_c, src_a, one);
416                }
417            },
418        );
419    }
420
421    /// Composites a buffer of colors with per-pixel alpha masks.
422    ///
423    /// Each pixel's source alpha is modulated by its corresponding mask value.
424    pub(super) fn alpha_composite_arbitrary<S: Simd, T: Iterator<Item = f32x16<S>>>(
425        simd: S,
426        dest: &mut [f32],
427        src: T,
428        alphas: impl Iterator<Item = [u8; 4]>,
429    ) {
430        simd.vectorize(
431            #[inline(always)]
432            || {
433                let one = f32x16::splat(simd, 1.0);
434
435                for ((next_dest, next_mask), next_src) in
436                    dest.chunks_exact_mut(16).zip(alphas).zip(src)
437                {
438                    let src_a = next_src.splat_4th();
439                    alpha_composite_inner(simd, next_dest, &next_mask, next_src, src_a, one);
440                }
441            },
442        );
443    }
444
445    /// Applies blend mode compositing with per-pixel alpha masks.
446    pub(super) fn blend<S: Simd, T: Iterator<Item = f32x16<S>>>(
447        simd: S,
448        dest: &mut [f32],
449        src: T,
450        alphas: impl Iterator<Item = [u8; 4]>,
451        blend_mode: BlendMode,
452    ) {
453        simd.vectorize(
454            #[inline(always)]
455            || {
456                for ((next_dest, next_mask), next_src) in
457                    dest.chunks_exact_mut(16).zip(alphas).zip(src)
458                {
459                    let masks = extract_masks(simd, &next_mask);
460                    let bg = f32x16::from_slice(simd, next_dest);
461                    let src_c = blend::mix(next_src, bg, blend_mode);
462                    let res = blend_mode.compose(simd, src_c, bg, Some(masks));
463                    next_dest.copy_from_slice(res.as_slice());
464                }
465            },
466        );
467    }
468
469    /// Performs alpha compositing with mask modulation.
470    ///
471    /// Formula: `result = src * mask + bg * (1 - src_alpha * mask)`
472    /// The mask value modulates both the source contribution and the inverse alpha.
473    /// Uses FMA instructions for optimal performance.
474    #[inline(always)]
475    fn alpha_composite_inner<S: Simd>(
476        s: S,
477        dest: &mut [f32],
478        masks: &[u8; 4],
479        src_c: f32x16<S>,
480        src_a: f32x16<S>,
481        one: f32x16<S>,
482    ) {
483        let bg_c = f32x16::from_slice(s, dest);
484        let mask_a = extract_masks(s, masks);
485        // 1 - src_a * mask_a
486        let inv_src_a_mask_a = src_a.madd(-mask_a, one);
487
488        let res = bg_c.madd(inv_src_a_mask_a, src_c * mask_a);
489        dest.copy_from_slice(res.as_slice());
490    }
491}
492
493/// Expands 4 mask bytes into a 16-element f32 SIMD vector with normalized values.
494///
495/// Converts u8 mask values to f32 in range [0.0, 1.0], then duplicates each mask
496/// value across 4 consecutive elements (one per color component).
497///
498/// Input: [m0, m1, m2, m3] (as u8, 0-255)
499/// Output: [m0/255, m0/255, m0/255, m0/255, m1/255, ..., m3/255] (as f32, 16 elements)
500#[inline(always)]
501fn extract_masks<S: Simd>(simd: S, masks: &[u8; 4]) -> f32x16<S> {
502    let mut base_mask = [
503        masks[0] as f32,
504        masks[1] as f32,
505        masks[2] as f32,
506        masks[3] as f32,
507    ]
508    .simd_into(simd);
509
510    base_mask *= f32x4::splat(simd, 1.0 / 255.0);
511
512    let res = f32x16::block_splat(base_mask);
513    let zip_low = res.zip_low(res);
514
515    zip_low.zip_low(zip_low)
516}