// vello_cpu/fine/highp/mod.rs
1// Copyright 2025 the Vello Authors
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! High-precision (f32) rendering kernel implementation.
5//!
6//! This module implements the fine rasterization stage using 32-bit floating-point
7//! values for color components. This provides maximum precision and color accuracy,
8//! at the cost of higher memory bandwidth and potentially slower performance compared
9//! to the low-precision u8 kernel.
10//!
11//! The f32 kernel is particularly useful for:
12//! - Scenes requiring high precision (e.g., gradients with subtle color transitions)
13//! - Debugging and reference implementations
14//! - Platforms where SIMD f32 operations are well-optimized
15
16use crate::filter::filter_highp;
17use crate::fine::FineKernel;
18use crate::fine::{COLOR_COMPONENTS, Painter};
19use crate::layer_manager::LayerManager;
20use crate::peniko::BlendMode;
21use crate::region::Region;
22use vello_common::fearless_simd::*;
23use vello_common::filter_effects::Filter;
24use vello_common::kurbo::Affine;
25use vello_common::mask::Mask;
26use vello_common::paint::PremulColor;
27use vello_common::pixmap::Pixmap;
28use vello_common::tile::Tile;
29
30pub(crate) mod blend;
31pub(crate) mod compose;
32
/// The kernel for doing rendering using f32.
///
/// Zero-sized marker type; all behavior lives in its [`FineKernel`]
/// implementation below, which stores each color component as an f32
/// normalized to [0.0, 1.0].
#[derive(Clone, Copy, Debug)]
pub struct F32Kernel;
36
impl<S: Simd> FineKernel<S> for F32Kernel {
    /// One f32 per color component, normalized to [0.0, 1.0].
    type Numeric = f32;
    /// Compositing granularity: 16 f32 lanes, i.e. 4 RGBA pixels at a time.
    type Composite = f32x16<S>;
    /// Same 16-lane layout, used for mask/alpha vectors.
    type NumericVec = f32x16<S>;

    /// Extracts RGBA color components from a premultiplied color as f32 values [0.0, 1.0].
    #[inline(always)]
    fn extract_color(color: PremulColor) -> [Self::Numeric; 4] {
        color.as_premul_f32().components
    }

    /// Copies rendered pixels from the scratch buffer to the output region.
    ///
    /// Converts f32 color values in [0.0, 1.0] range to u8 [0, 255] with rounding,
    /// transforming from column-major scratch buffer to row-major region layout.
    #[inline(always)]
    fn pack(simd: S, region: &mut Region<'_>, blend_buf: &[Self::Numeric]) {
        simd.vectorize(
            #[inline(always)]
            || {
                for y in 0..Tile::HEIGHT {
                    for (x, pixel) in region
                        .row_mut(y)
                        .chunks_exact_mut(COLOR_COMPONENTS)
                        .enumerate()
                    {
                        // The scratch buffer is column-major: each column stores
                        // `Tile::HEIGHT` RGBA pixels contiguously.
                        let idx =
                            COLOR_COMPONENTS * (usize::from(Tile::HEIGHT) * x + usize::from(y));
                        let start = &blend_buf[idx..];
                        // Convert f32 [0.0, 1.0] to u8 [0, 255] with rounding.
                        // Rust's float-to-int `as` cast saturates, so values
                        // outside [0.0, 1.0] clamp to 0/255 rather than wrap.
                        // TODO: Use explicit SIMD for better performance
                        let converted = [
                            (start[0] * 255.0 + 0.5) as u8,
                            (start[1] * 255.0 + 0.5) as u8,
                            (start[2] * 255.0 + 0.5) as u8,
                            (start[3] * 255.0 + 0.5) as u8,
                        ];
                        pixel.copy_from_slice(&converted);
                    }
                }
            },
        );
    }

    /// Copies pixels from the output region to the scratch buffer.
    ///
    /// Converts u8 color values [0, 255] to normalized f32 [0.0, 1.0],
    /// transforming from row-major region layout to column-major scratch buffer.
    /// This is the inverse operation of `pack`.
    #[inline(always)]
    fn unpack(simd: S, region: &mut Region<'_>, blend_buf: &mut [Self::Numeric]) {
        simd.vectorize(
            #[inline(always)]
            || {
                for y in 0..Tile::HEIGHT {
                    for (x, pixel) in region.row_mut(y).chunks_exact(COLOR_COMPONENTS).enumerate() {
                        // Same column-major index as in `pack`.
                        let idx =
                            COLOR_COMPONENTS * (usize::from(Tile::HEIGHT) * x + usize::from(y));
                        let start = &mut blend_buf[idx..];
                        // Convert u8 [0, 255] to normalized f32 [0.0, 1.0] (reverse of pack)
                        start[0] = pixel[0] as f32 / 255.0;
                        start[1] = pixel[1] as f32 / 255.0;
                        start[2] = pixel[2] as f32 / 255.0;
                        start[3] = pixel[3] as f32 / 255.0;
                    }
                }
            },
        );
    }

    /// Applies a filter effect to a rendered layer.
    ///
    /// Delegates to the f32-specific filter implementation.
    fn filter_layer(
        pixmap: &mut Pixmap,
        filter: &Filter,
        layer_manager: &mut LayerManager,
        transform: Affine,
    ) {
        filter_highp(filter, pixmap, layer_manager, transform);
    }

    /// Fills a buffer with a solid color using SIMD operations.
    ///
    /// Efficiently broadcasts a single RGBA color across all pixels in the
    /// destination. `dest.len()` is expected to be a multiple of 16; any
    /// trailing remainder is left untouched by `chunks_exact_mut`.
    #[inline(never)]
    fn copy_solid(simd: S, dest: &mut [Self::Numeric], src: [Self::Numeric; 4]) {
        simd.vectorize(
            #[inline(always)]
            || {
                // Replicate the 4-component color across all 16 lanes.
                let color = f32x16::block_splat(src.simd_into(simd));

                for el in dest.chunks_exact_mut(16) {
                    el.copy_from_slice(color.as_slice());
                }
            },
        );
    }

    /// Applies per-pixel mask values to a buffer by multiplying each component.
    ///
    /// Used for anti-aliasing and clipping effects. Each pixel is multiplied by
    /// its corresponding mask value (already normalized to [0.0, 1.0]).
    /// The `src` iterator must yield at least one vector per 16-element chunk
    /// of `dest` — `unwrap` below relies on that.
    fn apply_mask(
        simd: S,
        dest: &mut [Self::Numeric],
        mut src: impl Iterator<Item = Self::NumericVec>,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                for el in dest.chunks_exact_mut(16) {
                    let loaded = f32x16::from_slice(simd, el);
                    let mulled = loaded * src.next().unwrap();
                    el.copy_from_slice(mulled.as_slice());
                }
            },
        );
    }

    /// Applies a painter's output to the destination buffer.
    ///
    /// Delegates to the painter's f32-specific implementation.
    #[inline(always)]
    fn apply_painter<'a>(_: S, dest: &mut [Self::Numeric], mut painter: impl Painter + 'a) {
        painter.paint_f32(dest);
    }

    /// Composites a solid color onto a buffer using alpha blending.
    ///
    /// Dispatches to either the masked or unmasked implementation based on the
    /// presence of per-pixel alpha masks.
    #[inline(always)]
    fn alpha_composite_solid(
        simd: S,
        dest: &mut [Self::Numeric],
        src: [Self::Numeric; 4],
        alphas: Option<&[u8]>,
    ) {
        if let Some(alphas) = alphas {
            alpha_fill::alpha_composite_solid(
                simd,
                dest,
                src,
                // Regroup the flat alpha slice into [u8; 4] chunks, one mask
                // value per pixel of a 4-pixel tile column.
                bytemuck::cast_slice::<u8, [u8; 4]>(alphas).iter().copied(),
            );
        } else {
            fill::alpha_composite_solid(simd, dest, src);
        }
    }

    /// Composites a source buffer onto a destination buffer using alpha blending.
    ///
    /// Dispatches to either the masked or unmasked implementation based on the
    /// presence of per-pixel alpha masks. Each source pixel's alpha determines
    /// the blending amount.
    fn alpha_composite_buffer(
        simd: S,
        dest: &mut [Self::Numeric],
        src: &[Self::Numeric],
        alphas: Option<&[u8]>,
    ) {
        if let Some(alphas) = alphas {
            alpha_fill::alpha_composite_arbitrary(
                simd,
                dest,
                src.chunks_exact(16).map(|el| f32x16::from_slice(simd, el)),
                bytemuck::cast_slice::<u8, [u8; 4]>(alphas).iter().copied(),
            );
        } else {
            fill::alpha_composite_arbitrary(
                simd,
                dest,
                src.chunks_exact(16).map(|el| f32x16::from_slice(simd, el)),
            );
        }
    }

    /// Applies a blend mode to composite source pixels onto destination.
    ///
    /// Dispatches to either the masked or unmasked blend implementation.
    /// Handles both color mixing (multiply, screen, etc.) and compositing.
    fn blend(
        simd: S,
        dest: &mut [Self::Numeric],
        mut start_x: u16,
        start_y: u16,
        src: impl Iterator<Item = Self::Composite>,
        blend_mode: BlendMode,
        alphas: Option<&[u8]>,
        mask: Option<&Mask>,
    ) {
        // Regroup the flat anti-aliasing alpha slice into columns of 4 values,
        // matching the 4-pixel SIMD granularity.
        let alpha_iter = alphas.map(|a| bytemuck::cast_slice::<u8, [u8; 4]>(a).iter().copied());

        // Lazily sample the clip mask, one tile column (4 vertically adjacent
        // pixels) per `next()` call. `start_x` is moved into the closure and
        // advanced one column per call; `start_y` stays fixed for this strip.
        // NOTE(review): assumes `start_y + 3` does not overflow u16 — verify
        // that tile coordinates stay well below u16::MAX.
        let mask_iter = mask.map(|m| {
            let width = m.width();
            let height = m.height();

            core::iter::from_fn(move || {
                let samples = if start_x < width && start_y + 3 < height {
                    // All four samples in bounds, sample directly.
                    [
                        m.sample(start_x, start_y),
                        m.sample(start_x, start_y + 1),
                        m.sample(start_x, start_y + 2),
                        m.sample(start_x, start_y + 3),
                    ]
                } else {
                    // Fallback: bounds-check each sample individually;
                    // out-of-bounds samples default to 255 (fully opaque).
                    [
                        if start_x < width && start_y < height {
                            m.sample(start_x, start_y)
                        } else {
                            255
                        },
                        if start_x < width && start_y + 1 < height {
                            m.sample(start_x, start_y + 1)
                        } else {
                            255
                        },
                        if start_x < width && start_y + 2 < height {
                            m.sample(start_x, start_y + 2)
                        } else {
                            255
                        },
                        if start_x < width && start_y + 3 < height {
                            m.sample(start_x, start_y + 3)
                        } else {
                            255
                        },
                    ]
                };

                start_x += 1;

                Some(samples)
            })
        });

        match (alpha_iter, mask_iter) {
            (Some(alpha_iter), Some(mut mask_iter)) => {
                // Both masks present: combine them by multiplying per pixel;
                // `u8 * u8 / 255` keeps the product in [0, 255].
                let iter = alpha_iter.map(|a1| {
                    let a2 = mask_iter.next().unwrap();
                    [
                        ((a1[0] as u16 * a2[0] as u16) / 255) as u8,
                        ((a1[1] as u16 * a2[1] as u16) / 255) as u8,
                        ((a1[2] as u16 * a2[2] as u16) / 255) as u8,
                        ((a1[3] as u16 * a2[3] as u16) / 255) as u8,
                    ]
                });
                alpha_fill::blend(simd, dest, src, iter, blend_mode);
            }
            (None, Some(mask_iter)) => alpha_fill::blend(simd, dest, src, mask_iter, blend_mode),
            (Some(alpha_iter), None) => alpha_fill::blend(simd, dest, src, alpha_iter, blend_mode),
            (None, None) => {
                fill::blend(simd, dest, src, blend_mode);
            }
        }
    }
}
297
298mod fill {
299    //! Alpha compositing and blending operations without per-pixel alpha masks.
300    //!
301    //! This module handles the case where we're compositing full opacity pixels,
302    //! using only the source alpha channel for compositing.
303
304    use crate::fine::Splat4thExt;
305    use crate::fine::highp::blend;
306    use crate::fine::highp::compose::ComposeExt;
307    use crate::peniko::BlendMode;
308
309    use vello_common::fearless_simd::*;
310
311    // IMPORTANT: The inlining attributes (#[inline(always)], #[inline(never)]) in this
312    // module have been carefully tuned through benchmarking. Changing them can cause
313    // significant performance regressions.
314
315    /// Composites a solid color onto a buffer using alpha blending.
316    ///
317    /// Uses the "over" operator: `result = src + bg * (1 - src_alpha)`
318    #[inline(always)]
319    pub(super) fn alpha_composite_solid<S: Simd>(s: S, dest: &mut [f32], src: [f32; 4]) {
320        s.vectorize(
321            #[inline(always)]
322            || {
323                let one_minus_alpha = 1.0 - f32x16::block_splat(f32x4::splat(s, src[3]));
324                let src_c = f32x16::block_splat(f32x4::simd_from(src, s));
325
326                for next_dest in dest.chunks_exact_mut(16) {
327                    alpha_composite_inner(s, next_dest, src_c, one_minus_alpha);
328                }
329            },
330        );
331    }
332
333    /// Composites a buffer of colors onto another buffer using alpha blending.
334    ///
335    /// Each source pixel is composited individually based on its alpha channel.
336    #[inline(always)]
337    pub(super) fn alpha_composite_arbitrary<S: Simd, T: Iterator<Item = f32x16<S>>>(
338        simd: S,
339        dest: &mut [f32],
340        src: T,
341    ) {
342        simd.vectorize(
343            #[inline(always)]
344            || {
345                for (next_dest, next_src) in dest.chunks_exact_mut(16).zip(src) {
346                    let one_minus_alpha = 1.0 - next_src.splat_4th();
347                    alpha_composite_inner(simd, next_dest, next_src, one_minus_alpha);
348                }
349            },
350        );
351    }
352
353    /// Applies blend mode compositing to a buffer without per-pixel masks.
354    pub(super) fn blend<S: Simd, T: Iterator<Item = f32x16<S>>>(
355        simd: S,
356        dest: &mut [f32],
357        src: T,
358        blend_mode: BlendMode,
359    ) {
360        for (next_dest, next_src) in dest.chunks_exact_mut(16).zip(src) {
361            let bg_v = f32x16::from_slice(simd, next_dest);
362            let src_c = blend::mix(next_src, bg_v, blend_mode);
363            let res = blend_mode.compose(simd, src_c, bg_v, None);
364            next_dest.copy_from_slice(res.as_slice());
365        }
366    }
367
368    /// Performs the core alpha compositing calculation.
369    ///
370    /// Formula: `result = src + bg * (1 - src_alpha)`
371    /// This implements the Porter-Duff "source over" operator using FMA for efficiency.
372    #[inline(always)]
373    fn alpha_composite_inner<S: Simd>(
374        s: S,
375        dest: &mut [f32],
376        src: f32x16<S>,
377        one_minus_alpha: f32x16<S>,
378    ) {
379        let mut bg_c = f32x16::from_slice(s, dest);
380        bg_c = one_minus_alpha.madd(bg_c, src);
381        dest.copy_from_slice(bg_c.as_slice());
382    }
383}
384
385mod alpha_fill {
386    //! Alpha compositing and blending operations with per-pixel alpha masks.
387    //!
388    //! This module handles compositing when each pixel has an additional mask value
389    //! (e.g., from anti-aliasing or clip masks) that modulates the source alpha.
390
391    use crate::fine::Splat4thExt;
392    use crate::fine::highp::compose::ComposeExt;
393    use crate::fine::highp::{blend, extract_masks};
394    use crate::peniko::BlendMode;
395    use vello_common::fearless_simd::*;
396
397    /// Composites a solid color with per-pixel alpha masks.
398    ///
399    /// Combines source alpha with mask values: `effective_alpha = src_alpha * mask`
400    #[inline(always)]
401    pub(super) fn alpha_composite_solid<S: Simd>(
402        s: S,
403        dest: &mut [f32],
404        src: [f32; 4],
405        alphas: impl Iterator<Item = [u8; 4]>,
406    ) {
407        s.vectorize(
408            #[inline(always)]
409            || {
410                let src_a = f32x16::splat(s, src[3]);
411                let src_c = f32x16::block_splat(src.simd_into(s));
412                let one = f32x16::splat(s, 1.0);
413
414                for (next_dest, next_mask) in dest.chunks_exact_mut(16).zip(alphas) {
415                    alpha_composite_inner(s, next_dest, &next_mask, src_c, src_a, one);
416                }
417            },
418        );
419    }
420
421    /// Composites a buffer of colors with per-pixel alpha masks.
422    ///
423    /// Each pixel's source alpha is modulated by its corresponding mask value.
424    pub(super) fn alpha_composite_arbitrary<S: Simd, T: Iterator<Item = f32x16<S>>>(
425        simd: S,
426        dest: &mut [f32],
427        src: T,
428        alphas: impl Iterator<Item = [u8; 4]>,
429    ) {
430        simd.vectorize(
431            #[inline(always)]
432            || {
433                let one = f32x16::splat(simd, 1.0);
434
435                for ((next_dest, next_mask), next_src) in
436                    dest.chunks_exact_mut(16).zip(alphas).zip(src)
437                {
438                    let src_a = next_src.splat_4th();
439                    alpha_composite_inner(simd, next_dest, &next_mask, next_src, src_a, one);
440                }
441            },
442        );
443    }
444
445    /// Applies blend mode compositing with per-pixel alpha masks.
446    pub(super) fn blend<S: Simd, T: Iterator<Item = f32x16<S>>>(
447        simd: S,
448        dest: &mut [f32],
449        src: T,
450        alphas: impl Iterator<Item = [u8; 4]>,
451        blend_mode: BlendMode,
452    ) {
453        simd.vectorize(
454            #[inline(always)]
455            || {
456                for ((next_dest, next_mask), next_src) in
457                    dest.chunks_exact_mut(16).zip(alphas).zip(src)
458                {
459                    let masks = extract_masks(simd, &next_mask);
460                    let bg = f32x16::from_slice(simd, next_dest);
461                    let src_c = blend::mix(next_src, bg, blend_mode);
462                    let res = blend_mode.compose(simd, src_c, bg, Some(masks));
463                    next_dest.copy_from_slice(res.as_slice());
464                }
465            },
466        );
467    }
468
469    /// Performs alpha compositing with mask modulation.
470    ///
471    /// Formula: `result = src * mask + bg * (1 - src_alpha * mask)`
472    /// The mask value modulates both the source contribution and the inverse alpha.
473    /// Uses FMA instructions for optimal performance.
474    #[inline(always)]
475    fn alpha_composite_inner<S: Simd>(
476        s: S,
477        dest: &mut [f32],
478        masks: &[u8; 4],
479        src_c: f32x16<S>,
480        src_a: f32x16<S>,
481        one: f32x16<S>,
482    ) {
483        let bg_c = f32x16::from_slice(s, dest);
484        let mask_a = extract_masks(s, masks);
485        // 1 - src_a * mask_a
486        let inv_src_a_mask_a = src_a.madd(-mask_a, one);
487
488        let res = bg_c.madd(inv_src_a_mask_a, src_c * mask_a);
489        dest.copy_from_slice(res.as_slice());
490    }
491}
492
493/// Expands 4 mask bytes into a 16-element f32 SIMD vector with normalized values.
494///
495/// Converts u8 mask values to f32 in range [0.0, 1.0], then duplicates each mask
496/// value across 4 consecutive elements (one per color component).
497///
498/// Input: [m0, m1, m2, m3] (as u8, 0-255)
499/// Output: [m0/255, m0/255, m0/255, m0/255, m1/255, ..., m3/255] (as f32, 16 elements)
500#[inline(always)]
501fn extract_masks<S: Simd>(simd: S, masks: &[u8; 4]) -> f32x16<S> {
502    let mut base_mask = [
503        masks[0] as f32,
504        masks[1] as f32,
505        masks[2] as f32,
506        masks[3] as f32,
507    ]
508    .simd_into(simd);
509
510    base_mask *= f32x4::splat(simd, 1.0 / 255.0);
511
512    let res = f32x16::block_splat(base_mask);
513    let zip_low = res.zip_low(res);
514
515    zip_low.zip_low(zip_low)
516}