Skip to main content

vello_cpu/fine/
mod.rs

// Copyright 2025 the Vello Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! Fine rasterization stage of the rendering pipeline.
//!
//! This module implements the fine rasterization phase, which processes tiles at the pixel level.
//! It supports both high-precision (f32) and low-precision (u8) rendering paths, along with
//! various paint types including solid colors, gradients, images, and blurred rounded rectangles.
9
10mod common;
11mod highp;
12mod lowp;
13
14use crate::fine::common::gradient::linear::SimdLinearKind;
15use crate::fine::common::gradient::radial::SimdRadialKind;
16use crate::fine::common::gradient::sweep::SimdSweepKind;
17use crate::fine::common::gradient::{GradientPainter, calculate_t_vals};
18use crate::fine::common::image::{FilteredImagePainter, NNImagePainter, PlainNNImagePainter};
19use crate::fine::common::rounded_blurred_rect::BlurredRoundedRectFiller;
20use crate::layer_manager::LayerManager;
21use crate::peniko::{BlendMode, ImageQuality};
22use crate::region::Region;
23use crate::util::EncodedImageExt;
24use alloc::vec;
25use alloc::vec::Vec;
26use core::fmt::Debug;
27use core::iter;
28use vello_common::coarse::{Cmd, CommandAttrs, WideTile};
29use vello_common::encode::{
30    EncodedBlurredRoundedRectangle, EncodedGradient, EncodedImage, EncodedKind, EncodedPaint,
31};
32use vello_common::fearless_simd::{
33    Bytes, Simd, SimdBase, SimdFloat, SimdInt, SimdInto, f32x4, f32x8, f32x16, u8x16, u8x32, u32x4,
34    u32x8,
35};
36use vello_common::filter_effects::Filter;
37use vello_common::kurbo::Affine;
38use vello_common::mask::Mask;
39use vello_common::paint::{ImageSource, Paint, PremulColor};
40use vello_common::pixmap::Pixmap;
41use vello_common::simd::Splat4thExt;
42use vello_common::tile::Tile;
43use vello_common::util::f32_to_u8;
44
45pub use highp::F32Kernel;
46pub use lowp::U8Kernel;
47
48/// Number of color components per pixel (RGBA).
49pub(crate) const COLOR_COMPONENTS: usize = 4;
50
51/// Number of color components in a single column of a tile (height * components).
52pub(crate) const TILE_HEIGHT_COMPONENTS: usize = Tile::HEIGHT as usize * COLOR_COMPONENTS;
53
54/// Size of the scratch buffer used for intermediate rendering operations.
55/// Sized to hold a full wide tile with all color components.
56pub const SCRATCH_BUF_SIZE: usize =
57    WideTile::WIDTH as usize * Tile::HEIGHT as usize * COLOR_COMPONENTS;
58
59/// Type alias for a scratch buffer that can hold a full wide tile's worth of data.
60pub type ScratchBuf<F> = [F; SCRATCH_BUF_SIZE];
61
/// Trait for numeric types used in fine rasterization.
///
/// This trait abstracts over `f32` and `u8` to allow the same rendering logic
/// to work with both high-precision (floating-point) and low-precision (integer)
/// representations. This enables performance optimizations while maintaining accuracy
/// where needed.
pub trait Numeric: Copy + Default + Clone + Debug + PartialEq + Send + Sync + 'static {
    /// The zero value for this numeric type (0.0 for f32, 0 for u8).
    const ZERO: Self;

    /// The maximum opacity value for this numeric type (1.0 for f32, 255 for u8).
    const ONE: Self;
}

/// High-precision representation: channel values are floats with `1.0` as full opacity.
impl Numeric for f32 {
    const ZERO: Self = 0.0;
    const ONE: Self = 1.0;
}

/// Low-precision representation: channel values are bytes with `255` as full opacity.
impl Numeric for u8 {
    const ZERO: Self = 0;
    const ONE: Self = 255;
}
85
86/// Trait for SIMD vector types that can convert between f32 and u8 representations.
87///
88/// This trait enables efficient batch conversions between different numeric representations
89/// during rendering operations, supporting both high-precision and low-precision rendering paths.
90pub trait NumericVec<S: Simd>: Copy + Clone + Send + Sync {
91    /// Convert from a SIMD vector of f32 values to this type.
92    fn from_f32(simd: S, val: f32x16<S>) -> Self;
93
94    /// Convert from a SIMD vector of u8 values to this type.
95    fn from_u8(simd: S, val: u8x16<S>) -> Self;
96}
97
98impl<S: Simd> NumericVec<S> for f32x16<S> {
99    #[inline(always)]
100    fn from_f32(_: S, val: Self) -> Self {
101        val
102    }
103
104    #[inline(always)]
105    fn from_u8(simd: S, val: u8x16<S>) -> Self {
106        let converted = u8_to_f32(val);
107        converted * Self::splat(simd, 1.0 / 255.0)
108    }
109}
110
111impl<S: Simd> NumericVec<S> for u8x16<S> {
112    #[inline(always)]
113    fn from_f32(simd: S, val: f32x16<S>) -> Self {
114        let v1 = f32x16::splat(simd, 255.0);
115        let v2 = f32x16::splat(simd, 0.5);
116        let mulled = val.madd(v1, v2);
117
118        f32_to_u8(mulled)
119    }
120
121    #[inline(always)]
122    fn from_u8(_: S, val: Self) -> Self {
123        val
124    }
125}
126
127/// Convert a SIMD vector of u8 values to f32 values.
128///
129/// This function efficiently converts 16 u8 values to their f32 equivalents using SIMD operations,
130/// preserving the values without normalization (i.e., 255 becomes 255.0, not 1.0).
131#[inline(always)]
132pub(crate) fn u8_to_f32<S: Simd>(val: u8x16<S>) -> f32x16<S> {
133    let simd = val.simd;
134    let zeroes = u8x16::splat(simd, 0);
135
136    let zip1 = simd.zip_high_u8x16(val, zeroes);
137    let zip2 = simd.zip_low_u8x16(val, zeroes);
138
139    let p1 = simd
140        .zip_low_u8x16(zip2, zeroes)
141        .bitcast::<u32x4<S>>()
142        .to_float::<f32x4<S>>();
143    let p2 = simd
144        .zip_high_u8x16(zip2, zeroes)
145        .bitcast::<u32x4<S>>()
146        .to_float::<f32x4<S>>();
147    let p3 = simd
148        .zip_low_u8x16(zip1, zeroes)
149        .bitcast::<u32x4<S>>()
150        .to_float::<f32x4<S>>();
151    let p4 = simd
152        .zip_high_u8x16(zip1, zeroes)
153        .bitcast::<u32x4<S>>()
154        .to_float::<f32x4<S>>();
155
156    simd.combine_f32x8(simd.combine_f32x4(p1, p2), simd.combine_f32x4(p3, p4))
157}
158
159/// Trait for SIMD vector types used in compositing and blending operations.
160///
161/// This trait abstracts over different SIMD vector widths (f32x16 for high-precision,
162/// u8x32 for low-precision) to enable efficient batch processing of pixel data during
163/// blending and compositing.
164pub trait CompositeType<N: Numeric, S: Simd>: Copy + Clone + Send + Sync {
165    /// The number of numeric values this composite type can hold.
166    const LENGTH: usize;
167
168    /// Load values from a slice into this composite type.
169    fn from_slice(simd: S, slice: &[N]) -> Self;
170
171    /// Create a composite type by repeating a single RGBA color across all elements.
172    fn from_color(simd: S, color: [N; 4]) -> Self;
173}
174
175impl<S: Simd> CompositeType<f32, S> for f32x16<S> {
176    const LENGTH: usize = 16;
177
178    #[inline(always)]
179    fn from_slice(simd: S, slice: &[f32]) -> Self {
180        <Self as SimdBase<_, _>>::from_slice(simd, slice)
181    }
182
183    #[inline(always)]
184    fn from_color(simd: S, color: [f32; 4]) -> Self {
185        Self::block_splat(f32x4::from_slice(simd, &color[..]))
186    }
187}
188
189impl<S: Simd> CompositeType<u8, S> for u8x32<S> {
190    const LENGTH: usize = 32;
191
192    #[inline(always)]
193    fn from_slice(simd: S, slice: &[u8]) -> Self {
194        <Self as SimdBase<_, _>>::from_slice(simd, slice)
195    }
196
197    #[inline(always)]
198    fn from_color(simd: S, color: [u8; 4]) -> Self {
199        u32x8::block_splat(u32x4::splat(simd, u32::from_ne_bytes(color))).to_bytes()
200    }
201}
202
203/// A kernel for performing fine rasterization.
204///
205/// This trait defines the interface for tile-level rendering operations, abstracting over
206/// different numeric precisions (f32 vs u8). Implementations provide the low-level pixel
207/// manipulation, blending, and painting operations needed to render tiles.
208///
209/// The two main implementations are:
210/// - [`F32Kernel`]: High-precision rendering using 32-bit floating-point values
211/// - [`U8Kernel`]: Low-precision rendering using 8-bit integer values
212pub trait FineKernel<S: Simd>: Send + Sync + 'static {
213    /// The basic underlying numerical type of the kernel (f32 or u8).
214    type Numeric: Numeric;
215
216    /// The SIMD composite type used for efficient batch blending and compositing operations.
217    type Composite: CompositeType<Self::Numeric, S>;
218
219    /// The SIMD vector type used for conversions between u8 and f32 representations.
220    type NumericVec: NumericVec<S>;
221
222    /// Extract and convert a premultiplied color to the kernel's numeric type.
223    ///
224    /// Converts RGBA components from the standard premultiplied color format to
225    /// the kernel's internal representation (e.g., 0.0-1.0 for f32, 0-255 for u8).
226    fn extract_color(color: PremulColor) -> [Self::Numeric; 4];
227
228    /// Pack the blend buffer contents into the output region.
229    ///
230    /// Converts from the internal scratch buffer format to the output tile format,
231    /// writing the results to the provided region.
232    fn pack(simd: S, region: &mut Region<'_>, blend_buf: &[Self::Numeric]);
233
234    /// Unpack the region contents back into the blend buffer.
235    ///
236    /// Performs the reverse of `pack`, reading pixel data from the tile region
237    /// and loading it into the scratch buffer for further processing.
238    fn unpack(simd: S, region: &mut Region<'_>, blend_buf: &mut [Self::Numeric]);
239
240    /// Apply a filter to a layer.
241    ///
242    /// This is used for applying filters to whole layers, which is necessary for
243    /// spatial filters (like blur) that need to access neighboring pixels. The filter
244    /// is applied in-place to the provided pixmap.
245    ///
246    /// The transform parameter is used to scale filter parameters based on the current
247    /// transformation matrix (e.g., zoom level), ensuring filters look consistent
248    /// regardless of scale.
249    fn filter_layer(
250        pixmap: &mut Pixmap,
251        filter: &Filter,
252        layer_manager: &mut LayerManager,
253        transform: Affine,
254    );
255
256    /// Fill the target buffer with a solid color.
257    ///
258    /// Efficiently replicates the given RGBA color across all pixels in the target buffer.
259    fn copy_solid(simd: S, target: &mut [Self::Numeric], color: [Self::Numeric; 4]);
260    /// Create a painter for rendering gradients.
261    ///
262    /// Returns a painter that can render linear, radial, or sweep gradients based on
263    /// pre-computed t values (gradient interpolation parameters).
264    fn gradient_painter<'a>(
265        simd: S,
266        gradient: &'a EncodedGradient,
267        t_vals: &'a [f32],
268    ) -> impl Painter + 'a {
269        simd.vectorize(
270            #[inline(always)]
271            || GradientPainter::new(simd, gradient, t_vals),
272        )
273    }
274
275    /// Create a painter for rendering gradients with undefined region support.
276    ///
277    /// Similar to `gradient_painter`, but with support for masking undefined locations
278    /// (used for radial gradients that may have mathematically undefined regions).
279    ///
280    /// This is intentionally a duplicate of the default [`FineKernel::gradient_painter`]
281    /// implementation--the `U8Kernel` overrides that method, but not this one.
282    fn gradient_painter_with_undefined<'a>(
283        simd: S,
284        gradient: &'a EncodedGradient,
285        t_vals: &'a [f32],
286    ) -> impl Painter + 'a {
287        simd.vectorize(
288            #[inline(always)]
289            || GradientPainter::new(simd, gradient, t_vals),
290        )
291    }
292    /// Create a painter for rendering axis-aligned nearest-neighbor images.
293    ///
294    /// Optimized painter for images with `Low` quality and no skewing component in their
295    /// transform. This is the fastest image rendering path.
296    fn plain_nn_image_painter<'a>(
297        simd: S,
298        image: &'a EncodedImage,
299        pixmap: &'a Pixmap,
300        start_x: u16,
301        start_y: u16,
302    ) -> impl Painter + 'a {
303        simd.vectorize(
304            #[inline(always)]
305            || PlainNNImagePainter::new(simd, image, pixmap, start_x, start_y),
306        )
307    }
308
309    /// Create a painter for rendering nearest-neighbor images with transforms.
310    ///
311    /// Similar to `plain_nn_image_painter`, but supports arbitrary affine transforms
312    /// including skewing and rotation.
313    fn nn_image_painter<'a>(
314        simd: S,
315        image: &'a EncodedImage,
316        pixmap: &'a Pixmap,
317        start_x: u16,
318        start_y: u16,
319    ) -> impl Painter + 'a {
320        simd.vectorize(
321            #[inline(always)]
322            || NNImagePainter::new(simd, image, pixmap, start_x, start_y),
323        )
324    }
325
326    /// Create a painter for rendering images with `Medium` quality filtering.
327    ///
328    /// Uses bilinear filtering for smoother appearance than nearest-neighbor.
329    fn medium_quality_image_painter<'a>(
330        simd: S,
331        image: &'a EncodedImage,
332        pixmap: &'a Pixmap,
333        start_x: u16,
334        start_y: u16,
335    ) -> impl Painter + 'a {
336        simd.vectorize(
337            #[inline(always)]
338            || FilteredImagePainter::new(simd, image, pixmap, start_x, start_y),
339        )
340    }
341
342    /// Create a painter for rendering axis-aligned images with `Medium` quality filtering.
343    ///
344    /// Optimized painter for images with bilinear filtering and no skewing component.
345    fn plain_medium_quality_image_painter<'a>(
346        simd: S,
347        image: &'a EncodedImage,
348        pixmap: &'a Pixmap,
349        start_x: u16,
350        start_y: u16,
351    ) -> impl Painter + 'a {
352        simd.vectorize(
353            #[inline(always)]
354            || FilteredImagePainter::new(simd, image, pixmap, start_x, start_y),
355        )
356    }
357
358    /// Create a painter for rendering images with `High` quality filtering.
359    ///
360    /// Uses high-quality filtering for the best visual appearance.
361    fn high_quality_image_painter<'a>(
362        simd: S,
363        image: &'a EncodedImage,
364        pixmap: &'a Pixmap,
365        start_x: u16,
366        start_y: u16,
367    ) -> impl Painter + 'a {
368        simd.vectorize(
369            #[inline(always)]
370            || FilteredImagePainter::new(simd, image, pixmap, start_x, start_y),
371        )
372    }
373
374    /// Create a painter for rendering blurred rounded rectangles.
375    ///
376    /// Efficiently renders rounded rectangles with gaussian blur applied,
377    /// computing the blur analytically rather than as a post-process.
378    fn blurred_rounded_rectangle_painter(
379        simd: S,
380        rect: &EncodedBlurredRoundedRectangle,
381        start_x: u16,
382        start_y: u16,
383    ) -> impl Painter {
384        simd.vectorize(
385            #[inline(always)]
386            || BlurredRoundedRectFiller::new(simd, rect, start_x, start_y),
387        )
388    }
389    /// Apply a mask to the destination buffer.
390    ///
391    /// Multiplies each pixel in the destination by the corresponding mask value,
392    /// effectively masking out or reducing the opacity of pixels.
393    fn apply_mask(simd: S, dest: &mut [Self::Numeric], src: impl Iterator<Item = Self::NumericVec>);
394
395    /// Apply a painter to render content into the destination buffer.
396    ///
397    /// Invokes the painter to generate pixel values and writes them to the destination.
398    fn apply_painter<'a>(simd: S, dest: &mut [Self::Numeric], painter: impl Painter + 'a);
399
400    /// Perform alpha compositing with a solid color over the target buffer.
401    ///
402    /// Blends a solid RGBA color over the existing contents using standard alpha compositing
403    /// (Porter-Duff source-over). Optionally applies additional per-pixel alpha values.
404    fn alpha_composite_solid(
405        simd: S,
406        target: &mut [Self::Numeric],
407        src: [Self::Numeric; 4],
408        alphas: Option<&[u8]>,
409    );
410
411    /// Perform alpha compositing with a source buffer over the destination buffer.
412    ///
413    /// Blends the source buffer contents over the destination using standard alpha compositing.
414    /// Optionally applies additional per-pixel alpha values.
415    fn alpha_composite_buffer(
416        simd: S,
417        dest: &mut [Self::Numeric],
418        src: &[Self::Numeric],
419        alphas: Option<&[u8]>,
420    );
421
422    /// Blend the source into the destination with a specified blend mode.
423    ///
424    /// Applies advanced blending operations (e.g., multiply, screen, overlay) as specified
425    /// by the blend mode. Optionally applies additional per-pixel alpha values.
426    fn blend(
427        simd: S,
428        dest: &mut [Self::Numeric],
429        start_x: u16,
430        start_y: u16,
431        src: impl Iterator<Item = Self::Composite>,
432        blend_mode: BlendMode,
433        alphas: Option<&[u8]>,
434        mask: Option<&Mask>,
435    );
436}
437
438/// Fine rasterizer for processing tiles at the pixel level.
439///
440/// This structure maintains the state and scratch buffers needed for tile-based rendering.
441/// It processes rendering commands and manages a stack of blend buffers for layer composition.
442#[derive(Debug)]
443pub struct Fine<S: Simd, T: FineKernel<S>> {
444    /// The (x, y) coordinates of the currently active wide tile being rendered.
445    pub(crate) wide_coords: (u16, u16),
446
447    /// Stack of blend buffers for managing layers and composition.
448    ///
449    /// Each layer pushes a new buffer onto this stack, and layers are composited
450    /// by popping and blending with the buffer below.
451    pub(crate) blend_buf: Vec<ScratchBuf<T::Numeric>>,
452
453    /// Intermediate buffer used by painters to store generated pixel data before compositing.
454    pub(crate) paint_buf: ScratchBuf<T::Numeric>,
455
456    /// Buffer for storing gradient interpolation parameters (t values).
457    ///
458    /// Gradients pre-compute these values for efficiency before color lookup.
459    pub(crate) f32_buf: Vec<f32>,
460
461    /// The SIMD context used for vectorized operations.
462    pub(crate) simd: S,
463}
464
465impl<S: Simd, T: FineKernel<S>> Fine<S, T> {
466    /// Create a new fine rasterizer with the given SIMD context.
467    ///
468    /// Initializes all scratch buffers and sets up the initial blend buffer.
469    pub fn new(simd: S) -> Self {
470        Self {
471            simd,
472            wide_coords: (0, 0),
473            blend_buf: vec![[T::Numeric::ZERO; SCRATCH_BUF_SIZE]],
474            f32_buf: vec![0.0; SCRATCH_BUF_SIZE / 4],
475            paint_buf: [T::Numeric::ZERO; SCRATCH_BUF_SIZE],
476        }
477    }
478
479    /// Set the coordinates of the wide tile currently being rendered.
480    ///
481    /// This is used by painters and other operations to compute absolute pixel positions.
482    pub fn set_coords(&mut self, x: u16, y: u16) {
483        self.wide_coords = (x, y);
484    }
485
486    /// Clear the current blend buffer to a solid color.
487    ///
488    /// This efficiently fills the entire buffer with the given premultiplied color.
489    pub fn clear(&mut self, premul_color: PremulColor) {
490        let converted_color = T::extract_color(premul_color);
491        let blend_buf = self.blend_buf.last_mut().unwrap();
492
493        T::copy_solid(self.simd, blend_buf, converted_color);
494    }
495
496    /// Writes the current blend buffer contents to the output region.
497    ///
498    /// This copies pixel data from the internal scratch buffer to the tile region,
499    /// converting the layout from the internal representation to the output format.
500    pub fn pack(&self, region: &mut Region<'_>) {
501        let blend_buf = self.blend_buf.last().unwrap();
502
503        T::pack(self.simd, region, blend_buf);
504    }
505
506    /// Reads the region contents back into the blend buffer.
507    ///
508    /// This copies pixel data from the tile region to the internal scratch buffer,
509    /// performing the reverse operation of `pack`. This is typically used when a layer
510    /// needs to be read back for further processing.
511    pub fn unpack(&mut self, region: &mut Region<'_>) {
512        let blend_buf = self.blend_buf.last_mut().unwrap();
513
514        T::unpack(self.simd, region, blend_buf);
515    }
516
517    /// Apply a filter to a layer.
518    ///
519    /// This applies the filter using the kernel's implementation, mutating the layer.
520    pub fn filter_layer(
521        &self,
522        pixmap: &mut Pixmap,
523        filter: &Filter,
524        layer_manager: &mut LayerManager,
525        transform: Affine,
526    ) {
527        T::filter_layer(pixmap, filter, layer_manager, transform);
528    }
529
530    /// Execute a rendering command on the current tile.
531    ///
532    /// This is the main dispatch method that processes different command types including
533    /// fills, clips, blends, filters, masks, and buffer operations.
534    pub(crate) fn run_cmd(
535        &mut self,
536        cmd: &Cmd,
537        alphas: &[u8],
538        paints: &[EncodedPaint],
539        attrs: &CommandAttrs,
540    ) {
541        match cmd {
542            Cmd::Fill(f) => {
543                let fill_attrs = &attrs.fill[f.attrs_idx as usize];
544                self.fill(
545                    usize::from(f.x),
546                    usize::from(f.width),
547                    &fill_attrs.paint,
548                    fill_attrs.blend_mode,
549                    paints,
550                    None,
551                    fill_attrs.mask.as_ref(),
552                );
553            }
554            Cmd::AlphaFill(s) => {
555                let fill_attrs = &attrs.fill[s.attrs_idx as usize];
556                let alpha_idx = fill_attrs.alpha_idx(s.alpha_offset) as usize;
557                self.fill(
558                    usize::from(s.x),
559                    usize::from(s.width),
560                    &fill_attrs.paint,
561                    fill_attrs.blend_mode,
562                    paints,
563                    Some(&alphas[alpha_idx..]),
564                    fill_attrs.mask.as_ref(),
565                );
566            }
567            Cmd::Filter(_filter, _) => {
568                // TODO: Apply non-spatial filters here; spatial filters need layer-level processing
569                //
570                // Spatial filters (e.g., Gaussian blur) need neighboring pixels and must be
571                // rendered to a pixmap for layer-level processing. Non-spatial effects (e.g.,
572                // color matrix, component transfer) can be processed here directly on the
573                // blend buffer per-pixel as wide commands.
574            }
575            Cmd::PushBuf(_layer_kind) => {
576                self.blend_buf.push([T::Numeric::ZERO; SCRATCH_BUF_SIZE]);
577            }
578            Cmd::PopBuf => {
579                self.blend_buf.pop();
580            }
581            Cmd::ClipFill(cf) => {
582                self.clip(cf.x as usize, cf.width as usize, None);
583            }
584            Cmd::ClipStrip(cs) => {
585                let clip_attrs = &attrs.clip[cs.attrs_idx as usize];
586                let alpha_idx = clip_attrs.alpha_idx(cs.alpha_offset) as usize;
587                self.clip(cs.x as usize, cs.width as usize, Some(&alphas[alpha_idx..]));
588            }
589            Cmd::Blend(b) => self.blend(*b),
590            Cmd::Mask(m) => {
591                let start_x = self.wide_coords.0 * WideTile::WIDTH;
592                let start_y = self.wide_coords.1 * Tile::HEIGHT;
593
594                let blend_buf = self.blend_buf.last_mut().unwrap();
595
596                let width = (blend_buf.len() / (Tile::HEIGHT as usize * COLOR_COMPONENTS)) as u16;
597                let y = start_y as u32 + u32x4::from_slice(self.simd, &[0, 1, 2, 3]);
598
599                let iter = (start_x..(start_x + width)).map(|x| {
600                    let x_in_range = x < m.width();
601
602                    macro_rules! sample {
603                        ($idx:expr) => {
604                            if x_in_range && (y[$idx] as u16) < m.height() {
605                                m.sample(x, y[$idx] as u16)
606                            } else {
607                                0
608                            }
609                        };
610                    }
611
612                    let s1 = sample!(0);
613                    let s2 = sample!(1);
614                    let s3 = sample!(2);
615                    let s4 = sample!(3);
616
617                    let samples = u8x16::from_slice(
618                        self.simd,
619                        &[
620                            s1, s1, s1, s1, s2, s2, s2, s2, s3, s3, s3, s3, s4, s4, s4, s4,
621                        ],
622                    );
623                    T::NumericVec::from_u8(self.simd, samples)
624                });
625
626                T::apply_mask(self.simd, blend_buf, iter);
627            }
628            Cmd::Opacity(o) => {
629                if *o != 1.0 {
630                    let blend_buf = self.blend_buf.last_mut().unwrap();
631
632                    T::apply_mask(
633                        self.simd,
634                        blend_buf,
635                        iter::repeat(T::NumericVec::from_f32(
636                            self.simd,
637                            f32x16::splat(self.simd, *o),
638                        )),
639                    );
640                }
641            }
642            Cmd::PushZeroClip(_) | Cmd::PopZeroClip => {
643                // These commands are handled by the dispatcher and should not reach fine rasterization
644                unreachable!();
645            }
646        }
647    }
648
649    /// Fill a horizontal strip within the current tile using the given paint.
650    ///
651    /// This is the core painting method that handles solid colors, gradients, images,
652    /// and blurred rounded rectangles. It applies the paint starting at the given x
653    /// coordinate with the specified width, using the provided blend mode.
654    ///
655    /// Note: For short strip segments, benchmarks showed that not inlining this method
656    /// leads to significantly worse performance.
657    pub fn fill(
658        &mut self,
659        x: usize,
660        width: usize,
661        fill: &Paint,
662        blend_mode: BlendMode,
663        encoded_paints: &[EncodedPaint],
664        alphas: Option<&[u8]>,
665        mask: Option<&Mask>,
666    ) {
667        let blend_buf = &mut self.blend_buf.last_mut().unwrap()[x * TILE_HEIGHT_COMPONENTS..]
668            [..TILE_HEIGHT_COMPONENTS * width];
669        let default_blend = blend_mode == BlendMode::default();
670
671        match fill {
672            Paint::Solid(color) => {
673                let color = T::extract_color(*color);
674
675                // If color is completely opaque, we can just directly override
676                // the blend buffer.
677                if color[3] == T::Numeric::ONE
678                    && default_blend
679                    && alphas.is_none()
680                    && mask.is_none()
681                {
682                    T::copy_solid(self.simd, blend_buf, color);
683
684                    return;
685                }
686
687                if default_blend && mask.is_none() {
688                    T::alpha_composite_solid(self.simd, blend_buf, color, alphas);
689                } else {
690                    let start_x = self.wide_coords.0 * WideTile::WIDTH + x as u16;
691                    let start_y = self.wide_coords.1 * Tile::HEIGHT;
692
693                    T::blend(
694                        self.simd,
695                        blend_buf,
696                        start_x,
697                        start_y,
698                        iter::repeat(T::Composite::from_color(self.simd, color)),
699                        blend_mode,
700                        alphas,
701                        mask,
702                    );
703                }
704            }
705            Paint::Indexed(paint) => {
706                let color_buf = &mut self.paint_buf[x * TILE_HEIGHT_COMPONENTS..]
707                    [..TILE_HEIGHT_COMPONENTS * width];
708
709                let encoded_paint = &encoded_paints[paint.index()];
710
711                let start_x = self.wide_coords.0 * WideTile::WIDTH + x as u16;
712                let start_y = self.wide_coords.1 * Tile::HEIGHT;
713
714                // We need to have this as a macro because closures cannot take generic arguments, and
715                // we would have to repeatedly provide all arguments if we made it a function.
716                macro_rules! fill_complex_paint {
717                    ($may_have_opacities:expr, $filler:expr) => {
718                        if $may_have_opacities || alphas.is_some() {
719                            T::apply_painter(self.simd, color_buf, $filler);
720
721                            if default_blend && mask.is_none() {
722                                T::alpha_composite_buffer(self.simd, blend_buf, color_buf, alphas);
723                            } else {
724                                T::blend(
725                                    self.simd,
726                                    blend_buf,
727                                    start_x,
728                                    start_y,
729                                    color_buf
730                                        .chunks_exact(T::Composite::LENGTH)
731                                        .map(|s| T::Composite::from_slice(self.simd, s)),
732                                    blend_mode,
733                                    alphas,
734                                    mask,
735                                );
736                            }
737                        } else {
738                            // Similarly to solid colors we can just override the previous values
739                            // if all colors in the gradient are fully opaque.
740                            T::apply_painter(self.simd, blend_buf, $filler);
741                        }
742                    };
743                }
744
745                match encoded_paint {
746                    EncodedPaint::BlurredRoundedRect(b) => {
747                        fill_complex_paint!(
748                            true,
749                            T::blurred_rounded_rectangle_painter(self.simd, b, start_x, start_y)
750                        );
751                    }
752                    EncodedPaint::Gradient(g) => {
753                        // Note that we are calculating the t values first, store them in a separate
754                        // buffer and then pass that buffer to the iterator instead of calculating
755                        // the t values on the fly in the iterator. The latter would be faster, but
756                        // it would probably increase code size a lot, because the functions for
757                        // position calculation need to be inlined for good performance.
758                        let f32_buf = &mut self.f32_buf[..width * Tile::HEIGHT as usize];
759
760                        match &g.kind {
761                            EncodedKind::Linear(l) => {
762                                calculate_t_vals(
763                                    self.simd,
764                                    SimdLinearKind::new(self.simd, *l),
765                                    f32_buf,
766                                    g,
767                                    start_x,
768                                    start_y,
769                                );
770
771                                fill_complex_paint!(
772                                    g.may_have_opacities,
773                                    T::gradient_painter(self.simd, g, f32_buf)
774                                );
775                            }
776                            EncodedKind::Sweep(s) => {
777                                calculate_t_vals(
778                                    self.simd,
779                                    SimdSweepKind::new(self.simd, s),
780                                    f32_buf,
781                                    g,
782                                    start_x,
783                                    start_y,
784                                );
785
786                                fill_complex_paint!(
787                                    g.may_have_opacities,
788                                    T::gradient_painter(self.simd, g, f32_buf)
789                                );
790                            }
791                            EncodedKind::Radial(r) => {
792                                calculate_t_vals(
793                                    self.simd,
794                                    SimdRadialKind::new(self.simd, r),
795                                    f32_buf,
796                                    g,
797                                    start_x,
798                                    start_y,
799                                );
800
801                                if r.has_undefined() {
802                                    fill_complex_paint!(
803                                        g.may_have_opacities,
804                                        T::gradient_painter_with_undefined(self.simd, g, f32_buf)
805                                    );
806                                } else {
807                                    fill_complex_paint!(
808                                        g.may_have_opacities,
809                                        T::gradient_painter(self.simd, g, f32_buf)
810                                    );
811                                }
812                            }
813                        }
814                    }
815                    EncodedPaint::Image(i) => {
816                        let ImageSource::Pixmap(pixmap) = &i.source else {
817                            panic!("vello_cpu doesn't support the opaque image source.");
818                        };
819
820                        match (i.has_skew(), i.nearest_neighbor()) {
821                            (false, false) => {
822                                // Axis-aligned with filtering - use optimized plain painters
823                                if i.sampler.quality == ImageQuality::Medium {
824                                    fill_complex_paint!(
825                                        i.may_have_opacities,
826                                        T::plain_medium_quality_image_painter(
827                                            self.simd, i, pixmap, start_x, start_y
828                                        )
829                                    );
830                                } else {
831                                    fill_complex_paint!(
832                                        i.may_have_opacities,
833                                        T::high_quality_image_painter(
834                                            self.simd, i, pixmap, start_x, start_y
835                                        )
836                                    );
837                                }
838                            }
839                            (true, false) => {
840                                // Skewed with filtering - use generic filtered painters
841                                if i.sampler.quality == ImageQuality::Medium {
842                                    fill_complex_paint!(
843                                        i.may_have_opacities,
844                                        T::medium_quality_image_painter(
845                                            self.simd, i, pixmap, start_x, start_y
846                                        )
847                                    );
848                                } else {
849                                    fill_complex_paint!(
850                                        i.may_have_opacities,
851                                        T::high_quality_image_painter(
852                                            self.simd, i, pixmap, start_x, start_y
853                                        )
854                                    );
855                                }
856                            }
857                            (false, true) => {
858                                fill_complex_paint!(
859                                    i.may_have_opacities,
860                                    T::plain_nn_image_painter(
861                                        self.simd, i, pixmap, start_x, start_y
862                                    )
863                                );
864                            }
865                            (true, true) => {
866                                fill_complex_paint!(
867                                    i.may_have_opacities,
868                                    T::nn_image_painter(self.simd, i, pixmap, start_x, start_y)
869                                );
870                            }
871                        }
872                    }
873                }
874            }
875        }
876    }
877
878    /// Blend the top blend buffer into the buffer below it.
879    ///
880    /// This pops the top buffer from the blend stack and composites it onto the
881    /// buffer below using the specified blend mode. This is the core operation for
882    /// layer composition.
883    pub(crate) fn blend(&mut self, blend_mode: BlendMode) {
884        let (source_buffer, rest) = self.blend_buf.split_last_mut().unwrap();
885        let target_buffer = rest.last_mut().unwrap();
886
887        if blend_mode == BlendMode::default() {
888            T::alpha_composite_buffer(self.simd, target_buffer, source_buffer, None);
889        } else {
890            T::blend(
891                self.simd,
892                target_buffer,
893                // `start_x` and `start_y` are only needed to sample the correct position
894                // of a mask, so we can just pass dummy values here.
895                0,
896                0,
897                source_buffer
898                    .chunks_exact(T::Composite::LENGTH)
899                    .map(|s| T::Composite::from_slice(self.simd, s)),
900                blend_mode,
901                None,
902                None,
903            );
904        }
905    }
906
907    /// Apply a clipping mask from the top buffer to the buffer below.
908    ///
909    /// Uses the top buffer's alpha channel as a mask, multiplying it with the buffer
910    /// below. This implements clipping by masking out pixels outside the clip region.
911    fn clip(&mut self, x: usize, width: usize, alphas: Option<&[u8]>) {
912        let (source_buffer, rest) = self.blend_buf.split_last_mut().unwrap();
913        let target_buffer = rest.last_mut().unwrap();
914
915        let source_buffer =
916            &mut source_buffer[x * TILE_HEIGHT_COMPONENTS..][..TILE_HEIGHT_COMPONENTS * width];
917        let target_buffer =
918            &mut target_buffer[x * TILE_HEIGHT_COMPONENTS..][..TILE_HEIGHT_COMPONENTS * width];
919
920        T::alpha_composite_buffer(self.simd, target_buffer, source_buffer, alphas);
921    }
922}
923
/// A source of pixel data that can be written into either a `u8` or an `f32` buffer.
///
/// Painters hide where the pixels come from (gradients, images, and so on) behind a
/// common interface. Both methods are required: a painter that natively produces only
/// one numeric type still has to supply the other one, typically by converting its
/// native output.
pub trait Painter {
    /// Fill `buf` with pixel data as `u8` components (values in 0-255 range).
    fn paint_u8(&mut self, buf: &mut [u8]);

    /// Fill `buf` with pixel data as `f32` components (values in 0.0-1.0 range).
    fn paint_f32(&mut self, buf: &mut [f32]);
}
939
940/// Extension trait for creating position vectors for gradient and image sampling.
941///
942/// This trait provides a method to generate SIMD vectors of positions that advance
943/// correctly across a tile. It's used by painters to compute per-pixel coordinates
944/// for sampling operations.
945pub trait PosExt<S: Simd> {
946    /// Create a position vector that advances appropriately across a tile.
947    ///
948    /// Given a starting position and per-pixel advances in x and y directions,
949    /// generates a SIMD vector with the correct position for each element.
950    fn splat_pos(simd: S, pos: f32, x_advance: f32, y_advance: f32) -> Self;
951}
952
953impl<S: Simd> PosExt<S> for f32x4<S> {
954    #[inline(always)]
955    fn splat_pos(simd: S, pos: f32, _: f32, y_advance: f32) -> Self {
956        let columns: [f32; Tile::HEIGHT as usize] = [0.0, 1.0, 2.0, 3.0];
957        let column_mask: Self = columns.simd_into(simd);
958
959        column_mask.madd(Self::splat(simd, y_advance), Self::splat(simd, pos))
960    }
961}
962
963impl<S: Simd> PosExt<S> for f32x8<S> {
964    #[inline(always)]
965    fn splat_pos(simd: S, pos: f32, x_advance: f32, y_advance: f32) -> Self {
966        simd.combine_f32x4(
967            f32x4::splat_pos(simd, pos, x_advance, y_advance),
968            f32x4::splat_pos(simd, pos + x_advance, x_advance, y_advance),
969        )
970    }
971}
972
973/// Intermediate shader result with color channels stored separately for efficient processing.
974///
975/// This structure holds 8 pixels worth of data in planar format (separate R, G, B, A vectors).
976/// The planar layout is more efficient for certain SIMD operations before final interleaving.
977pub(crate) struct ShaderResultF32<S: Simd> {
978    /// Red channel values for 8 pixels.
979    pub(crate) r: f32x8<S>,
980    /// Green channel values for 8 pixels.
981    pub(crate) g: f32x8<S>,
982    /// Blue channel values for 8 pixels.
983    pub(crate) b: f32x8<S>,
984    /// Alpha channel values for 8 pixels.
985    pub(crate) a: f32x8<S>,
986}
987
988impl<S: Simd> ShaderResultF32<S> {
989    /// Convert from planar format to interleaved RGBA format.
990    ///
991    /// Returns two f32x16 vectors containing 8 pixels (4 RGBA components each)
992    /// with channels interleaved in the standard RGBA order.
993    #[inline(always)]
994    pub(crate) fn get(&self) -> (f32x16<S>, f32x16<S>) {
995        let (r_1, r_2) = self.r.simd.split_f32x8(self.r);
996        let (g_1, g_2) = self.g.simd.split_f32x8(self.g);
997        let (b_1, b_2) = self.b.simd.split_f32x8(self.b);
998        let (a_1, a_2) = self.a.simd.split_f32x8(self.a);
999
1000        let first = self.r.simd.combine_f32x8(
1001            self.r.simd.combine_f32x4(r_1, g_1),
1002            self.r.simd.combine_f32x4(b_1, a_1),
1003        );
1004
1005        let second = self.r.simd.combine_f32x8(
1006            self.r.simd.combine_f32x4(r_2, g_2),
1007            self.r.simd.combine_f32x4(b_2, a_2),
1008        );
1009
1010        (first, second)
1011    }
1012}
1013
mod macros {
    /// Generates a `Painter` impl for an iterator type that yields `f32x16` vectors.
    ///
    /// The target buffer is processed in groups of 16 elements, pulling one vector from
    /// the iterator per group. `paint_f32` copies the vector as-is, while `paint_u8`
    /// converts it to a `u8x16` first. Elements beyond the last full group of 16 are
    /// left untouched, and the generated methods panic if the iterator runs dry.
    macro_rules! f32x16_painter {
        ($($painter:tt)+) => {
            impl<S: Simd> crate::fine::Painter for $($painter)+ {
                fn paint_u8(&mut self, buf: &mut [u8]) {
                    use vello_common::fearless_simd::*;
                    use crate::fine::NumericVec;

                    self.simd.vectorize(#[inline(always)] || {
                        for out in buf.chunks_exact_mut(16) {
                            let pixels = self.next().unwrap();
                            let narrowed = u8x16::<S>::from_f32(pixels.simd, pixels);
                            out.copy_from_slice(narrowed.as_slice());
                        }
                    })
                }

                fn paint_f32(&mut self, buf: &mut [f32]) {
                    self.simd.vectorize(#[inline(always)] || {
                        for out in buf.chunks_exact_mut(16) {
                            let pixels = self.next().unwrap();
                            out.copy_from_slice(pixels.as_slice());
                        }
                    })
                }
            }
        };
    }

    /// Generates a `Painter` impl for an iterator type that yields `u8x16` vectors.
    ///
    /// The target buffer is processed in groups of 16 elements, pulling one vector from
    /// the iterator per group. `paint_u8` copies the vector as-is, while `paint_f32`
    /// converts it to an `f32x16` first. Elements beyond the last full group of 16 are
    /// left untouched, and the generated methods panic if the iterator runs dry.
    macro_rules! u8x16_painter {
        ($($painter:tt)+) => {
            impl<S: Simd> crate::fine::Painter for $($painter)+ {
                fn paint_u8(&mut self, buf: &mut [u8]) {
                    self.simd.vectorize(#[inline(always)] || {
                        for out in buf.chunks_exact_mut(16) {
                            let pixels = self.next().unwrap();
                            out.copy_from_slice(pixels.as_slice());
                        }
                    })
                }

                fn paint_f32(&mut self, buf: &mut [f32]) {
                    use vello_common::fearless_simd::*;
                    use crate::fine::NumericVec;

                    self.simd.vectorize(#[inline(always)] || {
                        for out in buf.chunks_exact_mut(16) {
                            let pixels = self.next().unwrap();
                            let widened = f32x16::<S>::from_u8(pixels.simd, pixels);
                            out.copy_from_slice(widened.as_slice());
                        }
                    })
                }
            }
        };
    }

    pub(crate) use f32x16_painter;
    pub(crate) use u8x16_painter;
}