vello_cpu/fine/mod.rs
1// Copyright 2025 the Vello Authors
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! Fine rasterization stage of the rendering pipeline.
5//!
6//! This module implements the fine rasterization phase, which processes tiles at the pixel level.
7//! It supports both high-precision (f32) and low-precision (u8) rendering paths, along with
8//! various paint types including solid colors, gradients, images, and blurred rounded rectangles.
9
10mod common;
11mod highp;
12mod lowp;
13
14use crate::fine::common::gradient::linear::SimdLinearKind;
15use crate::fine::common::gradient::radial::SimdRadialKind;
16use crate::fine::common::gradient::sweep::SimdSweepKind;
17use crate::fine::common::gradient::{GradientPainter, calculate_t_vals};
18use crate::fine::common::image::{FilteredImagePainter, NNImagePainter, PlainNNImagePainter};
19use crate::fine::common::rounded_blurred_rect::BlurredRoundedRectFiller;
20use crate::layer_manager::LayerManager;
21use crate::peniko::{BlendMode, ImageQuality};
22use crate::region::Region;
23use crate::util::EncodedImageExt;
24use alloc::vec;
25use alloc::vec::Vec;
26use core::fmt::Debug;
27use core::iter;
28use vello_common::coarse::{Cmd, CommandAttrs, WideTile};
29use vello_common::encode::{
30 EncodedBlurredRoundedRectangle, EncodedGradient, EncodedImage, EncodedKind, EncodedPaint,
31};
32use vello_common::fearless_simd::{
33 Bytes, Simd, SimdBase, SimdFloat, SimdInt, SimdInto, f32x4, f32x8, f32x16, u8x16, u8x32, u32x4,
34 u32x8,
35};
36use vello_common::filter_effects::Filter;
37use vello_common::kurbo::Affine;
38use vello_common::mask::Mask;
39use vello_common::paint::{ImageSource, Paint, PremulColor};
40use vello_common::pixmap::Pixmap;
41use vello_common::simd::Splat4thExt;
42use vello_common::tile::Tile;
43use vello_common::util::f32_to_u8;
44
45pub use highp::F32Kernel;
46pub use lowp::U8Kernel;
47
48/// Number of color components per pixel (RGBA).
49pub(crate) const COLOR_COMPONENTS: usize = 4;
50
51/// Number of color components in a single column of a tile (height * components).
52pub(crate) const TILE_HEIGHT_COMPONENTS: usize = Tile::HEIGHT as usize * COLOR_COMPONENTS;
53
54/// Size of the scratch buffer used for intermediate rendering operations.
55/// Sized to hold a full wide tile with all color components.
56pub const SCRATCH_BUF_SIZE: usize =
57 WideTile::WIDTH as usize * Tile::HEIGHT as usize * COLOR_COMPONENTS;
58
59/// Type alias for a scratch buffer that can hold a full wide tile's worth of data.
60pub type ScratchBuf<F> = [F; SCRATCH_BUF_SIZE];
61
/// Scalar channel type that fine rasterization can operate on.
///
/// The trait is implemented for `f32` (high precision) and `u8` (low precision) so
/// that the same rendering logic can be written once and instantiated for either
/// representation.
pub trait Numeric: Copy + Clone + Default + PartialEq + Debug + Send + Sync + 'static {
    /// The value representing "nothing" / fully transparent (`0.0` for `f32`, `0` for `u8`).
    const ZERO: Self;

    /// The value representing full opacity (`1.0` for `f32`, `255` for `u8`).
    const ONE: Self;
}

/// Floating-point channels use the normalized `0.0..=1.0` range.
impl Numeric for f32 {
    const ZERO: f32 = 0.0;
    const ONE: f32 = 1.0;
}

/// Integer channels use the full `0..=255` range of a byte.
impl Numeric for u8 {
    const ZERO: u8 = u8::MIN;
    const ONE: u8 = u8::MAX;
}
85
86/// Trait for SIMD vector types that can convert between f32 and u8 representations.
87///
88/// This trait enables efficient batch conversions between different numeric representations
89/// during rendering operations, supporting both high-precision and low-precision rendering paths.
90pub trait NumericVec<S: Simd>: Copy + Clone + Send + Sync {
91 /// Convert from a SIMD vector of f32 values to this type.
92 fn from_f32(simd: S, val: f32x16<S>) -> Self;
93
94 /// Convert from a SIMD vector of u8 values to this type.
95 fn from_u8(simd: S, val: u8x16<S>) -> Self;
96}
97
98impl<S: Simd> NumericVec<S> for f32x16<S> {
99 #[inline(always)]
100 fn from_f32(_: S, val: Self) -> Self {
101 val
102 }
103
104 #[inline(always)]
105 fn from_u8(simd: S, val: u8x16<S>) -> Self {
106 let converted = u8_to_f32(val);
107 converted * Self::splat(simd, 1.0 / 255.0)
108 }
109}
110
111impl<S: Simd> NumericVec<S> for u8x16<S> {
112 #[inline(always)]
113 fn from_f32(simd: S, val: f32x16<S>) -> Self {
114 let v1 = f32x16::splat(simd, 255.0);
115 let v2 = f32x16::splat(simd, 0.5);
116 let mulled = val.madd(v1, v2);
117
118 f32_to_u8(mulled)
119 }
120
121 #[inline(always)]
122 fn from_u8(_: S, val: Self) -> Self {
123 val
124 }
125}
126
127/// Convert a SIMD vector of u8 values to f32 values.
128///
129/// This function efficiently converts 16 u8 values to their f32 equivalents using SIMD operations,
130/// preserving the values without normalization (i.e., 255 becomes 255.0, not 1.0).
131#[inline(always)]
132pub(crate) fn u8_to_f32<S: Simd>(val: u8x16<S>) -> f32x16<S> {
133 let simd = val.simd;
134 let zeroes = u8x16::splat(simd, 0);
135
136 let zip1 = simd.zip_high_u8x16(val, zeroes);
137 let zip2 = simd.zip_low_u8x16(val, zeroes);
138
139 let p1 = simd
140 .zip_low_u8x16(zip2, zeroes)
141 .bitcast::<u32x4<S>>()
142 .to_float::<f32x4<S>>();
143 let p2 = simd
144 .zip_high_u8x16(zip2, zeroes)
145 .bitcast::<u32x4<S>>()
146 .to_float::<f32x4<S>>();
147 let p3 = simd
148 .zip_low_u8x16(zip1, zeroes)
149 .bitcast::<u32x4<S>>()
150 .to_float::<f32x4<S>>();
151 let p4 = simd
152 .zip_high_u8x16(zip1, zeroes)
153 .bitcast::<u32x4<S>>()
154 .to_float::<f32x4<S>>();
155
156 simd.combine_f32x8(simd.combine_f32x4(p1, p2), simd.combine_f32x4(p3, p4))
157}
158
159/// Trait for SIMD vector types used in compositing and blending operations.
160///
161/// This trait abstracts over different SIMD vector widths (f32x16 for high-precision,
162/// u8x32 for low-precision) to enable efficient batch processing of pixel data during
163/// blending and compositing.
164pub trait CompositeType<N: Numeric, S: Simd>: Copy + Clone + Send + Sync {
165 /// The number of numeric values this composite type can hold.
166 const LENGTH: usize;
167
168 /// Load values from a slice into this composite type.
169 fn from_slice(simd: S, slice: &[N]) -> Self;
170
171 /// Create a composite type by repeating a single RGBA color across all elements.
172 fn from_color(simd: S, color: [N; 4]) -> Self;
173}
174
175impl<S: Simd> CompositeType<f32, S> for f32x16<S> {
176 const LENGTH: usize = 16;
177
178 #[inline(always)]
179 fn from_slice(simd: S, slice: &[f32]) -> Self {
180 <Self as SimdBase<_, _>>::from_slice(simd, slice)
181 }
182
183 #[inline(always)]
184 fn from_color(simd: S, color: [f32; 4]) -> Self {
185 Self::block_splat(f32x4::from_slice(simd, &color[..]))
186 }
187}
188
189impl<S: Simd> CompositeType<u8, S> for u8x32<S> {
190 const LENGTH: usize = 32;
191
192 #[inline(always)]
193 fn from_slice(simd: S, slice: &[u8]) -> Self {
194 <Self as SimdBase<_, _>>::from_slice(simd, slice)
195 }
196
197 #[inline(always)]
198 fn from_color(simd: S, color: [u8; 4]) -> Self {
199 u32x8::block_splat(u32x4::splat(simd, u32::from_ne_bytes(color))).to_bytes()
200 }
201}
202
/// A kernel for performing fine rasterization.
///
/// This trait defines the interface for tile-level rendering operations, abstracting over
/// different numeric precisions (f32 vs u8). Implementations provide the low-level pixel
/// manipulation, blending, and painting operations needed to render tiles.
///
/// The two main implementations are:
/// - [`F32Kernel`]: High-precision rendering using 32-bit floating-point values
/// - [`U8Kernel`]: Low-precision rendering using 8-bit integer values
///
/// All methods are associated functions (there is no `self`); the kernel type only
/// selects the implementation. The painter constructors come with default
/// implementations that a kernel may override, everything else must be implemented.
pub trait FineKernel<S: Simd>: Send + Sync + 'static {
    /// The basic underlying numerical type of the kernel (f32 or u8).
    type Numeric: Numeric;

    /// The SIMD composite type used for efficient batch blending and compositing operations.
    type Composite: CompositeType<Self::Numeric, S>;

    /// The SIMD vector type used for conversions between u8 and f32 representations.
    type NumericVec: NumericVec<S>;

    /// Extract and convert a premultiplied color to the kernel's numeric type.
    ///
    /// Converts RGBA components from the standard premultiplied color format to
    /// the kernel's internal representation (e.g., 0.0-1.0 for f32, 0-255 for u8).
    fn extract_color(color: PremulColor) -> [Self::Numeric; 4];

    /// Pack the blend buffer contents into the output region.
    ///
    /// Converts from the internal scratch buffer format to the output tile format,
    /// writing the results to the provided region.
    fn pack(simd: S, region: &mut Region<'_>, blend_buf: &[Self::Numeric]);

    /// Unpack the region contents back into the blend buffer.
    ///
    /// Performs the reverse of `pack`, reading pixel data from the tile region
    /// and loading it into the scratch buffer for further processing.
    fn unpack(simd: S, region: &mut Region<'_>, blend_buf: &mut [Self::Numeric]);

    /// Apply a filter to a layer.
    ///
    /// This is used for applying filters to whole layers, which is necessary for
    /// spatial filters (like blur) that need to access neighboring pixels. The filter
    /// is applied in-place to the provided pixmap.
    ///
    /// The transform parameter is used to scale filter parameters based on the current
    /// transformation matrix (e.g., zoom level), ensuring filters look consistent
    /// regardless of scale.
    ///
    /// Unlike most other methods of this trait, this one takes no SIMD token.
    fn filter_layer(
        pixmap: &mut Pixmap,
        filter: &Filter,
        layer_manager: &mut LayerManager,
        transform: Affine,
    );

    /// Fill the target buffer with a solid color.
    ///
    /// Efficiently replicates the given RGBA color across all pixels in the target buffer.
    fn copy_solid(simd: S, target: &mut [Self::Numeric], color: [Self::Numeric; 4]);

    /// Create a painter for rendering gradients.
    ///
    /// Returns a painter that can render linear, radial, or sweep gradients based on
    /// pre-computed t values (gradient interpolation parameters).
    ///
    /// The default implementation constructs a [`GradientPainter`] inside
    /// `simd.vectorize`.
    fn gradient_painter<'a>(
        simd: S,
        gradient: &'a EncodedGradient,
        t_vals: &'a [f32],
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || GradientPainter::new(simd, gradient, t_vals),
        )
    }

    /// Create a painter for rendering gradients with undefined region support.
    ///
    /// Similar to `gradient_painter`, but with support for masking undefined locations
    /// (used for radial gradients that may have mathematically undefined regions).
    ///
    /// This is intentionally a duplicate of the default [`FineKernel::gradient_painter`]
    /// implementation--the `U8Kernel` overrides that method, but not this one.
    fn gradient_painter_with_undefined<'a>(
        simd: S,
        gradient: &'a EncodedGradient,
        t_vals: &'a [f32],
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || GradientPainter::new(simd, gradient, t_vals),
        )
    }

    /// Create a painter for rendering axis-aligned nearest-neighbor images.
    ///
    /// Optimized painter for images with `Low` quality and no skewing component in their
    /// transform. This is the fastest image rendering path.
    ///
    /// The default implementation constructs a [`PlainNNImagePainter`].
    fn plain_nn_image_painter<'a>(
        simd: S,
        image: &'a EncodedImage,
        pixmap: &'a Pixmap,
        start_x: u16,
        start_y: u16,
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || PlainNNImagePainter::new(simd, image, pixmap, start_x, start_y),
        )
    }

    /// Create a painter for rendering nearest-neighbor images with transforms.
    ///
    /// Similar to `plain_nn_image_painter`, but supports arbitrary affine transforms
    /// including skewing and rotation.
    ///
    /// The default implementation constructs an [`NNImagePainter`].
    fn nn_image_painter<'a>(
        simd: S,
        image: &'a EncodedImage,
        pixmap: &'a Pixmap,
        start_x: u16,
        start_y: u16,
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || NNImagePainter::new(simd, image, pixmap, start_x, start_y),
        )
    }

    /// Create a painter for rendering images with `Medium` quality filtering.
    ///
    /// Uses bilinear filtering for smoother appearance than nearest-neighbor.
    ///
    /// The default implementation constructs a [`FilteredImagePainter`].
    fn medium_quality_image_painter<'a>(
        simd: S,
        image: &'a EncodedImage,
        pixmap: &'a Pixmap,
        start_x: u16,
        start_y: u16,
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || FilteredImagePainter::new(simd, image, pixmap, start_x, start_y),
        )
    }

    /// Create a painter for rendering axis-aligned images with `Medium` quality filtering.
    ///
    /// Optimized painter for images with bilinear filtering and no skewing component.
    ///
    /// Note that the default implementation is identical to the default
    /// [`FineKernel::medium_quality_image_painter`]: it also constructs a
    /// [`FilteredImagePainter`]. Any axis-aligned specialization has to come from a
    /// kernel overriding this method.
    fn plain_medium_quality_image_painter<'a>(
        simd: S,
        image: &'a EncodedImage,
        pixmap: &'a Pixmap,
        start_x: u16,
        start_y: u16,
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || FilteredImagePainter::new(simd, image, pixmap, start_x, start_y),
        )
    }

    /// Create a painter for rendering images with `High` quality filtering.
    ///
    /// Uses high-quality filtering for the best visual appearance.
    ///
    /// The default implementation constructs the same [`FilteredImagePainter`] type as
    /// the medium-quality painters; presumably the painter selects the filter from the
    /// encoded image's sampler settings (confirm in `FilteredImagePainter`).
    fn high_quality_image_painter<'a>(
        simd: S,
        image: &'a EncodedImage,
        pixmap: &'a Pixmap,
        start_x: u16,
        start_y: u16,
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || FilteredImagePainter::new(simd, image, pixmap, start_x, start_y),
        )
    }

    /// Create a painter for rendering blurred rounded rectangles.
    ///
    /// Efficiently renders rounded rectangles with gaussian blur applied,
    /// computing the blur analytically rather than as a post-process.
    ///
    /// The default implementation constructs a [`BlurredRoundedRectFiller`]. Unlike the
    /// other painter constructors, the returned painter does not borrow from the
    /// arguments (the return type carries no lifetime).
    fn blurred_rounded_rectangle_painter(
        simd: S,
        rect: &EncodedBlurredRoundedRectangle,
        start_x: u16,
        start_y: u16,
    ) -> impl Painter {
        simd.vectorize(
            #[inline(always)]
            || BlurredRoundedRectFiller::new(simd, rect, start_x, start_y),
        )
    }

    /// Apply a mask to the destination buffer.
    ///
    /// Multiplies each pixel in the destination by the corresponding mask value,
    /// effectively masking out or reducing the opacity of pixels.
    ///
    /// `src` yields the mask values one [`FineKernel::NumericVec`] at a time.
    fn apply_mask(simd: S, dest: &mut [Self::Numeric], src: impl Iterator<Item = Self::NumericVec>);

    /// Apply a painter to render content into the destination buffer.
    ///
    /// Invokes the painter to generate pixel values and writes them to the destination.
    fn apply_painter<'a>(simd: S, dest: &mut [Self::Numeric], painter: impl Painter + 'a);

    /// Perform alpha compositing with a solid color over the target buffer.
    ///
    /// Blends a solid RGBA color over the existing contents using standard alpha compositing
    /// (Porter-Duff source-over). Optionally applies additional per-pixel alpha values.
    fn alpha_composite_solid(
        simd: S,
        target: &mut [Self::Numeric],
        src: [Self::Numeric; 4],
        alphas: Option<&[u8]>,
    );

    /// Perform alpha compositing with a source buffer over the destination buffer.
    ///
    /// Blends the source buffer contents over the destination using standard alpha compositing.
    /// Optionally applies additional per-pixel alpha values.
    fn alpha_composite_buffer(
        simd: S,
        dest: &mut [Self::Numeric],
        src: &[Self::Numeric],
        alphas: Option<&[u8]>,
    );

    /// Blend the source into the destination with a specified blend mode.
    ///
    /// Applies advanced blending operations (e.g., multiply, screen, overlay) as specified
    /// by the blend mode. Optionally applies additional per-pixel alpha values.
    ///
    /// `src` yields the source pixels one [`FineKernel::Composite`] vector at a time.
    /// `start_x` and `start_y` are only needed to sample the correct position of `mask`;
    /// callers that pass no mask pass dummy values.
    fn blend(
        simd: S,
        dest: &mut [Self::Numeric],
        start_x: u16,
        start_y: u16,
        src: impl Iterator<Item = Self::Composite>,
        blend_mode: BlendMode,
        alphas: Option<&[u8]>,
        mask: Option<&Mask>,
    );
}
437
/// Fine rasterizer for processing tiles at the pixel level.
///
/// This structure maintains the state and scratch buffers needed for tile-based rendering.
/// It processes rendering commands and manages a stack of blend buffers for layer composition.
///
/// `S` is the SIMD token and `T` the kernel that selects the numeric precision of all
/// buffers (via `T::Numeric`).
#[derive(Debug)]
pub struct Fine<S: Simd, T: FineKernel<S>> {
    /// The (x, y) coordinates of the currently active wide tile being rendered.
    ///
    /// These are tile indices, not pixel positions: they are multiplied by
    /// `WideTile::WIDTH` and `Tile::HEIGHT` respectively to obtain pixel coordinates.
    pub(crate) wide_coords: (u16, u16),

    /// Stack of blend buffers for managing layers and composition.
    ///
    /// `Cmd::PushBuf` pushes a new zeroed buffer onto this stack and `Cmd::PopBuf`
    /// removes the top one; blending and clipping composite the top buffer onto the
    /// buffer below it. The stack starts out with one buffer, and all drawing
    /// operations target the last element.
    pub(crate) blend_buf: Vec<ScratchBuf<T::Numeric>>,

    /// Intermediate buffer used by painters to store generated pixel data before compositing.
    pub(crate) paint_buf: ScratchBuf<T::Numeric>,

    /// Buffer for storing gradient interpolation parameters (t values).
    ///
    /// Gradients pre-compute these values for efficiency before color lookup.
    /// It holds one value per pixel of a wide tile.
    pub(crate) f32_buf: Vec<f32>,

    /// The SIMD context used for vectorized operations.
    pub(crate) simd: S,
}
464
465impl<S: Simd, T: FineKernel<S>> Fine<S, T> {
466 /// Create a new fine rasterizer with the given SIMD context.
467 ///
468 /// Initializes all scratch buffers and sets up the initial blend buffer.
469 pub fn new(simd: S) -> Self {
470 Self {
471 simd,
472 wide_coords: (0, 0),
473 blend_buf: vec![[T::Numeric::ZERO; SCRATCH_BUF_SIZE]],
474 f32_buf: vec![0.0; SCRATCH_BUF_SIZE / 4],
475 paint_buf: [T::Numeric::ZERO; SCRATCH_BUF_SIZE],
476 }
477 }
478
479 /// Set the coordinates of the wide tile currently being rendered.
480 ///
481 /// This is used by painters and other operations to compute absolute pixel positions.
482 pub fn set_coords(&mut self, x: u16, y: u16) {
483 self.wide_coords = (x, y);
484 }
485
486 /// Clear the current blend buffer to a solid color.
487 ///
488 /// This efficiently fills the entire buffer with the given premultiplied color.
489 pub fn clear(&mut self, premul_color: PremulColor) {
490 let converted_color = T::extract_color(premul_color);
491 let blend_buf = self.blend_buf.last_mut().unwrap();
492
493 T::copy_solid(self.simd, blend_buf, converted_color);
494 }
495
496 /// Writes the current blend buffer contents to the output region.
497 ///
498 /// This copies pixel data from the internal scratch buffer to the tile region,
499 /// converting the layout from the internal representation to the output format.
500 pub fn pack(&self, region: &mut Region<'_>) {
501 let blend_buf = self.blend_buf.last().unwrap();
502
503 T::pack(self.simd, region, blend_buf);
504 }
505
506 /// Reads the region contents back into the blend buffer.
507 ///
508 /// This copies pixel data from the tile region to the internal scratch buffer,
509 /// performing the reverse operation of `pack`. This is typically used when a layer
510 /// needs to be read back for further processing.
511 pub fn unpack(&mut self, region: &mut Region<'_>) {
512 let blend_buf = self.blend_buf.last_mut().unwrap();
513
514 T::unpack(self.simd, region, blend_buf);
515 }
516
517 /// Apply a filter to a layer.
518 ///
519 /// This applies the filter using the kernel's implementation, mutating the layer.
520 pub fn filter_layer(
521 &self,
522 pixmap: &mut Pixmap,
523 filter: &Filter,
524 layer_manager: &mut LayerManager,
525 transform: Affine,
526 ) {
527 T::filter_layer(pixmap, filter, layer_manager, transform);
528 }
529
530 /// Execute a rendering command on the current tile.
531 ///
532 /// This is the main dispatch method that processes different command types including
533 /// fills, clips, blends, filters, masks, and buffer operations.
534 pub(crate) fn run_cmd(
535 &mut self,
536 cmd: &Cmd,
537 alphas: &[u8],
538 paints: &[EncodedPaint],
539 attrs: &CommandAttrs,
540 ) {
541 match cmd {
542 Cmd::Fill(f) => {
543 let fill_attrs = &attrs.fill[f.attrs_idx as usize];
544 self.fill(
545 usize::from(f.x),
546 usize::from(f.width),
547 &fill_attrs.paint,
548 fill_attrs.blend_mode,
549 paints,
550 None,
551 fill_attrs.mask.as_ref(),
552 );
553 }
554 Cmd::AlphaFill(s) => {
555 let fill_attrs = &attrs.fill[s.attrs_idx as usize];
556 let alpha_idx = fill_attrs.alpha_idx(s.alpha_offset) as usize;
557 self.fill(
558 usize::from(s.x),
559 usize::from(s.width),
560 &fill_attrs.paint,
561 fill_attrs.blend_mode,
562 paints,
563 Some(&alphas[alpha_idx..]),
564 fill_attrs.mask.as_ref(),
565 );
566 }
567 Cmd::Filter(_filter, _) => {
568 // TODO: Apply non-spatial filters here; spatial filters need layer-level processing
569 //
570 // Spatial filters (e.g., Gaussian blur) need neighboring pixels and must be
571 // rendered to a pixmap for layer-level processing. Non-spatial effects (e.g.,
572 // color matrix, component transfer) can be processed here directly on the
573 // blend buffer per-pixel as wide commands.
574 }
575 Cmd::PushBuf(_layer_kind) => {
576 self.blend_buf.push([T::Numeric::ZERO; SCRATCH_BUF_SIZE]);
577 }
578 Cmd::PopBuf => {
579 self.blend_buf.pop();
580 }
581 Cmd::ClipFill(cf) => {
582 self.clip(cf.x as usize, cf.width as usize, None);
583 }
584 Cmd::ClipStrip(cs) => {
585 let clip_attrs = &attrs.clip[cs.attrs_idx as usize];
586 let alpha_idx = clip_attrs.alpha_idx(cs.alpha_offset) as usize;
587 self.clip(cs.x as usize, cs.width as usize, Some(&alphas[alpha_idx..]));
588 }
589 Cmd::Blend(b) => self.blend(*b),
590 Cmd::Mask(m) => {
591 let start_x = self.wide_coords.0 * WideTile::WIDTH;
592 let start_y = self.wide_coords.1 * Tile::HEIGHT;
593
594 let blend_buf = self.blend_buf.last_mut().unwrap();
595
596 let width = (blend_buf.len() / (Tile::HEIGHT as usize * COLOR_COMPONENTS)) as u16;
597 let y = start_y as u32 + u32x4::from_slice(self.simd, &[0, 1, 2, 3]);
598
599 let iter = (start_x..(start_x + width)).map(|x| {
600 let x_in_range = x < m.width();
601
602 macro_rules! sample {
603 ($idx:expr) => {
604 if x_in_range && (y[$idx] as u16) < m.height() {
605 m.sample(x, y[$idx] as u16)
606 } else {
607 0
608 }
609 };
610 }
611
612 let s1 = sample!(0);
613 let s2 = sample!(1);
614 let s3 = sample!(2);
615 let s4 = sample!(3);
616
617 let samples = u8x16::from_slice(
618 self.simd,
619 &[
620 s1, s1, s1, s1, s2, s2, s2, s2, s3, s3, s3, s3, s4, s4, s4, s4,
621 ],
622 );
623 T::NumericVec::from_u8(self.simd, samples)
624 });
625
626 T::apply_mask(self.simd, blend_buf, iter);
627 }
628 Cmd::Opacity(o) => {
629 if *o != 1.0 {
630 let blend_buf = self.blend_buf.last_mut().unwrap();
631
632 T::apply_mask(
633 self.simd,
634 blend_buf,
635 iter::repeat(T::NumericVec::from_f32(
636 self.simd,
637 f32x16::splat(self.simd, *o),
638 )),
639 );
640 }
641 }
642 Cmd::PushZeroClip(_) | Cmd::PopZeroClip => {
643 // These commands are handled by the dispatcher and should not reach fine rasterization
644 unreachable!();
645 }
646 }
647 }
648
649 /// Fill a horizontal strip within the current tile using the given paint.
650 ///
651 /// This is the core painting method that handles solid colors, gradients, images,
652 /// and blurred rounded rectangles. It applies the paint starting at the given x
653 /// coordinate with the specified width, using the provided blend mode.
654 ///
655 /// Note: For short strip segments, benchmarks showed that not inlining this method
656 /// leads to significantly worse performance.
657 pub fn fill(
658 &mut self,
659 x: usize,
660 width: usize,
661 fill: &Paint,
662 blend_mode: BlendMode,
663 encoded_paints: &[EncodedPaint],
664 alphas: Option<&[u8]>,
665 mask: Option<&Mask>,
666 ) {
667 let blend_buf = &mut self.blend_buf.last_mut().unwrap()[x * TILE_HEIGHT_COMPONENTS..]
668 [..TILE_HEIGHT_COMPONENTS * width];
669 let default_blend = blend_mode == BlendMode::default();
670
671 match fill {
672 Paint::Solid(color) => {
673 let color = T::extract_color(*color);
674
675 // If color is completely opaque, we can just directly override
676 // the blend buffer.
677 if color[3] == T::Numeric::ONE
678 && default_blend
679 && alphas.is_none()
680 && mask.is_none()
681 {
682 T::copy_solid(self.simd, blend_buf, color);
683
684 return;
685 }
686
687 if default_blend && mask.is_none() {
688 T::alpha_composite_solid(self.simd, blend_buf, color, alphas);
689 } else {
690 let start_x = self.wide_coords.0 * WideTile::WIDTH + x as u16;
691 let start_y = self.wide_coords.1 * Tile::HEIGHT;
692
693 T::blend(
694 self.simd,
695 blend_buf,
696 start_x,
697 start_y,
698 iter::repeat(T::Composite::from_color(self.simd, color)),
699 blend_mode,
700 alphas,
701 mask,
702 );
703 }
704 }
705 Paint::Indexed(paint) => {
706 let color_buf = &mut self.paint_buf[x * TILE_HEIGHT_COMPONENTS..]
707 [..TILE_HEIGHT_COMPONENTS * width];
708
709 let encoded_paint = &encoded_paints[paint.index()];
710
711 let start_x = self.wide_coords.0 * WideTile::WIDTH + x as u16;
712 let start_y = self.wide_coords.1 * Tile::HEIGHT;
713
714 // We need to have this as a macro because closures cannot take generic arguments, and
715 // we would have to repeatedly provide all arguments if we made it a function.
716 macro_rules! fill_complex_paint {
717 ($may_have_opacities:expr, $filler:expr) => {
718 if $may_have_opacities || alphas.is_some() {
719 T::apply_painter(self.simd, color_buf, $filler);
720
721 if default_blend && mask.is_none() {
722 T::alpha_composite_buffer(self.simd, blend_buf, color_buf, alphas);
723 } else {
724 T::blend(
725 self.simd,
726 blend_buf,
727 start_x,
728 start_y,
729 color_buf
730 .chunks_exact(T::Composite::LENGTH)
731 .map(|s| T::Composite::from_slice(self.simd, s)),
732 blend_mode,
733 alphas,
734 mask,
735 );
736 }
737 } else {
738 // Similarly to solid colors we can just override the previous values
739 // if all colors in the gradient are fully opaque.
740 T::apply_painter(self.simd, blend_buf, $filler);
741 }
742 };
743 }
744
745 match encoded_paint {
746 EncodedPaint::BlurredRoundedRect(b) => {
747 fill_complex_paint!(
748 true,
749 T::blurred_rounded_rectangle_painter(self.simd, b, start_x, start_y)
750 );
751 }
752 EncodedPaint::Gradient(g) => {
753 // Note that we are calculating the t values first, store them in a separate
754 // buffer and then pass that buffer to the iterator instead of calculating
755 // the t values on the fly in the iterator. The latter would be faster, but
756 // it would probably increase code size a lot, because the functions for
757 // position calculation need to be inlined for good performance.
758 let f32_buf = &mut self.f32_buf[..width * Tile::HEIGHT as usize];
759
760 match &g.kind {
761 EncodedKind::Linear(l) => {
762 calculate_t_vals(
763 self.simd,
764 SimdLinearKind::new(self.simd, *l),
765 f32_buf,
766 g,
767 start_x,
768 start_y,
769 );
770
771 fill_complex_paint!(
772 g.may_have_opacities,
773 T::gradient_painter(self.simd, g, f32_buf)
774 );
775 }
776 EncodedKind::Sweep(s) => {
777 calculate_t_vals(
778 self.simd,
779 SimdSweepKind::new(self.simd, s),
780 f32_buf,
781 g,
782 start_x,
783 start_y,
784 );
785
786 fill_complex_paint!(
787 g.may_have_opacities,
788 T::gradient_painter(self.simd, g, f32_buf)
789 );
790 }
791 EncodedKind::Radial(r) => {
792 calculate_t_vals(
793 self.simd,
794 SimdRadialKind::new(self.simd, r),
795 f32_buf,
796 g,
797 start_x,
798 start_y,
799 );
800
801 if r.has_undefined() {
802 fill_complex_paint!(
803 g.may_have_opacities,
804 T::gradient_painter_with_undefined(self.simd, g, f32_buf)
805 );
806 } else {
807 fill_complex_paint!(
808 g.may_have_opacities,
809 T::gradient_painter(self.simd, g, f32_buf)
810 );
811 }
812 }
813 }
814 }
815 EncodedPaint::Image(i) => {
816 let ImageSource::Pixmap(pixmap) = &i.source else {
817 panic!("vello_cpu doesn't support the opaque image source.");
818 };
819
820 match (i.has_skew(), i.nearest_neighbor()) {
821 (false, false) => {
822 // Axis-aligned with filtering - use optimized plain painters
823 if i.sampler.quality == ImageQuality::Medium {
824 fill_complex_paint!(
825 i.may_have_opacities,
826 T::plain_medium_quality_image_painter(
827 self.simd, i, pixmap, start_x, start_y
828 )
829 );
830 } else {
831 fill_complex_paint!(
832 i.may_have_opacities,
833 T::high_quality_image_painter(
834 self.simd, i, pixmap, start_x, start_y
835 )
836 );
837 }
838 }
839 (true, false) => {
840 // Skewed with filtering - use generic filtered painters
841 if i.sampler.quality == ImageQuality::Medium {
842 fill_complex_paint!(
843 i.may_have_opacities,
844 T::medium_quality_image_painter(
845 self.simd, i, pixmap, start_x, start_y
846 )
847 );
848 } else {
849 fill_complex_paint!(
850 i.may_have_opacities,
851 T::high_quality_image_painter(
852 self.simd, i, pixmap, start_x, start_y
853 )
854 );
855 }
856 }
857 (false, true) => {
858 fill_complex_paint!(
859 i.may_have_opacities,
860 T::plain_nn_image_painter(
861 self.simd, i, pixmap, start_x, start_y
862 )
863 );
864 }
865 (true, true) => {
866 fill_complex_paint!(
867 i.may_have_opacities,
868 T::nn_image_painter(self.simd, i, pixmap, start_x, start_y)
869 );
870 }
871 }
872 }
873 }
874 }
875 }
876 }
877
878 /// Blend the top blend buffer into the buffer below it.
879 ///
880 /// This pops the top buffer from the blend stack and composites it onto the
881 /// buffer below using the specified blend mode. This is the core operation for
882 /// layer composition.
883 pub(crate) fn blend(&mut self, blend_mode: BlendMode) {
884 let (source_buffer, rest) = self.blend_buf.split_last_mut().unwrap();
885 let target_buffer = rest.last_mut().unwrap();
886
887 if blend_mode == BlendMode::default() {
888 T::alpha_composite_buffer(self.simd, target_buffer, source_buffer, None);
889 } else {
890 T::blend(
891 self.simd,
892 target_buffer,
893 // `start_x` and `start_y` are only needed to sample the correct position
894 // of a mask, so we can just pass dummy values here.
895 0,
896 0,
897 source_buffer
898 .chunks_exact(T::Composite::LENGTH)
899 .map(|s| T::Composite::from_slice(self.simd, s)),
900 blend_mode,
901 None,
902 None,
903 );
904 }
905 }
906
907 /// Apply a clipping mask from the top buffer to the buffer below.
908 ///
909 /// Uses the top buffer's alpha channel as a mask, multiplying it with the buffer
910 /// below. This implements clipping by masking out pixels outside the clip region.
911 fn clip(&mut self, x: usize, width: usize, alphas: Option<&[u8]>) {
912 let (source_buffer, rest) = self.blend_buf.split_last_mut().unwrap();
913 let target_buffer = rest.last_mut().unwrap();
914
915 let source_buffer =
916 &mut source_buffer[x * TILE_HEIGHT_COMPONENTS..][..TILE_HEIGHT_COMPONENTS * width];
917 let target_buffer =
918 &mut target_buffer[x * TILE_HEIGHT_COMPONENTS..][..TILE_HEIGHT_COMPONENTS * width];
919
920 T::alpha_composite_buffer(self.simd, target_buffer, source_buffer, alphas);
921 }
922}
923
/// A trait for objects that can render pixel data into buffers.
///
/// Painters abstract over different content sources (gradients, images, etc.) and can
/// generate pixel data in either u8 or f32 format.
///
/// Both methods are required: the trait provides no default implementations, so every
/// painter has to support both output formats. An implementation that natively produces
/// only one of the two is free to implement the other one by converting.
pub trait Painter {
    /// Paint pixel data into a u8 buffer (values in 0-255 range).
    fn paint_u8(&mut self, buf: &mut [u8]);

    /// Paint pixel data into an f32 buffer (values in 0.0-1.0 range).
    fn paint_f32(&mut self, buf: &mut [f32]);
}
939
940/// Extension trait for creating position vectors for gradient and image sampling.
941///
942/// This trait provides a method to generate SIMD vectors of positions that advance
943/// correctly across a tile. It's used by painters to compute per-pixel coordinates
944/// for sampling operations.
945pub trait PosExt<S: Simd> {
946 /// Create a position vector that advances appropriately across a tile.
947 ///
948 /// Given a starting position and per-pixel advances in x and y directions,
949 /// generates a SIMD vector with the correct position for each element.
950 fn splat_pos(simd: S, pos: f32, x_advance: f32, y_advance: f32) -> Self;
951}
952
953impl<S: Simd> PosExt<S> for f32x4<S> {
954 #[inline(always)]
955 fn splat_pos(simd: S, pos: f32, _: f32, y_advance: f32) -> Self {
956 let columns: [f32; Tile::HEIGHT as usize] = [0.0, 1.0, 2.0, 3.0];
957 let column_mask: Self = columns.simd_into(simd);
958
959 column_mask.madd(Self::splat(simd, y_advance), Self::splat(simd, pos))
960 }
961}
962
963impl<S: Simd> PosExt<S> for f32x8<S> {
964 #[inline(always)]
965 fn splat_pos(simd: S, pos: f32, x_advance: f32, y_advance: f32) -> Self {
966 simd.combine_f32x4(
967 f32x4::splat_pos(simd, pos, x_advance, y_advance),
968 f32x4::splat_pos(simd, pos + x_advance, x_advance, y_advance),
969 )
970 }
971}
972
973/// Intermediate shader result with color channels stored separately for efficient processing.
974///
975/// This structure holds 8 pixels worth of data in planar format (separate R, G, B, A vectors).
976/// The planar layout is more efficient for certain SIMD operations before final interleaving.
977pub(crate) struct ShaderResultF32<S: Simd> {
978 /// Red channel values for 8 pixels.
979 pub(crate) r: f32x8<S>,
980 /// Green channel values for 8 pixels.
981 pub(crate) g: f32x8<S>,
982 /// Blue channel values for 8 pixels.
983 pub(crate) b: f32x8<S>,
984 /// Alpha channel values for 8 pixels.
985 pub(crate) a: f32x8<S>,
986}
987
988impl<S: Simd> ShaderResultF32<S> {
989 /// Convert from planar format to interleaved RGBA format.
990 ///
991 /// Returns two f32x16 vectors containing 8 pixels (4 RGBA components each)
992 /// with channels interleaved in the standard RGBA order.
993 #[inline(always)]
994 pub(crate) fn get(&self) -> (f32x16<S>, f32x16<S>) {
995 let (r_1, r_2) = self.r.simd.split_f32x8(self.r);
996 let (g_1, g_2) = self.g.simd.split_f32x8(self.g);
997 let (b_1, b_2) = self.b.simd.split_f32x8(self.b);
998 let (a_1, a_2) = self.a.simd.split_f32x8(self.a);
999
1000 let first = self.r.simd.combine_f32x8(
1001 self.r.simd.combine_f32x4(r_1, g_1),
1002 self.r.simd.combine_f32x4(b_1, a_1),
1003 );
1004
1005 let second = self.r.simd.combine_f32x8(
1006 self.r.simd.combine_f32x4(r_2, g_2),
1007 self.r.simd.combine_f32x4(b_2, a_2),
1008 );
1009
1010 (first, second)
1011 }
1012}
1013
mod macros {
    /// Generates a `Painter` impl for an iterator whose items are `f32x16` vectors.
    ///
    /// `paint_f32` copies each vector straight into the destination, while `paint_u8` first
    /// converts it to a `u8x16` via `from_f32`. Both methods pull one item per 16-element
    /// chunk of the buffer and panic if the iterator runs out early. Trailing elements that
    /// do not fill a whole chunk are left untouched.
    macro_rules! f32x16_painter {
        ($($type_path:tt)+) => {
            impl<S: Simd> crate::fine::Painter for $($type_path)+ {
                fn paint_u8(&mut self, buf: &mut [u8]) {
                    use vello_common::fearless_simd::*;
                    use crate::fine::NumericVec;

                    let simd = self.simd;
                    simd.vectorize(#[inline(always)] || {
                        for dst in buf.chunks_exact_mut(16) {
                            let pixels = self.next().unwrap();
                            // Narrow the native f32 output to u8 before storing it.
                            let narrowed = u8x16::<S>::from_f32(pixels.simd, pixels);
                            dst.copy_from_slice(narrowed.as_slice());
                        }
                    })
                }

                fn paint_f32(&mut self, buf: &mut [f32]) {
                    let simd = self.simd;
                    simd.vectorize(#[inline(always)] || {
                        for dst in buf.chunks_exact_mut(16) {
                            let pixels = self.next().unwrap();
                            dst.copy_from_slice(pixels.as_slice());
                        }
                    })
                }
            }
        };
    }

    /// Generates a `Painter` impl for an iterator whose items are `u8x16` vectors.
    ///
    /// `paint_u8` copies each vector straight into the destination, while `paint_f32` first
    /// converts it to an `f32x16` via `from_u8`. Both methods pull one item per 16-element
    /// chunk of the buffer and panic if the iterator runs out early. Trailing elements that
    /// do not fill a whole chunk are left untouched.
    macro_rules! u8x16_painter {
        ($($type_path:tt)+) => {
            impl<S: Simd> crate::fine::Painter for $($type_path)+ {
                fn paint_u8(&mut self, buf: &mut [u8]) {
                    let simd = self.simd;
                    simd.vectorize(#[inline(always)] || {
                        for dst in buf.chunks_exact_mut(16) {
                            let pixels = self.next().unwrap();
                            dst.copy_from_slice(pixels.as_slice());
                        }
                    })
                }

                fn paint_f32(&mut self, buf: &mut [f32]) {
                    use vello_common::fearless_simd::*;
                    use crate::fine::NumericVec;

                    let simd = self.simd;
                    simd.vectorize(#[inline(always)] || {
                        for dst in buf.chunks_exact_mut(16) {
                            let pixels = self.next().unwrap();
                            // Widen the native u8 output to f32 before storing it.
                            let widened = f32x16::<S>::from_u8(pixels.simd, pixels);
                            dst.copy_from_slice(widened.as_slice());
                        }
                    })
                }
            }
        };
    }

    pub(crate) use f32x16_painter;
    pub(crate) use u8x16_painter;
}