vello_cpu/fine/lowp/mod.rs

// Copyright 2025 the Vello Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! Low-precision (u8/u16) rendering kernel implementation.
//!
//! This module implements the fine rasterization stage using 8-bit unsigned integers
//! for color values and 16-bit integers for intermediate calculations. This provides better
//! performance on many architectures compared to floating-point operations, while
//! maintaining sufficient precision for most rendering tasks.
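//!
//! The recurring pattern throughout this module: premultiplied RGBA components
//! are widened from u8 to u16, multiplied, normalized with a division by 255,
//! and narrowed back to u8. For example, multiplying two half-intensity
//! components gives `128 * 128 / 255 ≈ 64`, matching the `0.5 * 0.5 = 0.25`
//! that a floating-point kernel would compute.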

mod compose;
mod gradient;
mod image;

use crate::filter::filter_lowp;
use crate::fine::lowp::image::{BilinearImagePainter, PlainBilinearImagePainter};
use crate::fine::{COLOR_COMPONENTS, Painter, SCRATCH_BUF_SIZE};
use crate::fine::{FineKernel, highp, u8_to_f32};
use crate::layer_manager::LayerManager;
use crate::peniko::BlendMode;
use crate::region::Region;
use crate::util::scalar::div_255;
use bytemuck::cast_slice;
use core::iter;
use vello_common::coarse::WideTile;
use vello_common::encode::{EncodedGradient, EncodedImage};
use vello_common::fearless_simd::*;
use vello_common::filter_effects::Filter;
use vello_common::kurbo::Affine;
use vello_common::mask::Mask;
use vello_common::paint::PremulColor;
use vello_common::pixmap::Pixmap;
use vello_common::tile::Tile;
use vello_common::util::{Div255Ext, f32_to_u8};

/// The kernel for rendering using u8/u16 arithmetic.
#[derive(Clone, Copy, Debug)]
pub struct U8Kernel;

impl<S: Simd> FineKernel<S> for U8Kernel {
    type Numeric = u8;
    type Composite = u8x32<S>;
    type NumericVec = u8x16<S>;

    /// Extracts RGBA color components from a premultiplied color as u8 values in [0, 255].
    #[inline]
    fn extract_color(color: PremulColor) -> [Self::Numeric; 4] {
        color.as_premul_rgba8().to_u8_array()
    }

    /// Copies rendered pixels from the scratch buffer to the output region.
    ///
    /// Converts from column-major scratch buffer layout to row-major region layout,
    /// using either a SIMD-optimized path for full tiles or a scalar fallback.
    #[inline(always)]
    fn pack(simd: S, region: &mut Region<'_>, blend_buf: &[Self::Numeric]) {
        if region.width != WideTile::WIDTH || region.height != Tile::HEIGHT {
            // Use scalar path for non-standard tile sizes. Wrapping this in `vectorize`
            // degrades performance significantly on SSE4.2.
            pack(region, blend_buf);
        } else {
            simd.vectorize(
                #[inline(always)]
                || {
                    pack_block(simd, region, blend_buf);
                },
            );
        }
    }

    /// Copies pixels from the output region to the scratch buffer.
    ///
    /// Converts from row-major region layout to column-major scratch buffer layout.
    /// This is the inverse operation of `pack`.
    #[inline(always)]
    fn unpack(simd: S, region: &mut Region<'_>, blend_buf: &mut [Self::Numeric]) {
        simd.vectorize(
            #[inline(always)]
            || {
                unpack(region, blend_buf);
            },
        );
    }

    /// Applies a filter effect to a rendered layer.
    ///
    /// Delegates to the u8-specific filter implementation.
    fn filter_layer(
        pixmap: &mut Pixmap,
        filter: &Filter,
        layer_manager: &mut LayerManager,
        transform: Affine,
    ) {
        filter_lowp(filter, pixmap, layer_manager, transform);
    }

    /// Fills a buffer with a solid color using SIMD operations.
    ///
    /// Efficiently broadcasts a single RGBA color across all pixels in the destination.
    fn copy_solid(simd: S, dest: &mut [Self::Numeric], src: [Self::Numeric; 4]) {
        simd.vectorize(
            #[inline(always)]
            || {
                let color =
                    u8x64::block_splat(u32x4::splat(simd, u32::from_ne_bytes(src)).to_bytes());

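                // The splatted `u32` repeats the 4-byte RGBA pattern 16 times,
                // so each 64-byte store below writes 16 pixels at once.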
                for el in dest.chunks_exact_mut(64) {
                    el.copy_from_slice(color.as_slice());
                }
            },
        );
    }

    /// Creates a painter for rendering gradients in u8 precision.
    ///
    /// Returns a painter that evaluates the gradient at each pixel position.
    fn gradient_painter<'a>(
        simd: S,
        gradient: &'a EncodedGradient,
        t_vals: &'a [f32],
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || gradient::GradientPainter::new(simd, gradient, t_vals),
        )
    }

    /// Creates a painter for rendering images with bilinear filtering in u8 precision.
    ///
    /// Returns a painter that samples the image with bilinear interpolation.
    fn medium_quality_image_painter<'a>(
        simd: S,
        image: &'a EncodedImage,
        pixmap: &'a Pixmap,
        start_x: u16,
        start_y: u16,
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || BilinearImagePainter::new(simd, image, pixmap, start_x, start_y),
        )
    }

    /// Creates a painter for rendering axis-aligned images with bilinear filtering in u8 precision.
    ///
    /// Returns a painter that samples the image with bilinear interpolation.
    /// This is an optimized version for images without skew transformation.
    fn plain_medium_quality_image_painter<'a>(
        simd: S,
        image: &'a EncodedImage,
        pixmap: &'a Pixmap,
        start_x: u16,
        start_y: u16,
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || PlainBilinearImagePainter::new(simd, image, pixmap, start_x, start_y),
        )
    }

    /// Applies per-pixel mask values to a buffer by multiplying each component.
    ///
    /// Used for anti-aliasing and clipping effects. Each pixel is multiplied by
    /// its corresponding mask value and normalized.
    fn apply_mask(
        simd: S,
        dest: &mut [Self::Numeric],
        mut src: impl Iterator<Item = Self::NumericVec>,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
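                // Process 16 bytes (4 pixels) per iteration: widen to u16,
                // multiply by the mask values, renormalize with a division by
                // 255, and narrow back to u8.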
                for el in dest.chunks_exact_mut(16) {
                    let loaded = u8x16::from_slice(simd, el);
                    let mulled = simd.narrow_u16x16(
                        (simd.widen_u8x16(loaded) * simd.widen_u8x16(src.next().unwrap()))
                            .div_255(),
                    );
                    el.copy_from_slice(mulled.as_slice());
                }
            },
        );
    }

    /// Applies a painter's output to the destination buffer.
    ///
    /// Delegates to the painter's u8-specific implementation.
    #[inline(always)]
    fn apply_painter<'a>(_: S, dest: &mut [Self::Numeric], mut painter: impl Painter + 'a) {
        painter.paint_u8(dest);
    }

    /// Composites a solid color onto a buffer using alpha blending.
    ///
    /// Dispatches to either the masked or unmasked implementation based on the
    /// presence of per-pixel alpha masks.
    #[inline(always)]
    fn alpha_composite_solid(
        simd: S,
        dest: &mut [Self::Numeric],
        src: [Self::Numeric; 4],
        alphas: Option<&[u8]>,
    ) {
        if let Some(alphas) = alphas {
            alpha_fill::alpha_composite_solid(
                simd,
                dest,
                src,
                bytemuck::cast_slice::<u8, [u8; 8]>(alphas).iter().copied(),
            );
        } else {
            fill::alpha_composite_solid(simd, dest, src);
        }
    }

    /// Composites a source buffer onto a destination buffer using alpha blending.
    ///
    /// Dispatches to either the masked or unmasked implementation based on the
    /// presence of per-pixel alpha masks. Each source pixel's alpha determines
    /// the blending amount.
    fn alpha_composite_buffer(
        simd: S,
        dest: &mut [Self::Numeric],
        src: &[Self::Numeric],
        alphas: Option<&[u8]>,
    ) {
        let src_iter = src.chunks_exact(32).map(|el| u8x32::from_slice(simd, el));

        if let Some(alphas) = alphas {
            alpha_fill::alpha_composite(
                simd,
                dest,
                src_iter,
                bytemuck::cast_slice::<u8, [u8; 8]>(alphas).iter().copied(),
            );
        } else {
            fill::alpha_composite(simd, dest, src_iter);
        }
    }

    /// Applies a blend mode to composite source pixels onto destination.
    ///
    /// Dispatches to either the masked or unmasked blend implementation.
    /// Handles both color mixing (multiply, screen, etc.) and compositing.
    fn blend(
        simd: S,
        dest: &mut [Self::Numeric],
        mut start_x: u16,
        start_y: u16,
        src: impl Iterator<Item = Self::Composite>,
        blend_mode: BlendMode,
        alphas: Option<&[u8]>,
        mask: Option<&Mask>,
    ) {
        let alpha_iter = alphas.map(|a| bytemuck::cast_slice::<u8, [u8; 8]>(a).iter().copied());

        let mask_iter = mask.map(|m| {
            iter::from_fn(|| {
                let sample = |x: u16, y: u16| {
                    if x < m.width() && y < m.height() {
                        m.sample(x, y)
                    } else {
                        255
                    }
                };

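                // Each u8x32 chunk covers two adjacent columns of the
                // four-pixel-tall tile in column-major order, so gather
                // 2 x 4 mask samples per iteration; positions outside the
                // mask count as fully opaque.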
                let samples = [
                    sample(start_x, start_y),
                    sample(start_x, start_y + 1),
                    sample(start_x, start_y + 2),
                    sample(start_x, start_y + 3),
                    sample(start_x + 1, start_y),
                    sample(start_x + 1, start_y + 1),
                    sample(start_x + 1, start_y + 2),
                    sample(start_x + 1, start_y + 3),
                ];

                start_x += 2;

                Some(samples)
            })
        });

        match (alpha_iter, mask_iter) {
            (Some(alpha_iter), Some(mut mask_iter)) => {
                let iter = alpha_iter.map(|a1| {
                    let a2 = mask_iter.next().unwrap();
                    [
                        div_255(a1[0] as u16 * a2[0] as u16) as u8,
                        div_255(a1[1] as u16 * a2[1] as u16) as u8,
                        div_255(a1[2] as u16 * a2[2] as u16) as u8,
                        div_255(a1[3] as u16 * a2[3] as u16) as u8,
                        div_255(a1[4] as u16 * a2[4] as u16) as u8,
                        div_255(a1[5] as u16 * a2[5] as u16) as u8,
                        div_255(a1[6] as u16 * a2[6] as u16) as u8,
                        div_255(a1[7] as u16 * a2[7] as u16) as u8,
                    ]
                });
                alpha_fill::blend(simd, dest, src, blend_mode, iter);
            }
            (None, Some(mask_iter)) => alpha_fill::blend(simd, dest, src, blend_mode, mask_iter),
            (Some(alpha_iter), None) => alpha_fill::blend(simd, dest, src, blend_mode, alpha_iter),
            (None, None) => {
                fill::blend(simd, dest, src, blend_mode);
            }
        }
    }
}

mod fill {
    //! Alpha compositing and blending operations without per-pixel alpha masks.
    //!
    //! This module handles the case where there is no per-pixel coverage mask,
    //! so only the source alpha channel determines the compositing amount.

    use crate::fine::Splat4thExt;
    use crate::fine::lowp::compose::ComposeExt;
    use crate::fine::lowp::mix;
    use crate::peniko::{BlendMode, Mix};
    use vello_common::fearless_simd::*;
    use vello_common::util::normalized_mul_u8x32;

    /// Applies blend mode compositing to a buffer without per-pixel masks.
    pub(super) fn blend<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
        blend_mode: BlendMode,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                let default_mix = matches!(blend_mode.mix, Mix::Normal);
                for (next_dest, next_src) in dest.chunks_exact_mut(32).zip(src) {
                    let bg_v = u8x32::from_slice(simd, next_dest);
                    let src_v = if default_mix {
                        next_src
                    } else {
                        mix(next_src, bg_v, blend_mode)
                    };
                    let res = blend_mode.compose(simd, src_v, bg_v, None);
                    next_dest.copy_from_slice(res.as_slice());
                }
            },
        );
    }

    /// Composites a solid color onto a buffer using alpha blending.
    ///
    /// Uses the "over" operator: `result = src + bg * (1 - src_alpha)`
    pub(super) fn alpha_composite_solid<S: Simd>(s: S, dest: &mut [u8], src: [u8; 4]) {
        s.vectorize(
            #[inline(always)]
            || {
                let one_minus_alpha = 255 - u8x32::splat(s, src[3]);
                let src_c = u32x8::splat(s, u32::from_ne_bytes(src)).to_bytes();

                for next_dest in dest.chunks_exact_mut(64) {
                    // We process in batches of 64 because loading/storing is much faster this way (at least on NEON),
                    // but since we widen to u16, we can only work with 256 bits, so we split it up.
                    let bg_v = u8x64::from_slice(s, next_dest);
                    let (bg_1, bg_2) = s.split_u8x64(bg_v);
                    let res_1 = alpha_composite_inner(s, bg_1, src_c, one_minus_alpha);
                    let res_2 = alpha_composite_inner(s, bg_2, src_c, one_minus_alpha);
                    let combined = s.combine_u8x32(res_1, res_2);
                    next_dest.copy_from_slice(combined.as_slice());
                }
            },
        );
    }

    /// Composites a buffer of colors onto another buffer using alpha blending.
    ///
    /// Each source pixel is composited individually based on its alpha channel.
    pub(super) fn alpha_composite<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                for (next_dest, next_src) in dest.chunks_exact_mut(32).zip(src) {
                    let one_minus_alpha = 255 - next_src.splat_4th();
                    let bg_v = u8x32::from_slice(simd, next_dest);
                    let res = alpha_composite_inner(simd, bg_v, next_src, one_minus_alpha);
                    next_dest.copy_from_slice(res.as_slice());
                }
            },
        );
    }

    /// Performs the core alpha compositing calculation.
    ///
    /// Formula: `result = src + bg * (1 - src_alpha)`
    /// This implements the Porter-Duff "source over" operator.
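    ///
    /// Since `src` is premultiplied, `src_alpha == 255` fully replaces the
    /// background, while `src_alpha == 0` (and hence `src == 0`) leaves it
    /// unchanged.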
    #[inline(always)]
    fn alpha_composite_inner<S: Simd>(
        s: S,
        bg: u8x32<S>,
        src: u8x32<S>,
        one_minus_alpha: u8x32<S>,
    ) -> u8x32<S> {
        s.narrow_u16x32(normalized_mul_u8x32(bg, one_minus_alpha)) + src
    }
}

mod alpha_fill {
    //! Alpha compositing and blending operations with per-pixel alpha masks.
    //!
    //! This module handles compositing when each pixel has an additional mask value
    //! (e.g., from anti-aliasing or clip masks) that modulates the source alpha.

    use crate::fine::Splat4thExt;
    use crate::fine::lowp::compose::ComposeExt;
    use crate::fine::lowp::{extract_masks, mix};
    use crate::peniko::{BlendMode, Mix};
    use vello_common::fearless_simd::*;
    use vello_common::util::{Div255Ext, normalized_mul_u8x32};

    /// Applies blend mode compositing with per-pixel alpha masks.
    pub(super) fn blend<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
        blend_mode: BlendMode,
        alphas: impl Iterator<Item = [u8; 8]>,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                let default_mix = matches!(blend_mode.mix, Mix::Normal);

                for ((next_bg, next_mask), next_src) in
                    dest.chunks_exact_mut(32).zip(alphas).zip(src)
                {
                    let bg_v = u8x32::from_slice(simd, next_bg);
                    let src_c = if default_mix {
                        next_src
                    } else {
                        mix(next_src, bg_v, blend_mode)
                    };
                    let masks = extract_masks(simd, &next_mask);
                    let res = blend_mode.compose(simd, src_c, bg_v, Some(masks));

                    next_bg.copy_from_slice(res.as_slice());
                }
            },
        );
    }

    /// Composites a solid color with per-pixel alpha masks.
    ///
    /// Combines source alpha with mask values: `effective_alpha = src_alpha * mask / 255`
    #[inline(always)]
    pub(super) fn alpha_composite_solid<S: Simd>(
        s: S,
        dest: &mut [u8],
        src: [u8; 4],
        alphas: impl Iterator<Item = [u8; 8]>,
    ) {
        s.vectorize(
            #[inline(always)]
            || {
                let src_a = u8x32::splat(s, src[3]);
                let src_c = u32x8::splat(s, u32::from_ne_bytes(src)).to_bytes();
                let one = u8x32::splat(s, 255);

                for (next_bg, next_mask) in dest.chunks_exact_mut(32).zip(alphas) {
                    alpha_composite_inner(s, next_bg, &next_mask, src_c, src_a, one);
                }
            },
        );
    }

    /// Composites a buffer of colors with per-pixel alpha masks.
    ///
    /// Each pixel's source alpha is modulated by its corresponding mask value.
    #[inline(always)]
    pub(super) fn alpha_composite<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
        alphas: impl Iterator<Item = [u8; 8]>,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                let one = u8x32::splat(simd, 255);

                for ((next_dest, next_mask), next_src) in
                    dest.chunks_exact_mut(32).zip(alphas).zip(src)
                {
                    let src_a = next_src.splat_4th();
                    alpha_composite_inner(simd, next_dest, &next_mask, next_src, src_a, one);
                }
            },
        );
    }

    /// Performs alpha compositing with mask modulation.
    ///
    /// Formula: `result = src * mask + bg * (1 - src_alpha * mask)`
    /// The mask value modulates both the source contribution and the inverse alpha.
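    ///
    /// For example, with a fully opaque source (`src_alpha = 255`) and a mask
    /// value of 128, the result is approximately `(src + bg) / 2`.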
    #[inline(always)]
    fn alpha_composite_inner<S: Simd>(
        s: S,
        dest: &mut [u8],
        masks: &[u8; 8],
        src_c: u8x32<S>,
        src_a: u8x32<S>,
        one: u8x32<S>,
    ) {
        s.vectorize(
            #[inline(always)]
            || {
                let bg_v = u8x32::from_slice(s, dest);

                let mask_v = extract_masks(s, masks);
                let inv_src_a_mask_a = one - s.narrow_u16x32(normalized_mul_u8x32(src_a, mask_v));

                let p1 = s.widen_u8x32(bg_v) * s.widen_u8x32(inv_src_a_mask_a);
                let p2 = s.widen_u8x32(src_c) * s.widen_u8x32(mask_v);
                let res = s.narrow_u16x32((p1 + p2).div_255());

                dest.copy_from_slice(res.as_slice());
            },
        );
    }
}

/// Applies blend mode mixing by converting to f32, mixing, then converting back to u8.
///
/// TODO: Add a proper lowp mix pipeline that operates entirely in integer space
/// for better performance (currently converts through f32 which is slower).
fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMode) -> u8x32<S> {
    let to_f32 = |val: u8x32<S>| {
        let (a, b) = src_c.simd.split_u8x32(val);
        let mut a = u8_to_f32(a);
        let mut b = u8_to_f32(b);
        a *= f32x16::splat(src_c.simd, 1.0 / 255.0);
        b *= f32x16::splat(src_c.simd, 1.0 / 255.0);
        (a, b)
    };

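    // Scale back to [0, 255]; `madd` computes `255 * val + 0.5` so the
    // following f32 -> u8 conversion rounds to nearest (assuming the
    // conversion itself truncates).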
    let to_u8 = |val1: f32x16<S>, val2: f32x16<S>| {
        let val1 =
            f32_to_u8(f32x16::splat(val1.simd, 255.0).madd(val1, f32x16::splat(val1.simd, 0.5)));
        let val2 =
            f32_to_u8(f32x16::splat(val2.simd, 255.0).madd(val2, f32x16::splat(val2.simd, 0.5)));

        val1.simd.combine_u8x16(val1, val2)
    };

    let (mut src_1, mut src_2) = to_f32(src_c);
    let (bg_1, bg_2) = to_f32(bg_c);

    src_1 = highp::blend::mix(src_1, bg_1, blend_mode);
    src_2 = highp::blend::mix(src_2, bg_2, blend_mode);

    to_u8(src_1, src_2)
}

/// Expands 8 mask bytes into a 32-byte SIMD vector where each pixel's 4 components
/// share the same mask value (each of 8 mask values is repeated 4 times).
///
/// Input: `[m0, m1, m2, m3, m4, m5, m6, m7]`
/// Output: `[m0, m0, m0, m0, m1, m1, m1, m1, ..., m7, m7, m7, m7]`
#[inline(always)]
fn extract_masks<S: Simd>(simd: S, masks: &[u8; 8]) -> u8x32<S> {
    let m1 = u32x4::splat(simd, u32::from_ne_bytes(masks[0..4].try_into().unwrap())).to_bytes();
    let m2 = u32x4::splat(simd, u32::from_ne_bytes(masks[4..8].try_into().unwrap())).to_bytes();

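    // Zipping a vector with itself duplicates every byte of its low half, so
    // applying `zip_low` twice turns [m0, m1, m2, m3, ...] into
    // [m0, m0, m0, m0, m1, ...].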
    let zipped1 = m1.zip_low(m1);
    let zipped1 = zipped1.zip_low(zipped1);

    let zipped2 = m2.zip_low(m2);
    let zipped2 = zipped2.zip_low(zipped2);

    simd.combine_u8x16(zipped1, zipped2)
}

/// Copies color data from the scratch buffer to the output region (scalar fallback).
///
/// The scratch buffer stores pixels in column-major order for SIMD efficiency,
/// while the region uses row-major order for output.
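///
/// For example, with 4 color components and 4-pixel-tall tiles, the pixel at
/// `(x = 1, y = 2)` is read from byte offset `4 * (4 * 1 + 2) = 24`.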
#[inline(always)]
fn pack(region: &mut Region<'_>, blend_buf: &[u8]) {
    for y in 0..Tile::HEIGHT {
        for (x, pixel) in region
            .row_mut(y)
            .chunks_exact_mut(COLOR_COMPONENTS)
            .enumerate()
        {
            let idx = COLOR_COMPONENTS * (usize::from(Tile::HEIGHT) * x + usize::from(y));
            pixel.copy_from_slice(&blend_buf[idx..][..COLOR_COMPONENTS]);
        }
    }
}

/// Copies color data from the output region to the scratch buffer.
///
/// Converts from row-major (region) to column-major (scratch buffer) layout.
/// This is the inverse operation of `pack`.
#[inline(always)]
fn unpack(region: &mut Region<'_>, blend_buf: &mut [u8]) {
    for y in 0..Tile::HEIGHT {
        for (x, pixel) in region.row_mut(y).chunks_exact(COLOR_COMPONENTS).enumerate() {
            let idx = COLOR_COMPONENTS * (usize::from(Tile::HEIGHT) * x + usize::from(y));
            blend_buf[idx..][..COLOR_COMPONENTS].copy_from_slice(pixel);
        }
    }
}

/// SIMD-optimized version of `pack` for full-size tiles using interleaved loads.
///
/// Uses `load_interleaved_128` to efficiently transpose and copy data from column-major
/// scratch buffer to row-major output region. Performance characteristics are highly
/// architecture-dependent:
/// - On NEON: ~3x faster than scalar `pack`
/// - On fallback SIMD: ~3x slower than scalar `pack`
///
/// TODO: Consider runtime detection to fall back to scalar on non-NEON architectures.
#[inline(always)]
fn pack_block<S: Simd>(simd: S, region: &mut Region<'_>, mut buf: &[u8]) {
    buf = &buf[..SCRATCH_BUF_SIZE];

    const CHUNK_LENGTH: usize = 64;
    const SLICE_WIDTH: usize = WideTile::WIDTH as usize * COLOR_COMPONENTS;

    let region_areas = region.areas();
    let [s1, s2, s3, s4] = region_areas;
    let dest_slices: &mut [&mut [u8; SLICE_WIDTH]; 4] = &mut [
        (*s1).try_into().unwrap(),
        (*s2).try_into().unwrap(),
        (*s3).try_into().unwrap(),
        (*s4).try_into().unwrap(),
    ];

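    // Each 64-byte chunk of the scratch buffer holds a 4x4 block of pixels in
    // column-major order; the interleaved load transposes it so that each
    // 16-byte quarter of `loaded` is one output row's worth of four pixels.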
    for (idx, col) in buf.chunks_exact(CHUNK_LENGTH).enumerate() {
        let dest_idx = idx * CHUNK_LENGTH / 4;

        let casted: &[u32; 16] = cast_slice::<u8, u32>(col).try_into().unwrap();

        let loaded = simd.load_interleaved_128_u32x16(casted).to_bytes();
        dest_slices[0][dest_idx..][..16].copy_from_slice(&loaded.as_slice()[..16]);
        dest_slices[1][dest_idx..][..16].copy_from_slice(&loaded.as_slice()[16..32]);
        dest_slices[2][dest_idx..][..16].copy_from_slice(&loaded.as_slice()[32..48]);
        dest_slices[3][dest_idx..][..16].copy_from_slice(&loaded.as_slice()[48..64]);
    }
}