vello_cpu/fine/lowp/mod.rs

// Copyright 2025 the Vello Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! Low-precision (u8/u16) rendering kernel implementation.
//!
//! This module implements the fine rasterization stage using 8-bit unsigned integers
//! for color values and 16-bit integers for intermediate calculations. This provides better
//! performance on many architectures compared to floating-point operations, while
//! maintaining sufficient precision for most rendering tasks.
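//!
//! The recurring pattern throughout this module: premultiplied RGBA components
//! are widened from u8 to u16, multiplied, normalized with a division by 255,
//! and narrowed back to u8. For example, multiplying two half-intensity
//! components gives `128 * 128 / 255 ≈ 64`, matching the `0.5 * 0.5 = 0.25`
//! that a floating-point kernel would compute.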

mod compose;
mod gradient;
mod image;

use crate::filter::filter_lowp;
use crate::fine::lowp::image::{BilinearImagePainter, PlainBilinearImagePainter};
use crate::fine::{COLOR_COMPONENTS, Painter, SCRATCH_BUF_SIZE};
use crate::fine::{FineKernel, highp, u8_to_f32};
use crate::layer_manager::LayerManager;
use crate::peniko::BlendMode;
use crate::region::Region;
use crate::util::scalar::div_255;
use bytemuck::cast_slice;
use core::iter;
use vello_common::coarse::WideTile;
use vello_common::encode::{EncodedGradient, EncodedImage};
use vello_common::fearless_simd::*;
use vello_common::filter_effects::Filter;
use vello_common::kurbo::Affine;
use vello_common::mask::Mask;
use vello_common::paint::PremulColor;
use vello_common::pixmap::Pixmap;
use vello_common::tile::Tile;
use vello_common::util::{Div255Ext, f32_to_u8};

/// The kernel for rendering using u8/u16 arithmetic.
#[derive(Clone, Copy, Debug)]
pub struct U8Kernel;

impl<S: Simd> FineKernel<S> for U8Kernel {
    type Numeric = u8;
    type Composite = u8x32<S>;
    type NumericVec = u8x16<S>;

    /// Extracts RGBA color components from a premultiplied color as u8 values in [0, 255].
    #[inline]
    fn extract_color(color: PremulColor) -> [Self::Numeric; 4] {
        color.as_premul_rgba8().to_u8_array()
    }

    /// Copies rendered pixels from the scratch buffer to the output region.
    ///
    /// Converts from column-major scratch buffer layout to row-major region layout,
    /// using either a SIMD-optimized path for full tiles or a scalar fallback.
    #[inline(always)]
    fn pack(simd: S, region: &mut Region<'_>, blend_buf: &[Self::Numeric]) {
        if region.width != WideTile::WIDTH || region.height != Tile::HEIGHT {
            // Use scalar path for non-standard tile sizes. Wrapping this in `vectorize`
            // degrades performance significantly on SSE4.2.
            pack(region, blend_buf);
        } else {
            simd.vectorize(
                #[inline(always)]
                || {
                    pack_block(simd, region, blend_buf);
                },
            );
        }
    }

    /// Copies pixels from the output region to the scratch buffer.
    ///
    /// Converts from row-major region layout to column-major scratch buffer layout.
    /// This is the inverse operation of `pack`.
    #[inline(always)]
    fn unpack(simd: S, region: &mut Region<'_>, blend_buf: &mut [Self::Numeric]) {
        simd.vectorize(
            #[inline(always)]
            || {
                unpack(region, blend_buf);
            },
        );
    }

    /// Applies a filter effect to a rendered layer.
    ///
    /// Delegates to the u8-specific filter implementation.
    fn filter_layer(
        pixmap: &mut Pixmap,
        filter: &Filter,
        layer_manager: &mut LayerManager,
        transform: Affine,
    ) {
        filter_lowp(filter, pixmap, layer_manager, transform);
    }

    /// Fills a buffer with a solid color using SIMD operations.
    ///
    /// Efficiently broadcasts a single RGBA color across all pixels in the destination.
    fn copy_solid(simd: S, dest: &mut [Self::Numeric], src: [Self::Numeric; 4]) {
        simd.vectorize(
            #[inline(always)]
            || {
                let color =
                    u8x64::block_splat(u32x4::splat(simd, u32::from_ne_bytes(src)).to_bytes());

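                // The splatted `u32` repeats the 4-byte RGBA pattern 16 times,
                // so each 64-byte store below writes 16 pixels at once.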
                for el in dest.chunks_exact_mut(64) {
                    el.copy_from_slice(color.as_slice());
                }
            },
        );
    }

    /// Creates a painter for rendering gradients in u8 precision.
    ///
    /// Returns a painter that evaluates the gradient at each pixel position.
    fn gradient_painter<'a>(
        simd: S,
        gradient: &'a EncodedGradient,
        t_vals: &'a [f32],
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || gradient::GradientPainter::new(simd, gradient, t_vals),
        )
    }

    /// Creates a painter for rendering images with bilinear filtering in u8 precision.
    ///
    /// Returns a painter that samples the image with bilinear interpolation.
    fn medium_quality_image_painter<'a>(
        simd: S,
        image: &'a EncodedImage,
        pixmap: &'a Pixmap,
        start_x: u16,
        start_y: u16,
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || BilinearImagePainter::new(simd, image, pixmap, start_x, start_y),
        )
    }

    /// Creates a painter for rendering axis-aligned images with bilinear filtering in u8 precision.
    ///
    /// Returns a painter that samples the image with bilinear interpolation.
    /// This is an optimized version for images without skew transformation.
    fn plain_medium_quality_image_painter<'a>(
        simd: S,
        image: &'a EncodedImage,
        pixmap: &'a Pixmap,
        start_x: u16,
        start_y: u16,
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || PlainBilinearImagePainter::new(simd, image, pixmap, start_x, start_y),
        )
    }

    /// Applies per-pixel mask values to a buffer by multiplying each component.
    ///
    /// Used for anti-aliasing and clipping effects. Each pixel is multiplied by
    /// its corresponding mask value and normalized.
    fn apply_mask(
        simd: S,
        dest: &mut [Self::Numeric],
        mut src: impl Iterator<Item = Self::NumericVec>,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
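                // Process 16 bytes (4 pixels) per iteration: widen to u16,
                // multiply by the mask values, renormalize with a division by
                // 255, and narrow back to u8.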
                for el in dest.chunks_exact_mut(16) {
                    let loaded = u8x16::from_slice(simd, el);
                    let mulled = simd.narrow_u16x16(
                        (simd.widen_u8x16(loaded) * simd.widen_u8x16(src.next().unwrap()))
                            .div_255(),
                    );
                    el.copy_from_slice(mulled.as_slice());
                }
            },
        );
    }

    /// Applies a painter's output to the destination buffer.
    ///
    /// Delegates to the painter's u8-specific implementation.
    #[inline(always)]
    fn apply_painter<'a>(_: S, dest: &mut [Self::Numeric], mut painter: impl Painter + 'a) {
        painter.paint_u8(dest);
    }

    /// Composites a solid color onto a buffer using alpha blending.
    ///
    /// Dispatches to either the masked or unmasked implementation based on the
    /// presence of per-pixel alpha masks.
    #[inline(always)]
    fn alpha_composite_solid(
        simd: S,
        dest: &mut [Self::Numeric],
        src: [Self::Numeric; 4],
        alphas: Option<&[u8]>,
    ) {
        if let Some(alphas) = alphas {
            alpha_fill::alpha_composite_solid(
                simd,
                dest,
                src,
                bytemuck::cast_slice::<u8, [u8; 8]>(alphas).iter().copied(),
            );
        } else {
            fill::alpha_composite_solid(simd, dest, src);
        }
    }

    /// Composites a source buffer onto a destination buffer using alpha blending.
    ///
    /// Dispatches to either the masked or unmasked implementation based on the
    /// presence of per-pixel alpha masks. Each source pixel's alpha determines
    /// the blending amount.
    fn alpha_composite_buffer(
        simd: S,
        dest: &mut [Self::Numeric],
        src: &[Self::Numeric],
        alphas: Option<&[u8]>,
    ) {
        let src_iter = src.chunks_exact(32).map(|el| u8x32::from_slice(simd, el));

        if let Some(alphas) = alphas {
            alpha_fill::alpha_composite(
                simd,
                dest,
                src_iter,
                bytemuck::cast_slice::<u8, [u8; 8]>(alphas).iter().copied(),
            );
        } else {
            fill::alpha_composite(simd, dest, src_iter);
        }
    }

    /// Applies a blend mode to composite source pixels onto destination.
    ///
    /// Dispatches to either the masked or unmasked blend implementation.
    /// Handles both color mixing (multiply, screen, etc.) and compositing.
    fn blend(
        simd: S,
        dest: &mut [Self::Numeric],
        mut start_x: u16,
        start_y: u16,
        src: impl Iterator<Item = Self::Composite>,
        blend_mode: BlendMode,
        alphas: Option<&[u8]>,
        mask: Option<&Mask>,
    ) {
        let alpha_iter = alphas.map(|a| bytemuck::cast_slice::<u8, [u8; 8]>(a).iter().copied());

        let mask_iter = mask.map(|m| {
            iter::from_fn(|| {
                let sample = |x: u16, y: u16| {
                    if x < m.width() && y < m.height() {
                        m.sample(x, y)
                    } else {
                        255
                    }
                };

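                // Each u8x32 chunk covers two adjacent columns of the
                // four-pixel-tall tile in column-major order, so gather
                // 2 x 4 mask samples per iteration; positions outside the
                // mask count as fully opaque.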
                let samples = [
                    sample(start_x, start_y),
                    sample(start_x, start_y + 1),
                    sample(start_x, start_y + 2),
                    sample(start_x, start_y + 3),
                    sample(start_x + 1, start_y),
                    sample(start_x + 1, start_y + 1),
                    sample(start_x + 1, start_y + 2),
                    sample(start_x + 1, start_y + 3),
                ];

                start_x += 2;

                Some(samples)
            })
        });

        match (alpha_iter, mask_iter) {
            (Some(alpha_iter), Some(mut mask_iter)) => {
                let iter = alpha_iter.map(|a1| {
                    let a2 = mask_iter.next().unwrap();
                    [
                        div_255(a1[0] as u16 * a2[0] as u16) as u8,
                        div_255(a1[1] as u16 * a2[1] as u16) as u8,
                        div_255(a1[2] as u16 * a2[2] as u16) as u8,
                        div_255(a1[3] as u16 * a2[3] as u16) as u8,
                        div_255(a1[4] as u16 * a2[4] as u16) as u8,
                        div_255(a1[5] as u16 * a2[5] as u16) as u8,
                        div_255(a1[6] as u16 * a2[6] as u16) as u8,
                        div_255(a1[7] as u16 * a2[7] as u16) as u8,
                    ]
                });
                alpha_fill::blend(simd, dest, src, blend_mode, iter);
            }
            (None, Some(mask_iter)) => alpha_fill::blend(simd, dest, src, blend_mode, mask_iter),
            (Some(alpha_iter), None) => alpha_fill::blend(simd, dest, src, blend_mode, alpha_iter),
            (None, None) => {
                fill::blend(simd, dest, src, blend_mode);
            }
        }
    }
}

mod fill {
    //! Alpha compositing and blending operations without per-pixel alpha masks.
    //!
    //! This module handles the case where there is no per-pixel coverage mask,
    //! so only the source alpha channel determines the compositing amount.

    use crate::fine::Splat4thExt;
    use crate::fine::lowp::compose::ComposeExt;
    use crate::fine::lowp::mix;
    use crate::peniko::{BlendMode, Mix};
    use vello_common::fearless_simd::*;
    use vello_common::util::normalized_mul_u8x32;

    /// Applies blend mode compositing to a buffer without per-pixel masks.
    pub(super) fn blend<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
        blend_mode: BlendMode,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                let default_mix = matches!(blend_mode.mix, Mix::Normal);
                for (next_dest, next_src) in dest.chunks_exact_mut(32).zip(src) {
                    let bg_v = u8x32::from_slice(simd, next_dest);
                    let src_v = if default_mix {
                        next_src
                    } else {
                        mix(next_src, bg_v, blend_mode)
                    };
                    let res = blend_mode.compose(simd, src_v, bg_v, None);
                    next_dest.copy_from_slice(res.as_slice());
                }
            },
        );
    }

    /// Composites a solid color onto a buffer using alpha blending.
    ///
    /// Uses the "over" operator: `result = src + bg * (1 - src_alpha)`
    pub(super) fn alpha_composite_solid<S: Simd>(s: S, dest: &mut [u8], src: [u8; 4]) {
        s.vectorize(
            #[inline(always)]
            || {
                let one_minus_alpha = 255 - u8x32::splat(s, src[3]);
                let src_c = u32x8::splat(s, u32::from_ne_bytes(src)).to_bytes();

                for next_dest in dest.chunks_exact_mut(64) {
                    // We process in batches of 64 because loading/storing is much faster this way (at least on NEON),
                    // but since we widen to u16, we can only work with 256 bits, so we split it up.
                    let bg_v = u8x64::from_slice(s, next_dest);
                    let (bg_1, bg_2) = s.split_u8x64(bg_v);
                    let res_1 = alpha_composite_inner(s, bg_1, src_c, one_minus_alpha);
                    let res_2 = alpha_composite_inner(s, bg_2, src_c, one_minus_alpha);
                    let combined = s.combine_u8x32(res_1, res_2);
                    next_dest.copy_from_slice(combined.as_slice());
                }
            },
        );
    }

    /// Composites a buffer of colors onto another buffer using alpha blending.
    ///
    /// Each source pixel is composited individually based on its alpha channel.
    pub(super) fn alpha_composite<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                for (next_dest, next_src) in dest.chunks_exact_mut(32).zip(src) {
                    let one_minus_alpha = 255 - next_src.splat_4th();
                    let bg_v = u8x32::from_slice(simd, next_dest);
                    let res = alpha_composite_inner(simd, bg_v, next_src, one_minus_alpha);
                    next_dest.copy_from_slice(res.as_slice());
                }
            },
        );
    }

    /// Performs the core alpha compositing calculation.
    ///
    /// Formula: `result = src + bg * (1 - src_alpha)`
    /// This implements the Porter-Duff "source over" operator.
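    ///
    /// Since `src` is premultiplied, `src_alpha == 255` fully replaces the
    /// background, while `src_alpha == 0` (and hence `src == 0`) leaves it
    /// unchanged.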
    #[inline(always)]
    fn alpha_composite_inner<S: Simd>(
        s: S,
        bg: u8x32<S>,
        src: u8x32<S>,
        one_minus_alpha: u8x32<S>,
    ) -> u8x32<S> {
        s.narrow_u16x32(normalized_mul_u8x32(bg, one_minus_alpha)) + src
    }
}

mod alpha_fill {
    //! Alpha compositing and blending operations with per-pixel alpha masks.
    //!
    //! This module handles compositing when each pixel has an additional mask value
    //! (e.g., from anti-aliasing or clip masks) that modulates the source alpha.

    use crate::fine::Splat4thExt;
    use crate::fine::lowp::compose::ComposeExt;
    use crate::fine::lowp::{extract_masks, mix};
    use crate::peniko::{BlendMode, Mix};
    use vello_common::fearless_simd::*;
    use vello_common::util::{Div255Ext, normalized_mul_u8x32};

    /// Applies blend mode compositing with per-pixel alpha masks.
    pub(super) fn blend<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
        blend_mode: BlendMode,
        alphas: impl Iterator<Item = [u8; 8]>,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                let default_mix = matches!(blend_mode.mix, Mix::Normal);

                for ((next_bg, next_mask), next_src) in
                    dest.chunks_exact_mut(32).zip(alphas).zip(src)
                {
                    let bg_v = u8x32::from_slice(simd, next_bg);
                    let src_c = if default_mix {
                        next_src
                    } else {
                        mix(next_src, bg_v, blend_mode)
                    };
                    let masks = extract_masks(simd, &next_mask);
                    let res = blend_mode.compose(simd, src_c, bg_v, Some(masks));

                    next_bg.copy_from_slice(res.as_slice());
                }
            },
        );
    }

    /// Composites a solid color with per-pixel alpha masks.
    ///
    /// Combines source alpha with mask values: `effective_alpha = src_alpha * mask / 255`
    #[inline(always)]
    pub(super) fn alpha_composite_solid<S: Simd>(
        s: S,
        dest: &mut [u8],
        src: [u8; 4],
        alphas: impl Iterator<Item = [u8; 8]>,
    ) {
        s.vectorize(
            #[inline(always)]
            || {
                let src_a = u8x32::splat(s, src[3]);
                let src_c = u32x8::splat(s, u32::from_ne_bytes(src)).to_bytes();
                let one = u8x32::splat(s, 255);

                for (next_bg, next_mask) in dest.chunks_exact_mut(32).zip(alphas) {
                    alpha_composite_inner(s, next_bg, &next_mask, src_c, src_a, one);
                }
            },
        );
    }

    /// Composites a buffer of colors with per-pixel alpha masks.
    ///
    /// Each pixel's source alpha is modulated by its corresponding mask value.
    #[inline(always)]
    pub(super) fn alpha_composite<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
        alphas: impl Iterator<Item = [u8; 8]>,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                let one = u8x32::splat(simd, 255);

                for ((next_dest, next_mask), next_src) in
                    dest.chunks_exact_mut(32).zip(alphas).zip(src)
                {
                    let src_a = next_src.splat_4th();
                    alpha_composite_inner(simd, next_dest, &next_mask, next_src, src_a, one);
                }
            },
        );
    }

    /// Performs alpha compositing with mask modulation.
    ///
    /// Formula: `result = src * mask + bg * (1 - src_alpha * mask)`
    /// The mask value modulates both the source contribution and the inverse alpha.
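    ///
    /// For example, with a fully opaque source (`src_alpha = 255`) and a mask
    /// value of 128, the result is approximately `(src + bg) / 2`.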
    #[inline(always)]
    fn alpha_composite_inner<S: Simd>(
        s: S,
        dest: &mut [u8],
        masks: &[u8; 8],
        src_c: u8x32<S>,
        src_a: u8x32<S>,
        one: u8x32<S>,
    ) {
        s.vectorize(
            #[inline(always)]
            || {
                let bg_v = u8x32::from_slice(s, dest);

                let mask_v = extract_masks(s, masks);
                let inv_src_a_mask_a = one - s.narrow_u16x32(normalized_mul_u8x32(src_a, mask_v));

                let p1 = s.widen_u8x32(bg_v) * s.widen_u8x32(inv_src_a_mask_a);
                let p2 = s.widen_u8x32(src_c) * s.widen_u8x32(mask_v);
                let res = s.narrow_u16x32((p1 + p2).div_255());

                dest.copy_from_slice(res.as_slice());
            },
        );
    }
}

/// Applies blend mode mixing by converting to f32, mixing, then converting back to u8.
///
/// TODO: Add a proper lowp mix pipeline that operates entirely in integer space
/// for better performance (currently converts through f32 which is slower).
fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMode) -> u8x32<S> {
    let to_f32 = |val: u8x32<S>| {
        let (a, b) = src_c.simd.split_u8x32(val);
        let mut a = u8_to_f32(a);
        let mut b = u8_to_f32(b);
        a *= f32x16::splat(src_c.simd, 1.0 / 255.0);
        b *= f32x16::splat(src_c.simd, 1.0 / 255.0);
        (a, b)
    };

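    // Scale back to [0, 255]; `madd` computes `255 * val + 0.5` so the
    // following f32 -> u8 conversion rounds to nearest (assuming the
    // conversion itself truncates).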
    let to_u8 = |val1: f32x16<S>, val2: f32x16<S>| {
        let val1 =
            f32_to_u8(f32x16::splat(val1.simd, 255.0).madd(val1, f32x16::splat(val1.simd, 0.5)));
        let val2 =
            f32_to_u8(f32x16::splat(val2.simd, 255.0).madd(val2, f32x16::splat(val2.simd, 0.5)));

        val1.simd.combine_u8x16(val1, val2)
    };

    let (mut src_1, mut src_2) = to_f32(src_c);
    let (bg_1, bg_2) = to_f32(bg_c);

    src_1 = highp::blend::mix(src_1, bg_1, blend_mode);
    src_2 = highp::blend::mix(src_2, bg_2, blend_mode);

    to_u8(src_1, src_2)
}

/// Expands 8 mask bytes into a 32-byte SIMD vector where each pixel's 4 components
/// share the same mask value (each of 8 mask values is repeated 4 times).
///
/// Input: `[m0, m1, m2, m3, m4, m5, m6, m7]`
/// Output: `[m0, m0, m0, m0, m1, m1, m1, m1, ..., m7, m7, m7, m7]`
#[inline(always)]
fn extract_masks<S: Simd>(simd: S, masks: &[u8; 8]) -> u8x32<S> {
    let m1 = u32x4::splat(simd, u32::from_ne_bytes(masks[0..4].try_into().unwrap())).to_bytes();
    let m2 = u32x4::splat(simd, u32::from_ne_bytes(masks[4..8].try_into().unwrap())).to_bytes();

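    // Zipping a vector with itself duplicates every byte of its low half, so
    // applying `zip_low` twice turns [m0, m1, m2, m3, ...] into
    // [m0, m0, m0, m0, m1, ...].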
    let zipped1 = m1.zip_low(m1);
    let zipped1 = zipped1.zip_low(zipped1);

    let zipped2 = m2.zip_low(m2);
    let zipped2 = zipped2.zip_low(zipped2);

    simd.combine_u8x16(zipped1, zipped2)
}

/// Copies color data from the scratch buffer to the output region (scalar fallback).
///
/// The scratch buffer stores pixels in column-major order for SIMD efficiency,
/// while the region uses row-major order for output.
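///
/// For example, with 4 color components and 4-pixel-tall tiles, the pixel at
/// `(x = 1, y = 2)` is read from byte offset `4 * (4 * 1 + 2) = 24`.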
#[inline(always)]
fn pack(region: &mut Region<'_>, blend_buf: &[u8]) {
    for y in 0..Tile::HEIGHT {
        for (x, pixel) in region
            .row_mut(y)
            .chunks_exact_mut(COLOR_COMPONENTS)
            .enumerate()
        {
            let idx = COLOR_COMPONENTS * (usize::from(Tile::HEIGHT) * x + usize::from(y));
            pixel.copy_from_slice(&blend_buf[idx..][..COLOR_COMPONENTS]);
        }
    }
}

/// Copies color data from the output region to the scratch buffer.
///
/// Converts from row-major (region) to column-major (scratch buffer) layout.
/// This is the inverse operation of `pack`.
#[inline(always)]
fn unpack(region: &mut Region<'_>, blend_buf: &mut [u8]) {
    for y in 0..Tile::HEIGHT {
        for (x, pixel) in region.row_mut(y).chunks_exact(COLOR_COMPONENTS).enumerate() {
            let idx = COLOR_COMPONENTS * (usize::from(Tile::HEIGHT) * x + usize::from(y));
            blend_buf[idx..][..COLOR_COMPONENTS].copy_from_slice(pixel);
        }
    }
}

/// SIMD-optimized version of `pack` for full-size tiles using interleaved loads.
///
/// Uses `load_interleaved_128` to efficiently transpose and copy data from column-major
/// scratch buffer to row-major output region. Performance characteristics are highly
/// architecture-dependent:
/// - On NEON: ~3x faster than scalar `pack`
/// - On fallback SIMD: ~3x slower than scalar `pack`
///
/// TODO: Consider runtime detection to fall back to scalar on non-NEON architectures.
#[inline(always)]
fn pack_block<S: Simd>(simd: S, region: &mut Region<'_>, mut buf: &[u8]) {
    buf = &buf[..SCRATCH_BUF_SIZE];

    const CHUNK_LENGTH: usize = 64;
    const SLICE_WIDTH: usize = WideTile::WIDTH as usize * COLOR_COMPONENTS;

    let region_areas = region.areas();
    let [s1, s2, s3, s4] = region_areas;
    let dest_slices: &mut [&mut [u8; SLICE_WIDTH]; 4] = &mut [
        (*s1).try_into().unwrap(),
        (*s2).try_into().unwrap(),
        (*s3).try_into().unwrap(),
        (*s4).try_into().unwrap(),
    ];

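    // Each 64-byte chunk of the scratch buffer holds a 4x4 block of pixels in
    // column-major order; the interleaved load transposes it so that each
    // 16-byte quarter of `loaded` is one output row's worth of four pixels.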
    for (idx, col) in buf.chunks_exact(CHUNK_LENGTH).enumerate() {
        let dest_idx = idx * CHUNK_LENGTH / 4;

        let casted: &[u32; 16] = cast_slice::<u8, u32>(col).try_into().unwrap();

        let loaded = simd.load_interleaved_128_u32x16(casted).to_bytes();
        dest_slices[0][dest_idx..][..16].copy_from_slice(&loaded.as_slice()[..16]);
        dest_slices[1][dest_idx..][..16].copy_from_slice(&loaded.as_slice()[16..32]);
        dest_slices[2][dest_idx..][..16].copy_from_slice(&loaded.as_slice()[32..48]);
        dest_slices[3][dest_idx..][..16].copy_from_slice(&loaded.as_slice()[48..64]);
    }
}