1mod compose;
12mod gradient;
13mod image;
14
15use crate::filter::filter_lowp;
16use crate::fine::lowp::image::{BilinearImagePainter, PlainBilinearImagePainter};
17use crate::fine::{COLOR_COMPONENTS, Painter, SCRATCH_BUF_SIZE};
18use crate::fine::{FineKernel, highp, u8_to_f32};
19use crate::layer_manager::LayerManager;
20use crate::peniko::BlendMode;
21use crate::region::Region;
22use crate::util::scalar::div_255;
23use bytemuck::cast_slice;
24use core::iter;
25use vello_common::coarse::WideTile;
26use vello_common::encode::{EncodedGradient, EncodedImage};
27use vello_common::fearless_simd::*;
28use vello_common::filter_effects::Filter;
29use vello_common::kurbo::Affine;
30use vello_common::mask::Mask;
31use vello_common::paint::PremulColor;
32use vello_common::pixmap::Pixmap;
33use vello_common::tile::Tile;
34use vello_common::util::{Div255Ext, f32_to_u8};
35
/// Fine-rasterization kernel operating on 8-bit premultiplied RGBA ("low
/// precision" pipeline). All state lives in the generic `FineKernel` impl;
/// the type itself is a zero-sized marker.
#[derive(Clone, Copy, Debug)]
pub struct U8Kernel;
39
/// `FineKernel` implementation over 8-bit premultiplied RGBA, parameterized
/// over a `fearless_simd` SIMD level `S`.
impl<S: Simd> FineKernel<S> for U8Kernel {
    /// One color component is a single byte.
    type Numeric = u8;
    /// Compositing operates on 32 lanes (8 RGBA pixels) at a time.
    type Composite = u8x32<S>;
    /// Mask application operates on 16 lanes (4 RGBA pixels) at a time.
    type NumericVec = u8x16<S>;

    #[inline]
    fn extract_color(color: PremulColor) -> [Self::Numeric; 4] {
        // Premultiplied RGBA, one byte per channel.
        color.as_premul_rgba8().to_u8_array()
    }

    /// Write the column-major blend buffer out into the destination region.
    #[inline(always)]
    fn pack(simd: S, region: &mut Region<'_>, blend_buf: &[Self::Numeric]) {
        if region.width != WideTile::WIDTH || region.height != Tile::HEIGHT {
            // Partial region (e.g. a clipped edge tile): scalar fallback.
            pack(region, blend_buf);
        } else {
            // Full wide tile: SIMD block transpose.
            simd.vectorize(
                #[inline(always)]
                || {
                    pack_block(simd, region, blend_buf);
                },
            );
        }
    }

    /// Read the destination region back into the column-major blend buffer
    /// (inverse of `pack`).
    #[inline(always)]
    fn unpack(simd: S, region: &mut Region<'_>, blend_buf: &mut [Self::Numeric]) {
        simd.vectorize(
            #[inline(always)]
            || {
                unpack(region, blend_buf);
            },
        );
    }

    /// Apply a filter effect to a rasterized layer via the low-precision
    /// filter pipeline.
    fn filter_layer(
        pixmap: &mut Pixmap,
        filter: &Filter,
        layer_manager: &mut LayerManager,
        transform: Affine,
    ) {
        filter_lowp(filter, pixmap, layer_manager, transform);
    }

    /// Fill `dest` with a single premultiplied RGBA color.
    fn copy_solid(simd: S, dest: &mut [Self::Numeric], src: [Self::Numeric; 4]) {
        simd.vectorize(
            #[inline(always)]
            || {
                // Broadcast the 4-byte color across a 64-byte vector
                // (16 pixels) and store it repeatedly.
                let color =
                    u8x64::block_splat(u32x4::splat(simd, u32::from_ne_bytes(src)).to_bytes());

                for el in dest.chunks_exact_mut(64) {
                    el.copy_from_slice(color.as_slice());
                }
            },
        );
    }

    /// Painter producing gradient colors from precomputed `t_vals`.
    fn gradient_painter<'a>(
        simd: S,
        gradient: &'a EncodedGradient,
        t_vals: &'a [f32],
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || gradient::GradientPainter::new(simd, gradient, t_vals),
        )
    }

    /// Painter sampling `pixmap` with bilinear filtering ("medium" quality).
    fn medium_quality_image_painter<'a>(
        simd: S,
        image: &'a EncodedImage,
        pixmap: &'a Pixmap,
        start_x: u16,
        start_y: u16,
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || BilinearImagePainter::new(simd, image, pixmap, start_x, start_y),
        )
    }

    /// Variant of the bilinear painter for the "plain" (untransformed)
    /// image case.
    fn plain_medium_quality_image_painter<'a>(
        simd: S,
        image: &'a EncodedImage,
        pixmap: &'a Pixmap,
        start_x: u16,
        start_y: u16,
    ) -> impl Painter + 'a {
        simd.vectorize(
            #[inline(always)]
            || PlainBilinearImagePainter::new(simd, image, pixmap, start_x, start_y),
        )
    }

    /// Multiply each component in `dest` by the matching mask lane from
    /// `src` (normalized u8 multiplication, i.e. `dest * mask / 255`).
    fn apply_mask(
        simd: S,
        dest: &mut [Self::Numeric],
        mut src: impl Iterator<Item = Self::NumericVec>,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                for el in dest.chunks_exact_mut(16) {
                    let loaded = u8x16::from_slice(simd, el);
                    // Widen to u16 so the product can't overflow, divide by
                    // 255, then narrow back to u8.
                    let mulled = simd.narrow_u16x16(
                        (simd.widen_u8x16(loaded) * simd.widen_u8x16(src.next().unwrap()))
                            .div_255(),
                    );
                    el.copy_from_slice(mulled.as_slice());
                }
            },
        );
    }

    /// Run a painter directly over the u8 destination buffer.
    #[inline(always)]
    fn apply_painter<'a>(_: S, dest: &mut [Self::Numeric], mut painter: impl Painter + 'a) {
        painter.paint_u8(dest);
    }

    /// Source-over composite a solid color, optionally modulated by
    /// per-pixel coverage `alphas` (consumed in groups of 8).
    #[inline(always)]
    fn alpha_composite_solid(
        simd: S,
        dest: &mut [Self::Numeric],
        src: [Self::Numeric; 4],
        alphas: Option<&[u8]>,
    ) {
        if let Some(alphas) = alphas {
            alpha_fill::alpha_composite_solid(
                simd,
                dest,
                src,
                bytemuck::cast_slice::<u8, [u8; 8]>(alphas).iter().copied(),
            );
        } else {
            fill::alpha_composite_solid(simd, dest, src);
        }
    }

    /// Source-over composite a source buffer, optionally modulated by
    /// per-pixel coverage `alphas`.
    fn alpha_composite_buffer(
        simd: S,
        dest: &mut [Self::Numeric],
        src: &[Self::Numeric],
        alphas: Option<&[u8]>,
    ) {
        // 32 bytes = 8 premultiplied RGBA pixels per composite vector.
        let src_iter = src.chunks_exact(32).map(|el| u8x32::from_slice(simd, el));

        if let Some(alphas) = alphas {
            alpha_fill::alpha_composite(
                simd,
                dest,
                src_iter,
                bytemuck::cast_slice::<u8, [u8; 8]>(alphas).iter().copied(),
            );
        } else {
            fill::alpha_composite(simd, dest, src_iter);
        }
    }

    /// Blend `src` into `dest` with an arbitrary blend mode, combining the
    /// optional coverage `alphas` with an optional clip `mask`.
    fn blend(
        simd: S,
        dest: &mut [Self::Numeric],
        mut start_x: u16,
        start_y: u16,
        src: impl Iterator<Item = Self::Composite>,
        blend_mode: BlendMode,
        alphas: Option<&[u8]>,
        mask: Option<&Mask>,
    ) {
        // Coverage values are consumed in groups of 8 (2 columns x 4 rows),
        // matching one 8-pixel composite vector.
        let alpha_iter = alphas.map(|a| bytemuck::cast_slice::<u8, [u8; 8]>(a).iter().copied());

        let mask_iter = mask.map(|m| {
            iter::from_fn(|| {
                // Outside the mask's bounds, treat coverage as fully opaque.
                let sample = |x: u16, y: u16| {
                    if x < m.width() && y < m.height() {
                        m.sample(x, y)
                    } else {
                        255
                    }
                };

                // Column-major sampling: two columns of 4 rows each,
                // mirroring the layout of the `[u8; 8]` alpha groups.
                let samples = [
                    sample(start_x, start_y),
                    sample(start_x, start_y + 1),
                    sample(start_x, start_y + 2),
                    sample(start_x, start_y + 3),
                    sample(start_x + 1, start_y),
                    sample(start_x + 1, start_y + 1),
                    sample(start_x + 1, start_y + 2),
                    sample(start_x + 1, start_y + 3),
                ];

                start_x += 2;

                Some(samples)
            })
        });

        match (alpha_iter, mask_iter) {
            (Some(alpha_iter), Some(mut mask_iter)) => {
                // Both present: multiply coverage and mask per sample
                // (normalized u8 product).
                let iter = alpha_iter.map(|a1| {
                    let a2 = mask_iter.next().unwrap();
                    [
                        div_255(a1[0] as u16 * a2[0] as u16) as u8,
                        div_255(a1[1] as u16 * a2[1] as u16) as u8,
                        div_255(a1[2] as u16 * a2[2] as u16) as u8,
                        div_255(a1[3] as u16 * a2[3] as u16) as u8,
                        div_255(a1[4] as u16 * a2[4] as u16) as u8,
                        div_255(a1[5] as u16 * a2[5] as u16) as u8,
                        div_255(a1[6] as u16 * a2[6] as u16) as u8,
                        div_255(a1[7] as u16 * a2[7] as u16) as u8,
                    ]
                });
                alpha_fill::blend(simd, dest, src, blend_mode, iter);
            }
            (None, Some(mask_iter)) => alpha_fill::blend(simd, dest, src, blend_mode, mask_iter),
            (Some(alpha_iter), None) => alpha_fill::blend(simd, dest, src, blend_mode, alpha_iter),
            (None, None) => {
                fill::blend(simd, dest, src, blend_mode);
            }
        }
    }
}
309
/// Compositing routines for spans with full coverage (no per-pixel alpha
/// mask). Counterpart of [`alpha_fill`].
mod fill {
    use crate::fine::Splat4thExt;
    use crate::fine::lowp::compose::ComposeExt;
    use crate::fine::lowp::mix;
    use crate::peniko::{BlendMode, Mix};
    use vello_common::fearless_simd::*;
    use vello_common::util::normalized_mul_u8x32;

    /// Blend `src` into `dest` with an arbitrary blend mode, 8 pixels
    /// (32 bytes) per iteration.
    pub(super) fn blend<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
        blend_mode: BlendMode,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                // `Mix::Normal` needs no mixing stage, so skip the f32
                // round-trip in `mix` for that common case.
                let default_mix = matches!(blend_mode.mix, Mix::Normal);
                for (next_dest, next_src) in dest.chunks_exact_mut(32).zip(src) {
                    let bg_v = u8x32::from_slice(simd, next_dest);
                    let src_v = if default_mix {
                        next_src
                    } else {
                        mix(next_src, bg_v, blend_mode)
                    };
                    let res = blend_mode.compose(simd, src_v, bg_v, None);
                    next_dest.copy_from_slice(res.as_slice());
                }
            },
        );
    }

    /// Source-over composite a single solid color onto `dest`, 16 pixels
    /// (64 bytes) per iteration.
    pub(super) fn alpha_composite_solid<S: Simd>(s: S, dest: &mut [u8], src: [u8; 4]) {
        s.vectorize(
            #[inline(always)]
            || {
                let one_minus_alpha = 255 - u8x32::splat(s, src[3]);
                // The solid color replicated across 8 pixels (32 bytes).
                let src_c = u32x8::splat(s, u32::from_ne_bytes(src)).to_bytes();

                for next_dest in dest.chunks_exact_mut(64) {
                    // Process a 64-byte chunk as two 32-lane halves.
                    let bg_v = u8x64::from_slice(s, next_dest);
                    let (bg_1, bg_2) = s.split_u8x64(bg_v);
                    let res_1 = alpha_composite_inner(s, bg_1, src_c, one_minus_alpha);
                    let res_2 = alpha_composite_inner(s, bg_2, src_c, one_minus_alpha);
                    let combined = s.combine_u8x32(res_1, res_2);
                    next_dest.copy_from_slice(combined.as_slice());
                }
            },
        );
    }

    /// Source-over composite a stream of source pixels onto `dest`, 8
    /// pixels per iteration.
    pub(super) fn alpha_composite<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                for (next_dest, next_src) in dest.chunks_exact_mut(32).zip(src) {
                    // Per-pixel source alpha, replicated over each pixel's
                    // four component lanes.
                    let one_minus_alpha = 255 - next_src.splat_4th();
                    let bg_v = u8x32::from_slice(simd, next_dest);
                    let res = alpha_composite_inner(simd, bg_v, next_src, one_minus_alpha);
                    next_dest.copy_from_slice(res.as_slice());
                }
            },
        );
    }

    /// Premultiplied source-over: `src + bg * (1 - src_a)`, in normalized
    /// u8 arithmetic.
    #[inline(always)]
    fn alpha_composite_inner<S: Simd>(
        s: S,
        bg: u8x32<S>,
        src: u8x32<S>,
        one_minus_alpha: u8x32<S>,
    ) -> u8x32<S> {
        s.narrow_u16x32(normalized_mul_u8x32(bg, one_minus_alpha)) + src
    }
}
407
/// Compositing routines for spans with per-pixel coverage values, supplied
/// in groups of 8 (`[u8; 8]`, one group per 8-pixel vector). Counterpart of
/// [`fill`].
mod alpha_fill {
    use crate::fine::Splat4thExt;
    use crate::fine::lowp::compose::ComposeExt;
    use crate::fine::lowp::{extract_masks, mix};
    use crate::peniko::{BlendMode, Mix};
    use vello_common::fearless_simd::*;
    use vello_common::util::{Div255Ext, normalized_mul_u8x32};

    /// Blend `src` into `dest` with an arbitrary blend mode, modulated by
    /// per-pixel coverage from `alphas`.
    pub(super) fn blend<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
        blend_mode: BlendMode,
        alphas: impl Iterator<Item = [u8; 8]>,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                // `Mix::Normal` needs no mixing stage; skip the f32
                // round-trip in `mix` for that common case.
                let default_mix = matches!(blend_mode.mix, Mix::Normal);

                for ((next_bg, next_mask), next_src) in
                    dest.chunks_exact_mut(32).zip(alphas).zip(src)
                {
                    let bg_v = u8x32::from_slice(simd, next_bg);
                    let src_c = if default_mix {
                        next_src
                    } else {
                        mix(next_src, bg_v, blend_mode)
                    };
                    // Expand the 8 coverage bytes across all 32 lanes and
                    // let `compose` apply them.
                    let masks = extract_masks(simd, &next_mask);
                    let res = blend_mode.compose(simd, src_c, bg_v, Some(masks));

                    next_bg.copy_from_slice(res.as_slice());
                }
            },
        );
    }

    /// Source-over composite a solid color with per-pixel coverage, 8
    /// pixels (32 bytes) per iteration.
    #[inline(always)]
    pub(super) fn alpha_composite_solid<S: Simd>(
        s: S,
        dest: &mut [u8],
        src: [u8; 4],
        alphas: impl Iterator<Item = [u8; 8]>,
    ) {
        s.vectorize(
            #[inline(always)]
            || {
                let src_a = u8x32::splat(s, src[3]);
                // The solid color replicated across 8 pixels (32 bytes).
                let src_c = u32x8::splat(s, u32::from_ne_bytes(src)).to_bytes();
                let one = u8x32::splat(s, 255);

                for (next_bg, next_mask) in dest.chunks_exact_mut(32).zip(alphas) {
                    alpha_composite_inner(s, next_bg, &next_mask, src_c, src_a, one);
                }
            },
        );
    }

    /// Source-over composite a stream of source pixels with per-pixel
    /// coverage, 8 pixels per iteration.
    #[inline(always)]
    pub(super) fn alpha_composite<S: Simd, T: Iterator<Item = u8x32<S>>>(
        simd: S,
        dest: &mut [u8],
        src: T,
        alphas: impl Iterator<Item = [u8; 8]>,
    ) {
        simd.vectorize(
            #[inline(always)]
            || {
                let one = u8x32::splat(simd, 255);

                for ((next_dest, next_mask), next_src) in
                    dest.chunks_exact_mut(32).zip(alphas).zip(src)
                {
                    // Per-pixel source alpha, replicated over each pixel's
                    // four component lanes.
                    let src_a = next_src.splat_4th();
                    alpha_composite_inner(simd, next_dest, &next_mask, next_src, src_a, one);
                }
            },
        );
    }

    /// Masked premultiplied source-over for one 32-byte chunk:
    /// `dest = (bg * (255 - src_a * mask) + src * mask) / 255`.
    #[inline(always)]
    fn alpha_composite_inner<S: Simd>(
        s: S,
        dest: &mut [u8],
        masks: &[u8; 8],
        src_c: u8x32<S>,
        src_a: u8x32<S>,
        one: u8x32<S>,
    ) {
        s.vectorize(
            #[inline(always)]
            || {
                let bg_v = u8x32::from_slice(s, dest);

                let mask_v = extract_masks(s, masks);
                // Effective inverse alpha: 255 - src_a * mask (normalized).
                let inv_src_a_mask_a = one - s.narrow_u16x32(normalized_mul_u8x32(src_a, mask_v));

                // Widen to u16 so the products can't overflow, then divide
                // by 255 and narrow back.
                let p1 = s.widen_u8x32(bg_v) * s.widen_u8x32(inv_src_a_mask_a);
                let p2 = s.widen_u8x32(src_c) * s.widen_u8x32(mask_v);
                let res = s.narrow_u16x32((p1 + p2).div_255());

                dest.copy_from_slice(res.as_slice());
            },
        );
    }
}
531
/// Apply the mixing stage of `blend_mode` to premultiplied u8 colors by
/// round-tripping through f32 and delegating to the high-precision
/// `highp::blend::mix` implementation.
fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMode) -> u8x32<S> {
    // Split a 32-lane u8 vector into two 16-lane halves and normalize each
    // lane to [0, 1].
    let to_f32 = |val: u8x32<S>| {
        let (a, b) = src_c.simd.split_u8x32(val);
        let mut a = u8_to_f32(a);
        let mut b = u8_to_f32(b);
        a *= f32x16::splat(src_c.simd, 1.0 / 255.0);
        b *= f32x16::splat(src_c.simd, 1.0 / 255.0);
        (a, b)
    };

    // Scale back to [0, 255]; the `+ 0.5` in the madd rounds to nearest
    // before the conversion to u8.
    let to_u8 = |val1: f32x16<S>, val2: f32x16<S>| {
        let val1 =
            f32_to_u8(f32x16::splat(val1.simd, 255.0).madd(val1, f32x16::splat(val1.simd, 0.5)));
        let val2 =
            f32_to_u8(f32x16::splat(val2.simd, 255.0).madd(val2, f32x16::splat(val2.simd, 0.5)));

        val1.simd.combine_u8x16(val1, val2)
    };

    let (mut src_1, mut src_2) = to_f32(src_c);
    let (bg_1, bg_2) = to_f32(bg_c);

    // Mix each half in high precision.
    src_1 = highp::blend::mix(src_1, bg_1, blend_mode);
    src_2 = highp::blend::mix(src_2, bg_2, blend_mode);

    to_u8(src_1, src_2)
}
563
/// Expand 8 per-pixel coverage bytes into a 32-lane vector where each mask
/// byte is replicated across the 4 component lanes of its pixel
/// (`m0 m0 m0 m0 m1 m1 m1 m1 ...`).
#[inline(always)]
fn extract_masks<S: Simd>(simd: S, masks: &[u8; 8]) -> u8x32<S> {
    // Broadcast each 4-byte mask group across a 16-byte vector.
    let m1 = u32x4::splat(simd, u32::from_ne_bytes(masks[0..4].try_into().unwrap())).to_bytes();
    let m2 = u32x4::splat(simd, u32::from_ne_bytes(masks[4..8].try_into().unwrap())).to_bytes();

    // Zipping a vector with itself twice quadruplicates each byte in place.
    let zipped1 = m1.zip_low(m1);
    let zipped1 = zipped1.zip_low(zipped1);

    let zipped2 = m2.zip_low(m2);
    let zipped2 = zipped2.zip_low(zipped2);

    simd.combine_u8x16(zipped1, zipped2)
}
582
583#[inline(always)]
588fn pack(region: &mut Region<'_>, blend_buf: &[u8]) {
589 for y in 0..Tile::HEIGHT {
590 for (x, pixel) in region
591 .row_mut(y)
592 .chunks_exact_mut(COLOR_COMPONENTS)
593 .enumerate()
594 {
595 let idx = COLOR_COMPONENTS * (usize::from(Tile::HEIGHT) * x + usize::from(y));
596 pixel.copy_from_slice(&blend_buf[idx..][..COLOR_COMPONENTS]);
597 }
598 }
599}
600
601#[inline(always)]
606fn unpack(region: &mut Region<'_>, blend_buf: &mut [u8]) {
607 for y in 0..Tile::HEIGHT {
608 for (x, pixel) in region.row_mut(y).chunks_exact(COLOR_COMPONENTS).enumerate() {
609 let idx = COLOR_COMPONENTS * (usize::from(Tile::HEIGHT) * x + usize::from(y));
610 blend_buf[idx..][..COLOR_COMPONENTS].copy_from_slice(pixel);
611 }
612 }
613}
614
/// SIMD fast path of [`pack`] for a full `WideTile::WIDTH` x `Tile::HEIGHT`
/// region: de-interleaves the column-major blend buffer into the region's
/// four row slices, 64 bytes (4 columns x 4 rows) per iteration.
#[inline(always)]
fn pack_block<S: Simd>(simd: S, region: &mut Region<'_>, mut buf: &[u8]) {
    // Pin the slice length up front so the chunked accesses below are
    // provably in bounds.
    buf = &buf[..SCRATCH_BUF_SIZE];

    const CHUNK_LENGTH: usize = 64;
    const SLICE_WIDTH: usize = WideTile::WIDTH as usize * COLOR_COMPONENTS;

    // Fix each row's length at the type level (`&mut [u8; SLICE_WIDTH]`)
    // for bounds-check-free indexed stores.
    let region_areas = region.areas();
    let [s1, s2, s3, s4] = region_areas;
    let dest_slices: &mut [&mut [u8; SLICE_WIDTH]; 4] = &mut [
        (*s1).try_into().unwrap(),
        (*s2).try_into().unwrap(),
        (*s3).try_into().unwrap(),
        (*s4).try_into().unwrap(),
    ];

    for (idx, col) in buf.chunks_exact(CHUNK_LENGTH).enumerate() {
        // Each 64-byte chunk contributes 16 bytes (4 pixels) to each of the
        // 4 rows.
        let dest_idx = idx * CHUNK_LENGTH / 4;

        let casted: &[u32; 16] = cast_slice::<u8, u32>(col).try_into().unwrap();

        // Interleaved load gathers each row's 4 pixels into one contiguous
        // 16-byte group of the result.
        let loaded = simd.load_interleaved_128_u32x16(casted).to_bytes();
        dest_slices[0][dest_idx..][..16].copy_from_slice(&loaded.as_slice()[..16]);
        dest_slices[1][dest_idx..][..16].copy_from_slice(&loaded.as_slice()[16..32]);
        dest_slices[2][dest_idx..][..16].copy_from_slice(&loaded.as_slice()[32..48]);
        dest_slices[3][dest_idx..][..16].copy_from_slice(&loaded.as_slice()[48..64]);
    }
}