1use crate::PremultipliedColorU8;
32
33use crate::pixmap::SubPixmapMut;
34use crate::wide::{f32x8, u16x16, f32x16};
35use crate::geom::ScreenIntRect;
36
/// Number of pixels processed by one full pipeline pass (lanes per register).
pub const STAGE_WIDTH: usize = 16;

/// A single pipeline stage. Each stage transforms the registers in
/// `Pipeline` and usually chains into the next stage via `Pipeline::next_stage`.
pub type StageFn = fn(p: &mut Pipeline);
40
/// Execution state of the lowp raster pipeline.
///
/// `r`/`g`/`b`/`a` hold the current (source) color and `dr`/`dg`/`db`/`da`
/// the destination color, one u16 lane per pixel.
pub struct Pipeline<'a, 'b: 'a> {
    index: usize,                     // index of the next stage to execute
    functions: &'a [StageFn],         // stage list currently being run
    pixmap: &'a mut SubPixmapMut<'b>, // destination pixels
    mask_ctx: super::MaskCtx<'a>,     // clip mask
    aa_mask_ctx: super::AAMaskCtx,    // anti-aliasing mask
    ctx: &'a mut super::Context,
    r: u16x16,
    g: u16x16,
    b: u16x16,
    a: u16x16,
    dr: u16x16,
    dg: u16x16,
    db: u16x16,
    da: u16x16,
    tail: usize, // number of valid lanes; STAGE_WIDTH except for the final partial batch
    dx: usize,   // x of the first pixel in the current batch
    dy: usize,   // y (row) of the current batch
}
60
impl Pipeline<'_, '_> {
    /// Fetches the stage at `index`, advances the cursor, then calls the
    /// stage; stages chain by calling `next_stage` themselves.
    #[inline(always)]
    fn next_stage(&mut self) {
        let next: fn(&mut Self) = self.functions[self.index];
        self.index += 1;
        next(self);
    }
}
69
70
/// Stage lookup table, indexed by the stage id declared in the parent module.
///
/// The order and the entry count (`super::STAGES_COUNT`) form a contract with
/// that id enum and must not change. `null_fn` fills slots whose stage has no
/// lowp implementation here (presumably handled by another pipeline — confirm
/// against the parent module).
pub const STAGES: &[StageFn; super::STAGES_COUNT] = &[
    move_source_to_destination,
    move_destination_to_source,
    null_fn,
    null_fn,
    premultiply,
    uniform_color,
    seed_shader,
    load_dst,
    store,
    load_dst_u8,
    store_u8,
    null_fn,
    load_mask_u8,
    mask_u8,
    scale_u8,
    lerp_u8,
    scale_1_float,
    lerp_1_float,
    destination_atop,
    destination_in,
    destination_out,
    destination_over,
    source_atop,
    source_in,
    source_out,
    source_over,
    clear,
    modulate,
    multiply,
    plus,
    screen,
    xor,
    null_fn,
    null_fn,
    darken,
    difference,
    exclusion,
    hard_light,
    lighten,
    overlay,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    source_over_rgba,
    transform,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    pad_x1,
    reflect_x1,
    repeat_x1,
    gradient,
    evenly_spaced_2_stop_gradient,
    null_fn,
    xy_to_radius,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
];
155
/// Type-erases a stage function into a raw pointer for identity comparison.
pub fn fn_ptr(f: StageFn) -> *const () {
    f as *const ()
}
159
/// Returns `true` when both stage functions point at the same code address.
pub fn fn_ptr_eq(f1: StageFn, f2: StageFn) -> bool {
    core::ptr::eq(f1 as *const (), f2 as *const ())
}
163
/// Runs the pipeline over every pixel of `rect`.
///
/// Each row is processed left-to-right in batches of `STAGE_WIDTH` pixels
/// using `functions`; a final partial batch (fewer than `STAGE_WIDTH`
/// pixels) uses the `functions_tail` stage variants instead.
#[inline(never)]
pub fn start(
    functions: &[StageFn],
    functions_tail: &[StageFn],
    rect: &ScreenIntRect,
    aa_mask_ctx: super::AAMaskCtx,
    mask_ctx: super::MaskCtx,
    ctx: &mut super::Context,
    pixmap: &mut SubPixmapMut,
) {
    // One Pipeline is reused for all batches; the per-batch fields
    // (index, functions, dx, dy, tail) are reset before every run.
    let mut p = Pipeline {
        index: 0,
        functions: &[], // replaced with `functions`/`functions_tail` below
        pixmap,
        mask_ctx,
        aa_mask_ctx,
        ctx,
        r: u16x16::default(),
        g: u16x16::default(),
        b: u16x16::default(),
        a: u16x16::default(),
        dr: u16x16::default(),
        dg: u16x16::default(),
        db: u16x16::default(),
        da: u16x16::default(),
        tail: 0,
        dx: 0,
        dy: 0,
    };

    for y in rect.y()..rect.bottom() {
        let mut x = rect.x() as usize;
        let end = rect.right() as usize;

        // Full-width batches.
        p.functions = functions;
        while x + STAGE_WIDTH <= end {
            p.index = 0;
            p.dx = x;
            p.dy = y as usize;
            p.tail = STAGE_WIDTH;
            p.next_stage();
            x += STAGE_WIDTH;
        }

        // Remaining 1..STAGE_WIDTH pixels, if any.
        if x != end {
            p.index = 0;
            p.functions = functions_tail;
            p.dx = x;
            p.dy = y as usize;
            p.tail = end - x;
            p.next_stage();
        }
    }
}
218
219fn move_source_to_destination(p: &mut Pipeline) {
220 p.dr = p.r;
221 p.dg = p.g;
222 p.db = p.b;
223 p.da = p.a;
224
225 p.next_stage();
226}
227
228fn move_destination_to_source(p: &mut Pipeline) {
229 p.r = p.dr;
230 p.g = p.dg;
231 p.b = p.db;
232 p.a = p.da;
233
234 p.next_stage();
235}
236
237fn premultiply(p: &mut Pipeline) {
238 p.r = div255(p.r * p.a);
239 p.g = div255(p.g * p.a);
240 p.b = div255(p.b * p.a);
241
242 p.next_stage();
243}
244
245fn uniform_color(p: &mut Pipeline) {
246 let ctx = p.ctx.uniform_color;
247 p.r = u16x16::splat(ctx.rgba[0]);
248 p.g = u16x16::splat(ctx.rgba[1]);
249 p.b = u16x16::splat(ctx.rgba[2]);
250 p.a = u16x16::splat(ctx.rgba[3]);
251
252 p.next_stage();
253}
254
255fn seed_shader(p: &mut Pipeline) {
256 let iota = f32x16(
257 f32x8::from([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]),
258 f32x8::from([8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5]),
259 );
260
261 let x = f32x16::splat(p.dx as f32) + iota;
262 let y = f32x16::splat(p.dy as f32 + 0.5);
263 split(&x, &mut p.r, &mut p.g);
264 split(&y, &mut p.b, &mut p.a);
265
266 p.next_stage();
267}
268
/// Loads a full batch of destination pixels into dr/dg/db/da.
pub fn load_dst(p: &mut Pipeline) {
    load_8888(p.pixmap.slice16_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}
273
/// Tail variant of `load_dst`: loads only `tail` destination pixels.
pub fn load_dst_tail(p: &mut Pipeline) {
    load_8888_tail(p.tail, p.pixmap.slice_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}
278
/// Stores a full batch of source pixels into the pixmap.
pub fn store(p: &mut Pipeline) {
    store_8888(&p.r, &p.g, &p.b, &p.a, p.pixmap.slice16_at_xy(p.dx, p.dy));
    p.next_stage();
}
283
/// Tail variant of `store`: writes only `tail` pixels.
pub fn store_tail(p: &mut Pipeline) {
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, p.pixmap.slice_at_xy(p.dx, p.dy));
    p.next_stage();
}
288
/// Loads a full batch of 8-bit mask bytes into the destination alpha register.
pub fn load_dst_u8(p: &mut Pipeline) {
    load_8(p.pixmap.slice16_mask_at_xy(p.dx, p.dy), &mut p.da);
    p.next_stage();
}
293
294pub fn load_dst_u8_tail(p: &mut Pipeline) {
295 let data = p.pixmap.slice_mask_at_xy(p.dx, p.dy);
298 let mut tmp = [0u8; STAGE_WIDTH];
299 tmp[0..p.tail].copy_from_slice(&data[0..p.tail]);
300 load_8(&tmp, &mut p.da);
301
302 p.next_stage();
303}
304
305pub fn store_u8(p: &mut Pipeline) {
306 let data = p.pixmap.slice16_mask_at_xy(p.dx, p.dy);
307 let a = p.a.as_slice();
308
309 data[ 0] = a[ 0] as u8;
310 data[ 1] = a[ 1] as u8;
311 data[ 2] = a[ 2] as u8;
312 data[ 3] = a[ 3] as u8;
313 data[ 4] = a[ 4] as u8;
314 data[ 5] = a[ 5] as u8;
315 data[ 6] = a[ 6] as u8;
316 data[ 7] = a[ 7] as u8;
317 data[ 8] = a[ 8] as u8;
318 data[ 9] = a[ 9] as u8;
319 data[10] = a[10] as u8;
320 data[11] = a[11] as u8;
321 data[12] = a[12] as u8;
322 data[13] = a[13] as u8;
323 data[14] = a[14] as u8;
324 data[15] = a[15] as u8;
325
326 p.next_stage();
327}
328
/// Tail variant of `store_u8`: writes the first `tail` alpha lanes.
pub fn store_u8_tail(p: &mut Pipeline) {
    let data = p.pixmap.slice_mask_at_xy(p.dx, p.dy);
    let a = p.a.as_slice();

    // NOTE(review): the break-on-`tail` form (instead of `for i in 0..p.tail`)
    // always writes at least one byte; `tail` is expected to be >= 1 here
    // (see `start`, where tail = end - x with x != end) — keep as-is.
    for i in 0..STAGE_WIDTH {
        data[i] = a[i] as u8;

        if i + 1 == p.tail {
            break;
        }
    }

    p.next_stage();
}
346
347fn load_mask_u8(p: &mut Pipeline) {
349 let offset = p.mask_ctx.offset(p.dx, p.dy);
350
351 let mut c = u16x16::default();
352 for i in 0..p.tail {
353 c.0[i] = u16::from(p.mask_ctx.data[offset + i]);
354 }
355
356 p.r = u16x16::splat(0);
357 p.g = u16x16::splat(0);
358 p.b = u16x16::splat(0);
359 p.a = c;
360
361 p.next_stage();
362}
363
/// Multiplies all channels by the clip-mask coverage at the current position.
fn mask_u8(p: &mut Pipeline) {
    let offset = p.mask_ctx.offset(p.dx, p.dy);

    let mut c = u16x16::default();
    for i in 0..p.tail {
        c.0[i] = u16::from(p.mask_ctx.data[offset + i]);
    }

    // Fully transparent mask: nothing will be drawn, so the remaining
    // stages (including the store) are deliberately skipped — note the
    // missing `next_stage` call.
    if c == u16x16::default() {
        return;
    }

    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}
383
/// Scales all channels by the anti-aliasing mask coverage.
fn scale_u8(p: &mut Pipeline) {
    // Only the first two lanes are populated: the AA mask appears to
    // supply at most two coverage bytes per call (see AAMaskCtx::copy_at_xy)
    // — TODO(review): confirm against AAMaskCtx.
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = u16x16([
        u16::from(data[0]),
        u16::from(data[1]),
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ]);

    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}
413
/// Interpolates between destination and source using the anti-aliasing
/// mask coverage as the interpolation weight.
fn lerp_u8(p: &mut Pipeline) {
    // Only the first two lanes are populated — same layout as `scale_u8`;
    // TODO(review): confirm against AAMaskCtx.
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = u16x16([
        u16::from(data[0]),
        u16::from(data[1]),
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ]);

    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}
443
444fn scale_1_float(p: &mut Pipeline) {
445 let c = from_float(p.ctx.current_coverage);
446 p.r = div255(p.r * c);
447 p.g = div255(p.g * c);
448 p.b = div255(p.b * c);
449 p.a = div255(p.a * c);
450
451 p.next_stage();
452}
453
454fn lerp_1_float(p: &mut Pipeline) {
455 let c = from_float(p.ctx.current_coverage);
456 p.r = lerp(p.dr, p.r, c);
457 p.g = lerp(p.dg, p.g, c);
458 p.b = lerp(p.db, p.b, c);
459 p.a = lerp(p.da, p.a, c);
460
461 p.next_stage();
462}
463
/// Generates a blend-mode stage where every channel, including alpha,
/// is produced by the same `$f(src, dst, src_alpha, dst_alpha)` closure.
macro_rules! blend_fn {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = $f(p.a, p.da, p.a, p.da);

            p.next_stage();
        }
    };
}
476
// Porter-Duff blend modes over premultiplied 8-bit channels held in u16
// lanes. In the closures: `s`/`d` are source/destination channels,
// `sa`/`da` their alphas, `inv(x)` is 255 - x, and `div255` rescales the
// fixed-point product back to 0..=255.
blend_fn!(clear, |_, _, _, _| u16x16::splat(0));
blend_fn!(source_atop, |s, d, sa, da| div255(s * da + d * inv(sa)));
blend_fn!(destination_atop, |s, d, sa, da| div255(d * sa + s * inv(da)));
blend_fn!(source_in, |s, _, _, da| div255(s * da));
blend_fn!(destination_in, |_, d, sa, _| div255(d * sa));
blend_fn!(source_out, |s, _, _, da| div255(s * inv(da)));
blend_fn!(destination_out, |_, d, sa, _| div255(d * inv(sa)));
blend_fn!(source_over, |s, d, sa, _| s + div255(d * inv(sa)));
blend_fn!(destination_over, |s, d, _, da| d + div255(s * inv(da)));
blend_fn!(modulate, |s, d, _, _| div255(s * d));
blend_fn!(multiply, |s, d, sa, da| div255(s * inv(da) + d * inv(sa) + s * d));
blend_fn!(screen, |s, d, _, _| s + d - div255(s * d));
blend_fn!(xor, |s, d, sa, da| div255(s * inv(da) + d * inv(sa)));

// `plus` saturates at 255 instead of wrapping.
blend_fn!(plus, |s: u16x16, d, _, _| (s + d).min(&u16x16::splat(255)));
493
494
/// Generates a blend-mode stage where only the RGB channels come from the
/// `$f` closure; alpha always composes as source-over:
/// `a + da * (255 - a) / 255`.
macro_rules! blend_fn2 {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = p.a + div255(p.da * inv(p.a));

            p.next_stage();
        }
    };
}
508
// Separable blend modes: RGB from the closure, alpha via source-over
// (see `blend_fn2`).
blend_fn2!(darken, |s: u16x16, d, sa, da| s + d - div255((s * da).max(&(d * sa))));
blend_fn2!(lighten, |s: u16x16, d, sa, da| s + d - div255((s * da).min(&(d * sa))));
blend_fn2!(exclusion, |s: u16x16, d, _, _| s + d - u16x16::splat(2) * div255(s * d));

blend_fn2!(difference, |s: u16x16, d, sa, da|
    s + d - u16x16::splat(2) * div255((s * da).min(&(d * sa))));

// hard_light picks multiply vs. screen per lane based on the *source*
// channel (2s <= sa); overlay is the same formula keyed on the
// *destination* channel (2d <= da).
blend_fn2!(hard_light, |s: u16x16, d: u16x16, sa, da| {
    div255(s * inv(da) + d * inv(sa)
        + (s+s).cmp_le(&sa).blend(
            u16x16::splat(2) * s * d,
            sa * da - u16x16::splat(2) * (sa-s)*(da-d)
        )
    )
});

blend_fn2!(overlay, |s: u16x16, d: u16x16, sa, da| {
    div255(s * inv(da) + d * inv(sa)
        + (d+d).cmp_le(&da).blend(
            u16x16::splat(2) * s * d,
            sa * da - u16x16::splat(2) * (sa-s)*(da-d)
        )
    )
});
533
/// Fused load + source-over blend + store for a full batch of RGBA pixels.
pub fn source_over_rgba(p: &mut Pipeline) {
    let pixels = p.pixmap.slice16_at_xy(p.dx, p.dy);
    load_8888(pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    // Per channel: src + dst * (255 - src_alpha) / 255.
    p.r = p.r + div255(p.dr * inv(p.a));
    p.g = p.g + div255(p.dg * inv(p.a));
    p.b = p.b + div255(p.db * inv(p.a));
    p.a = p.a + div255(p.da * inv(p.a));
    store_8888(&p.r, &p.g, &p.b, &p.a, pixels);

    p.next_stage();
}
545
/// Tail variant of `source_over_rgba`: processes only `tail` pixels.
pub fn source_over_rgba_tail(p: &mut Pipeline) {
    let pixels = p.pixmap.slice_at_xy(p.dx, p.dy);
    load_8888_tail(p.tail, pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    // Per channel: src + dst * (255 - src_alpha) / 255.
    p.r = p.r + div255(p.dr * inv(p.a));
    p.g = p.g + div255(p.dg * inv(p.a));
    p.b = p.b + div255(p.db * inv(p.a));
    p.a = p.a + div255(p.da * inv(p.a));
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, pixels);

    p.next_stage();
}
557
/// Applies the context's affine transform to the coordinates held in the
/// registers: (r, g) carry the x lanes, (b, a) the y lanes (as f32 bits).
fn transform(p: &mut Pipeline) {
    let ts = &p.ctx.transform;

    let x = join(&p.r, &p.g);
    let y = join(&p.b, &p.a);

    // nx = sx*x + kx*y + tx;  ny = ky*x + sy*y + ty
    let nx = mad(x, f32x16::splat(ts.sx), mad(y, f32x16::splat(ts.kx), f32x16::splat(ts.tx)));
    let ny = mad(x, f32x16::splat(ts.ky), mad(y, f32x16::splat(ts.sy), f32x16::splat(ts.ty)));

    split(&nx, &mut p.r, &mut p.g);
    split(&ny, &mut p.b, &mut p.a);

    p.next_stage();
}
572
/// Pad tiling for the 1D gradient parameter held in (r, g): clamps it via
/// `normalize()` (presumably to the unit range — see f32x16::normalize).
fn pad_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let x = x.normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}
580
/// Reflect tiling: folds the gradient parameter into a triangle wave with
/// period 2 (|(x-1) - 2*floor((x-1)/2) - 1|), so the gradient mirrors back
/// and forth, then clamps via `normalize()`.
fn reflect_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let two = |x| x + x;
    let x = (
        (x - f32x16::splat(1.0))
        - two(((x - f32x16::splat(1.0)) * f32x16::splat(0.5)).floor())
        - f32x16::splat(1.0)
    ).abs().normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}
593
/// Repeat tiling: keeps only the fractional part of the gradient parameter
/// (x - floor(x)), then clamps via `normalize()`.
fn repeat_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let x = (x - x.floor()).normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}
601
602fn gradient(p: &mut Pipeline) {
603 let ctx = &p.ctx.gradient;
604
605 let t = join(&p.r, &p.g);
607 let mut idx = u16x16::splat(0);
608 for i in 1..ctx.len {
609 let tt = ctx.t_values[i].get();
610 let t0: [f32; 8] = t.0.into();
611 let t1: [f32; 8] = t.1.into();
612 idx.0[ 0] += (t0[0] >= tt) as u16;
613 idx.0[ 1] += (t0[1] >= tt) as u16;
614 idx.0[ 2] += (t0[2] >= tt) as u16;
615 idx.0[ 3] += (t0[3] >= tt) as u16;
616 idx.0[ 4] += (t0[4] >= tt) as u16;
617 idx.0[ 5] += (t0[5] >= tt) as u16;
618 idx.0[ 6] += (t0[6] >= tt) as u16;
619 idx.0[ 7] += (t0[7] >= tt) as u16;
620 idx.0[ 8] += (t1[0] >= tt) as u16;
621 idx.0[ 9] += (t1[1] >= tt) as u16;
622 idx.0[10] += (t1[2] >= tt) as u16;
623 idx.0[11] += (t1[3] >= tt) as u16;
624 idx.0[12] += (t1[4] >= tt) as u16;
625 idx.0[13] += (t1[5] >= tt) as u16;
626 idx.0[14] += (t1[6] >= tt) as u16;
627 idx.0[15] += (t1[7] >= tt) as u16;
628 }
629 gradient_lookup(ctx, &idx, t, &mut p.r, &mut p.g, &mut p.b, &mut p.a);
630
631 p.next_stage();
632}
633
/// Optimized two-stop gradient: each channel is `t * factor + bias`, a
/// straight-line interpolation between the two stop colors.
fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) {
    let ctx = &p.ctx.evenly_spaced_2_stop_gradient;

    let t = join(&p.r, &p.g);
    round_f32_to_u16(
        mad(t, f32x16::splat(ctx.factor.r), f32x16::splat(ctx.bias.r)),
        mad(t, f32x16::splat(ctx.factor.g), f32x16::splat(ctx.bias.g)),
        mad(t, f32x16::splat(ctx.factor.b), f32x16::splat(ctx.bias.b)),
        mad(t, f32x16::splat(ctx.factor.a), f32x16::splat(ctx.bias.a)),
        &mut p.r, &mut p.g, &mut p.b, &mut p.a,
    );

    p.next_stage();
}
648
/// Replaces the x lanes with the distance from the origin,
/// sqrt(x^2 + y^2); the y lanes are written back unchanged.
fn xy_to_radius(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let y = join(&p.b, &p.a);
    let x = (x*x + y*y).sqrt();
    split(&x, &mut p.r, &mut p.g);
    split(&y, &mut p.b, &mut p.a);

    p.next_stage();
}
658
/// Evaluates the gradient for each lane: given the per-lane segment index
/// `idx` and parameter `t`, computes `channel = t * factor + bias` using
/// that segment's factors/biases and writes the rounded result to r/g/b/a.
fn gradient_lookup(
    ctx: &super::GradientCtx, idx: &u16x16, t: f32x16,
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    // Gathers one f32 per lane from `$d`, selecting row `idx` and field `$c`.
    macro_rules! gather {
        ($d:expr, $c:ident) => {
            f32x16(
                f32x8::from([
                    $d[idx.0[ 0] as usize].$c,
                    $d[idx.0[ 1] as usize].$c,
                    $d[idx.0[ 2] as usize].$c,
                    $d[idx.0[ 3] as usize].$c,
                    $d[idx.0[ 4] as usize].$c,
                    $d[idx.0[ 5] as usize].$c,
                    $d[idx.0[ 6] as usize].$c,
                    $d[idx.0[ 7] as usize].$c,
                ]),
                f32x8::from([
                    $d[idx.0[ 8] as usize].$c,
                    $d[idx.0[ 9] as usize].$c,
                    $d[idx.0[10] as usize].$c,
                    $d[idx.0[11] as usize].$c,
                    $d[idx.0[12] as usize].$c,
                    $d[idx.0[13] as usize].$c,
                    $d[idx.0[14] as usize].$c,
                    $d[idx.0[15] as usize].$c,
                ]),
            )
        };
    }

    let fr = gather!(&ctx.factors, r);
    let fg = gather!(&ctx.factors, g);
    let fb = gather!(&ctx.factors, b);
    let fa = gather!(&ctx.factors, a);

    let br = gather!(&ctx.biases, r);
    let bg = gather!(&ctx.biases, g);
    let bb = gather!(&ctx.biases, b);
    let ba = gather!(&ctx.biases, a);

    round_f32_to_u16(
        mad(t, fr, br),
        mad(t, fg, bg),
        mad(t, fb, bb),
        mad(t, fa, ba),
        r, g, b, a,
    );
}
712
/// Converts four f32 channel registers to 0..=255 u16 lanes using
/// round-half-up (scale by 255, add 0.5, truncate in `save_to_u16x16`).
#[inline(always)]
fn round_f32_to_u16(
    rf: f32x16, gf: f32x16, bf: f32x16, af: f32x16,
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    // RGB are clamped via normalize(); alpha deliberately skips the clamp —
    // presumably it is already within range at this point (TODO confirm).
    let rf = rf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let gf = gf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let bf = bf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let af = af * f32x16::splat(255.0) + f32x16::splat(0.5);

    rf.save_to_u16x16(r);
    gf.save_to_u16x16(g);
    bf.save_to_u16x16(b);
    af.save_to_u16x16(a);
}
731
/// Terminal stage: intentionally does not call `next_stage`, which ends
/// pipeline execution for the current batch.
pub fn just_return(_: &mut Pipeline) {
}
735
/// Placeholder for `STAGES` slots without a lowp implementation. Like
/// `just_return`, it does nothing and does not continue the pipeline.
pub fn null_fn(_: &mut Pipeline) {
}
739
740#[inline(always)]
741fn load_8888(
742 data: &[PremultipliedColorU8; STAGE_WIDTH],
743 r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
744) {
745 *r = u16x16([
746 data[ 0].red() as u16, data[ 1].red() as u16, data[ 2].red() as u16, data[ 3].red() as u16,
747 data[ 4].red() as u16, data[ 5].red() as u16, data[ 6].red() as u16, data[ 7].red() as u16,
748 data[ 8].red() as u16, data[ 9].red() as u16, data[10].red() as u16, data[11].red() as u16,
749 data[12].red() as u16, data[13].red() as u16, data[14].red() as u16, data[15].red() as u16,
750 ]);
751
752 *g = u16x16([
753 data[ 0].green() as u16, data[ 1].green() as u16, data[ 2].green() as u16, data[ 3].green() as u16,
754 data[ 4].green() as u16, data[ 5].green() as u16, data[ 6].green() as u16, data[ 7].green() as u16,
755 data[ 8].green() as u16, data[ 9].green() as u16, data[10].green() as u16, data[11].green() as u16,
756 data[12].green() as u16, data[13].green() as u16, data[14].green() as u16, data[15].green() as u16,
757 ]);
758
759 *b = u16x16([
760 data[ 0].blue() as u16, data[ 1].blue() as u16, data[ 2].blue() as u16, data[ 3].blue() as u16,
761 data[ 4].blue() as u16, data[ 5].blue() as u16, data[ 6].blue() as u16, data[ 7].blue() as u16,
762 data[ 8].blue() as u16, data[ 9].blue() as u16, data[10].blue() as u16, data[11].blue() as u16,
763 data[12].blue() as u16, data[13].blue() as u16, data[14].blue() as u16, data[15].blue() as u16,
764 ]);
765
766 *a = u16x16([
767 data[ 0].alpha() as u16, data[ 1].alpha() as u16, data[ 2].alpha() as u16, data[ 3].alpha() as u16,
768 data[ 4].alpha() as u16, data[ 5].alpha() as u16, data[ 6].alpha() as u16, data[ 7].alpha() as u16,
769 data[ 8].alpha() as u16, data[ 9].alpha() as u16, data[10].alpha() as u16, data[11].alpha() as u16,
770 data[12].alpha() as u16, data[13].alpha() as u16, data[14].alpha() as u16, data[15].alpha() as u16,
771 ]);
772}
773
774#[inline(always)]
775fn load_8888_tail(
776 tail: usize, data: &[PremultipliedColorU8],
777 r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
778) {
779 let mut tmp = [PremultipliedColorU8::TRANSPARENT; STAGE_WIDTH];
782 tmp[0..tail].copy_from_slice(&data[0..tail]);
783 load_8888(&tmp, r, g, b, a);
784}
785
786#[inline(always)]
787fn store_8888(
788 r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16,
789 data: &mut [PremultipliedColorU8; STAGE_WIDTH],
790) {
791 let r = r.as_slice();
792 let g = g.as_slice();
793 let b = b.as_slice();
794 let a = a.as_slice();
795
796 data[ 0] = PremultipliedColorU8::from_rgba_unchecked(r[ 0] as u8, g[ 0] as u8, b[ 0] as u8, a[ 0] as u8);
797 data[ 1] = PremultipliedColorU8::from_rgba_unchecked(r[ 1] as u8, g[ 1] as u8, b[ 1] as u8, a[ 1] as u8);
798 data[ 2] = PremultipliedColorU8::from_rgba_unchecked(r[ 2] as u8, g[ 2] as u8, b[ 2] as u8, a[ 2] as u8);
799 data[ 3] = PremultipliedColorU8::from_rgba_unchecked(r[ 3] as u8, g[ 3] as u8, b[ 3] as u8, a[ 3] as u8);
800 data[ 4] = PremultipliedColorU8::from_rgba_unchecked(r[ 4] as u8, g[ 4] as u8, b[ 4] as u8, a[ 4] as u8);
801 data[ 5] = PremultipliedColorU8::from_rgba_unchecked(r[ 5] as u8, g[ 5] as u8, b[ 5] as u8, a[ 5] as u8);
802 data[ 6] = PremultipliedColorU8::from_rgba_unchecked(r[ 6] as u8, g[ 6] as u8, b[ 6] as u8, a[ 6] as u8);
803 data[ 7] = PremultipliedColorU8::from_rgba_unchecked(r[ 7] as u8, g[ 7] as u8, b[ 7] as u8, a[ 7] as u8);
804 data[ 8] = PremultipliedColorU8::from_rgba_unchecked(r[ 8] as u8, g[ 8] as u8, b[ 8] as u8, a[ 8] as u8);
805 data[ 9] = PremultipliedColorU8::from_rgba_unchecked(r[ 9] as u8, g[ 9] as u8, b[ 9] as u8, a[ 9] as u8);
806 data[10] = PremultipliedColorU8::from_rgba_unchecked(r[10] as u8, g[10] as u8, b[10] as u8, a[10] as u8);
807 data[11] = PremultipliedColorU8::from_rgba_unchecked(r[11] as u8, g[11] as u8, b[11] as u8, a[11] as u8);
808 data[12] = PremultipliedColorU8::from_rgba_unchecked(r[12] as u8, g[12] as u8, b[12] as u8, a[12] as u8);
809 data[13] = PremultipliedColorU8::from_rgba_unchecked(r[13] as u8, g[13] as u8, b[13] as u8, a[13] as u8);
810 data[14] = PremultipliedColorU8::from_rgba_unchecked(r[14] as u8, g[14] as u8, b[14] as u8, a[14] as u8);
811 data[15] = PremultipliedColorU8::from_rgba_unchecked(r[15] as u8, g[15] as u8, b[15] as u8, a[15] as u8);
812}
813
/// Stores the first `tail` lanes back as RGBA pixels.
#[inline(always)]
fn store_8888_tail(
    r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16,
    tail: usize, data: &mut [PremultipliedColorU8],
) {
    let r = r.as_slice();
    let g = g.as_slice();
    let b = b.as_slice();
    let a = a.as_slice();

    // NOTE(review): the break-on-`tail` form always writes at least one
    // pixel; `tail` is expected to be in 1..=STAGE_WIDTH here — keep as-is.
    for i in 0..STAGE_WIDTH {
        data[i] = PremultipliedColorU8::from_rgba_unchecked(
            r[i] as u8, g[i] as u8, b[i] as u8, a[i] as u8,
        );

        if i + 1 == tail {
            break;
        }
    }
}
837
838#[inline(always)]
839fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) {
840 *a = u16x16([
841 data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16,
842 data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16,
843 data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16,
844 data[12] as u16, data[13] as u16, data[14] as u16, data[15] as u16,
845 ]);
846}
847
/// Fast approximation of `v / 255` as `(v + 255) >> 8`, applied per lane.
/// For v in 0..=255*255 the result differs from exact division by at most 1.
#[inline(always)]
fn div255(v: u16x16) -> u16x16 {
    (v + u16x16::splat(255)) >> u16x16::splat(8) }
855
/// Per-lane complement in 8-bit fixed point: `255 - v`.
#[inline(always)]
fn inv(v: u16x16) -> u16x16 {
    u16x16::splat(255) - v
}
860
/// Converts a scalar coverage (expected in 0.0..=1.0 — values outside that
/// range will misbehave after the cast) into a splatted 0..=255 value,
/// rounding half up.
#[inline(always)]
fn from_float(f: f32) -> u16x16 {
    u16x16::splat((f * 255.0 + 0.5) as u16)
}
865
/// Per-lane linear interpolation with weight `t` in 0..=255:
/// `(from * (255 - t) + to * t) / 255` (division approximated by `div255`).
#[inline(always)]
fn lerp(from: u16x16, to: u16x16, t: u16x16) -> u16x16 {
    div255(from * inv(t) + to * t)
}
870
/// Bit-casts 16 f32 lanes into two u16x16 registers. This is a raw byte
/// split (first 8 floats into `lo`, last 8 into `hi`), not a numeric
/// conversion; `join` is the inverse.
#[inline(always)]
fn split(v: &f32x16, lo: &mut u16x16, hi: &mut u16x16) {
    let data: [u8; 64] = bytemuck::cast(*v);
    let d0: &mut [u8; 32] = bytemuck::cast_mut(&mut lo.0);
    let d1: &mut [u8; 32] = bytemuck::cast_mut(&mut hi.0);

    d0.copy_from_slice(&data[0..32]);
    d1.copy_from_slice(&data[32..64]);
}
881
/// Bit-casts two u16x16 registers back into 16 f32 lanes; the byte-level
/// inverse of `split` (again, no numeric conversion takes place).
#[inline(always)]
fn join(lo: &u16x16, hi: &u16x16) -> f32x16 {
    let d0: [u8; 32] = bytemuck::cast(lo.0);
    let d1: [u8; 32] = bytemuck::cast(hi.0);

    let mut v = f32x16::default();
    let data: &mut [u8; 64] = bytemuck::cast_mut(&mut v);

    data[0..32].copy_from_slice(&d0);
    data[32..64].copy_from_slice(&d1);

    v
}
897
/// Multiply-add: `f * m + a` per lane (written as separate multiply and
/// add; not guaranteed to compile to a fused FMA instruction).
#[inline(always)]
fn mad(f: f32x16, m: f32x16, a: f32x16) -> f32x16 {
    f * m + a
}