use crate::PremultipliedColorU8;

use crate::pixmap::SubPixmapMut;
use crate::wide::{f32x8, u16x16, f32x16};
use crate::geom::ScreenIntRect;

pub const STAGE_WIDTH: usize = 16;

pub type StageFn = fn(p: &mut Pipeline);
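// The per-chunk state of the raster pipeline: the list of stage functions to run,
// the destination pixmap and mask contexts, and the current chunk's color registers.
// `r`/`g`/`b`/`a` hold the source color and `dr`/`dg`/`db`/`da` the destination color,
// one u16 lane per pixel. `dx`/`dy` is the chunk's starting position and `tail` is the
// number of valid pixels in it (up to STAGE_WIDTH).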
pub struct Pipeline<'a, 'b: 'a> {
    index: usize,
    functions: &'a [StageFn],
    pixmap: &'a mut SubPixmapMut<'b>,
    mask_ctx: super::MaskCtx<'a>,
    aa_mask_ctx: super::AAMaskCtx,
    ctx: &'a mut super::Context,
    r: u16x16,
    g: u16x16,
    b: u16x16,
    a: u16x16,
    dr: u16x16,
    dg: u16x16,
    db: u16x16,
    da: u16x16,
    tail: usize,
    dx: usize,
    dy: usize,
}
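// Each stage calls `next_stage` when it is done, so the whole pipeline runs as a chain
// of calls through the `functions` table until a stage (such as `just_return` or
// `null_fn`) does not continue.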
impl Pipeline<'_, '_> {
    #[inline(always)]
    fn next_stage(&mut self) {
        let next: fn(&mut Self) = self.functions[self.index];
        self.index += 1;
        next(self);
    }
}
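// Maps every pipeline stage (one entry per stage, `super::STAGES_COUNT` in total) to its
// implementation in this module. Stages that are not supported here map to `null_fn`.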
pub const STAGES: &[StageFn; super::STAGES_COUNT] = &[
    move_source_to_destination,
    move_destination_to_source,
    null_fn,
    null_fn,
    premultiply,
    uniform_color,
    seed_shader,
    load_dst,
    store,
    load_dst_u8,
    store_u8,
    null_fn,
    load_mask_u8,
    mask_u8,
    scale_u8,
    lerp_u8,
    scale_1_float,
    lerp_1_float,
    destination_atop,
    destination_in,
    destination_out,
    destination_over,
    source_atop,
    source_in,
    source_out,
    source_over,
    clear,
    modulate,
    multiply,
    plus,
    screen,
    xor,
    null_fn,
    null_fn,
    darken,
    difference,
    exclusion,
    hard_light,
    lighten,
    overlay,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    source_over_rgba,
    transform,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    pad_x1,
    reflect_x1,
    repeat_x1,
    gradient,
    evenly_spaced_2_stop_gradient,
    xy_to_radius,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
    null_fn,
];
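// Helpers for comparing stage functions by their addresses.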
pub fn fn_ptr(f: StageFn) -> *const () {
    f as *const ()
}

pub fn fn_ptr_eq(f1: StageFn, f2: StageFn) -> bool {
    core::ptr::eq(f1 as *const (), f2 as *const ())
}
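// Runs the pipeline over `rect`, row by row. Full chunks of STAGE_WIDTH pixels are
// processed with `functions`; the remaining (< STAGE_WIDTH) pixels of a row are
// processed with `functions_tail`, presumably built from the `*_tail` variants of the
// load/store stages.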
#[inline(never)]
pub fn start(
    functions: &[StageFn],
    functions_tail: &[StageFn],
    rect: &ScreenIntRect,
    aa_mask_ctx: super::AAMaskCtx,
    mask_ctx: super::MaskCtx,
    ctx: &mut super::Context,
    pixmap: &mut SubPixmapMut,
) {
    let mut p = Pipeline {
        index: 0,
        functions: &[],
        pixmap,
        mask_ctx,
        aa_mask_ctx,
        ctx,
        r: u16x16::default(),
        g: u16x16::default(),
        b: u16x16::default(),
        a: u16x16::default(),
        dr: u16x16::default(),
        dg: u16x16::default(),
        db: u16x16::default(),
        da: u16x16::default(),
        tail: 0,
        dx: 0,
        dy: 0,
    };

    for y in rect.y()..rect.bottom() {
        let mut x = rect.x() as usize;
        let end = rect.right() as usize;

        p.functions = functions;
        while x + STAGE_WIDTH <= end {
            p.index = 0;
            p.dx = x;
            p.dy = y as usize;
            p.tail = STAGE_WIDTH;
            p.next_stage();
            x += STAGE_WIDTH;
        }

        if x != end {
            p.index = 0;
            p.functions = functions_tail;
            p.dx = x;
            p.dy = y as usize;
            p.tail = end - x;
            p.next_stage();
        }
    }
}
fn move_source_to_destination(p: &mut Pipeline) {
    p.dr = p.r;
    p.dg = p.g;
    p.db = p.b;
    p.da = p.a;

    p.next_stage();
}

fn move_destination_to_source(p: &mut Pipeline) {
    p.r = p.dr;
    p.g = p.dg;
    p.b = p.db;
    p.a = p.da;

    p.next_stage();
}

fn premultiply(p: &mut Pipeline) {
    p.r = div255(p.r * p.a);
    p.g = div255(p.g * p.a);
    p.b = div255(p.b * p.a);

    p.next_stage();
}

fn uniform_color(p: &mut Pipeline) {
    let ctx = p.ctx.uniform_color;
    p.r = u16x16::splat(ctx.rgba[0]);
    p.g = u16x16::splat(ctx.rgba[1]);
    p.b = u16x16::splat(ctx.rgba[2]);
    p.a = u16x16::splat(ctx.rgba[3]);

    p.next_stage();
}
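// Seeds the shader stages with pixel-center coordinates for the current chunk:
// x = dx + 0.5, dx + 1.5, ..., dx + 15.5 and y = dy + 0.5. The f32 values are stored
// bit-for-bit in the r/g and b/a registers via `split`.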
fn seed_shader(p: &mut Pipeline) {
    let iota = f32x16(
        f32x8::from([0.5,  1.5,  2.5,  3.5,  4.5,  5.5,  6.5,  7.5]),
        f32x8::from([8.5,  9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5]),
    );

    let x = f32x16::splat(p.dx as f32) + iota;
    let y = f32x16::splat(p.dy as f32 + 0.5);
    split(&x, &mut p.r, &mut p.g);
    split(&y, &mut p.b, &mut p.a);

    p.next_stage();
}
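// Load/store stages. The plain variants read or write a full STAGE_WIDTH chunk;
// the `*_tail` variants handle the last, partially filled chunk of a row.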
pub fn load_dst(p: &mut Pipeline) {
    load_8888(p.pixmap.slice16_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}

pub fn load_dst_tail(p: &mut Pipeline) {
    load_8888_tail(p.tail, p.pixmap.slice_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}

pub fn store(p: &mut Pipeline) {
    store_8888(&p.r, &p.g, &p.b, &p.a, p.pixmap.slice16_at_xy(p.dx, p.dy));
    p.next_stage();
}

pub fn store_tail(p: &mut Pipeline) {
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, p.pixmap.slice_at_xy(p.dx, p.dy));
    p.next_stage();
}

pub fn load_dst_u8(p: &mut Pipeline) {
    load_8(p.pixmap.slice16_mask_at_xy(p.dx, p.dy), &mut p.da);
    p.next_stage();
}

pub fn load_dst_u8_tail(p: &mut Pipeline) {
    let data = p.pixmap.slice_mask_at_xy(p.dx, p.dy);

    // Copy the `tail` valid bytes into a zero-padded buffer so that the
    // full-width `load_8` can be reused.
    let mut tmp = [0u8; STAGE_WIDTH];
    tmp[0..p.tail].copy_from_slice(&data[0..p.tail]);
    load_8(&tmp, &mut p.da);

    p.next_stage();
}
pub fn store_u8(p: &mut Pipeline) {
    let data = p.pixmap.slice16_mask_at_xy(p.dx, p.dy);
    let a = p.a.as_slice();

    data[ 0] = a[ 0] as u8;
    data[ 1] = a[ 1] as u8;
    data[ 2] = a[ 2] as u8;
    data[ 3] = a[ 3] as u8;
    data[ 4] = a[ 4] as u8;
    data[ 5] = a[ 5] as u8;
    data[ 6] = a[ 6] as u8;
    data[ 7] = a[ 7] as u8;
    data[ 8] = a[ 8] as u8;
    data[ 9] = a[ 9] as u8;
    data[10] = a[10] as u8;
    data[11] = a[11] as u8;
    data[12] = a[12] as u8;
    data[13] = a[13] as u8;
    data[14] = a[14] as u8;
    data[15] = a[15] as u8;

    p.next_stage();
}

pub fn store_u8_tail(p: &mut Pipeline) {
    let data = p.pixmap.slice_mask_at_xy(p.dx, p.dy);
    let a = p.a.as_slice();

    // Iterate over the whole chunk, but stop once `tail` pixels were written.
    for i in 0..STAGE_WIDTH {
        data[i] = a[i] as u8;

        if i + 1 == p.tail {
            break;
        }
    }

    p.next_stage();
}
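// Mask stages: `load_mask_u8`/`mask_u8` read coverage from `mask_ctx`, while
// `scale_u8`/`lerp_u8` read anti-aliasing coverage from `aa_mask_ctx` and apply it to
// the source color.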
fn load_mask_u8(p: &mut Pipeline) {
    let offset = p.mask_ctx.offset(p.dx, p.dy);

    let mut c = u16x16::default();
    for i in 0..p.tail {
        c.0[i] = u16::from(p.mask_ctx.data[offset + i]);
    }

    p.r = u16x16::splat(0);
    p.g = u16x16::splat(0);
    p.b = u16x16::splat(0);
    p.a = c;

    p.next_stage();
}

fn mask_u8(p: &mut Pipeline) {
    let offset = p.mask_ctx.offset(p.dx, p.dy);

    let mut c = u16x16::default();
    for i in 0..p.tail {
        c.0[i] = u16::from(p.mask_ctx.data[offset + i]);
    }

    // Fully transparent mask: nothing to draw, so the rest of the pipeline is
    // skipped by not calling `next_stage`.
    if c == u16x16::default() {
        return;
    }

    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}
fn scale_u8(p: &mut Pipeline) {
    // Only the first two lanes are filled from the AA mask; the rest stay zero.
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = u16x16([
        u16::from(data[0]), u16::from(data[1]),
        0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0,
    ]);

    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}

fn lerp_u8(p: &mut Pipeline) {
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = u16x16([
        u16::from(data[0]), u16::from(data[1]),
        0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0,
    ]);

    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}
fn scale_1_float(p: &mut Pipeline) {
    let c = from_float(p.ctx.current_coverage);
    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}

fn lerp_1_float(p: &mut Pipeline) {
    let c = from_float(p.ctx.current_coverage);
    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}
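// Generates a blend-mode stage. `$f` receives (source, destination, source_alpha,
// destination_alpha) per channel and works in 8-bit fixed point: values are in 0..=255,
// `inv(x)` is `255 - x`, and `div255` rescales a product of two such values back into
// range. The same formula is applied to the alpha channel.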
macro_rules! blend_fn {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = $f(p.a, p.da, p.a, p.da);

            p.next_stage();
        }
    };
}

blend_fn!(clear,            |_, _,  _,  _| u16x16::splat(0));
blend_fn!(source_atop,      |s, d, sa, da| div255(s * da + d * inv(sa)));
blend_fn!(destination_atop, |s, d, sa, da| div255(d * sa + s * inv(da)));
blend_fn!(source_in,        |s, _,  _, da| div255(s * da));
blend_fn!(destination_in,   |_, d, sa,  _| div255(d * sa));
blend_fn!(source_out,       |s, _,  _, da| div255(s * inv(da)));
blend_fn!(destination_out,  |_, d, sa,  _| div255(d * inv(sa)));
blend_fn!(source_over,      |s, d, sa,  _| s + div255(d * inv(sa)));
blend_fn!(destination_over, |s, d,  _, da| d + div255(s * inv(da)));
blend_fn!(modulate,         |s, d,  _,  _| div255(s * d));
blend_fn!(multiply,         |s, d, sa, da| div255(s * inv(da) + d * inv(sa) + s * d));
blend_fn!(screen,           |s, d,  _,  _| s + d - div255(s * d));
blend_fn!(xor,              |s, d, sa, da| div255(s * inv(da) + d * inv(sa)));

// `plus` has to clamp, since the sum can overflow the 0..=255 range.
blend_fn!(plus, |s: u16x16, d, _, _| (s + d).min(&u16x16::splat(255)));
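// Generates a stage for the remaining separable blend modes: `$f` produces the RGB
// channels, while alpha is always composited with plain source-over
// (`a + da * (255 - a) / 255`).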
macro_rules! blend_fn2 {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = p.a + div255(p.da * inv(p.a));

            p.next_stage();
        }
    };
}

blend_fn2!(darken,      |s: u16x16, d, sa, da| s + d - div255((s * da).max(&(d * sa))));
blend_fn2!(lighten,     |s: u16x16, d, sa, da| s + d - div255((s * da).min(&(d * sa))));
blend_fn2!(exclusion,   |s: u16x16, d,  _,  _| s + d - u16x16::splat(2) * div255(s * d));

blend_fn2!(difference,  |s: u16x16, d, sa, da|
    s + d - u16x16::splat(2) * div255((s * da).min(&(d * sa))));

blend_fn2!(hard_light, |s: u16x16, d: u16x16, sa, da| {
    div255(s * inv(da) + d * inv(sa)
        + (s+s).cmp_le(&sa).blend(
            u16x16::splat(2) * s * d,
            sa * da - u16x16::splat(2) * (sa-s)*(da-d)
        )
    )
});

blend_fn2!(overlay, |s: u16x16, d: u16x16, sa, da| {
    div255(s * inv(da) + d * inv(sa)
        + (d+d).cmp_le(&da).blend(
            u16x16::splat(2) * s * d,
            sa * da - u16x16::splat(2) * (sa-s)*(da-d)
        )
    )
});
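// A fused load + source-over + store stage (plus its tail variant), presumably an
// optimization for the common source-over path.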
pub fn source_over_rgba(p: &mut Pipeline) {
    let pixels = p.pixmap.slice16_at_xy(p.dx, p.dy);
    load_8888(pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.r = p.r + div255(p.dr * inv(p.a));
    p.g = p.g + div255(p.dg * inv(p.a));
    p.b = p.b + div255(p.db * inv(p.a));
    p.a = p.a + div255(p.da * inv(p.a));
    store_8888(&p.r, &p.g, &p.b, &p.a, pixels);

    p.next_stage();
}

pub fn source_over_rgba_tail(p: &mut Pipeline) {
    let pixels = p.pixmap.slice_at_xy(p.dx, p.dy);
    load_8888_tail(p.tail, pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.r = p.r + div255(p.dr * inv(p.a));
    p.g = p.g + div255(p.dg * inv(p.a));
    p.b = p.b + div255(p.db * inv(p.a));
    p.a = p.a + div255(p.da * inv(p.a));
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, pixels);

    p.next_stage();
}
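// Shader stages. Here the r/g and b/a register pairs hold raw f32 bits rather than
// colors: `join` reassembles them into an f32x16 of x (or t) and y values, and `split`
// stores the results back.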
fn transform(p: &mut Pipeline) {
    let ts = &p.ctx.transform;

    let x = join(&p.r, &p.g);
    let y = join(&p.b, &p.a);

    let nx = mad(x, f32x16::splat(ts.sx), mad(y, f32x16::splat(ts.kx), f32x16::splat(ts.tx)));
    let ny = mad(x, f32x16::splat(ts.ky), mad(y, f32x16::splat(ts.sy), f32x16::splat(ts.ty)));

    split(&nx, &mut p.r, &mut p.g);
    split(&ny, &mut p.b, &mut p.a);

    p.next_stage();
}

// Gradient tile modes: pad, reflect and repeat map `t` back into the 0..1 range
// (clamp, mirror and wrap, respectively).
fn pad_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let x = x.normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}

fn reflect_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let two = |x| x + x;
    let x = (
        (x - f32x16::splat(1.0))
        - two(((x - f32x16::splat(1.0)) * f32x16::splat(0.5)).floor())
        - f32x16::splat(1.0)
    ).abs().normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}

fn repeat_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let x = (x - x.floor()).normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}
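// For each lane, counts how many gradient stops have a position <= t; the resulting
// index selects the per-stop factor/bias pair used by `gradient_lookup` to evaluate
// `color = t * factor + bias`.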
fn gradient(p: &mut Pipeline) {
    let ctx = &p.ctx.gradient;

    let t = join(&p.r, &p.g);

    let mut idx = u16x16::splat(0);
    for i in 1..ctx.len {
        let tt = ctx.t_values[i].get();
        let t0: [f32; 8] = t.0.into();
        let t1: [f32; 8] = t.1.into();
        idx.0[ 0] += (t0[0] >= tt) as u16;
        idx.0[ 1] += (t0[1] >= tt) as u16;
        idx.0[ 2] += (t0[2] >= tt) as u16;
        idx.0[ 3] += (t0[3] >= tt) as u16;
        idx.0[ 4] += (t0[4] >= tt) as u16;
        idx.0[ 5] += (t0[5] >= tt) as u16;
        idx.0[ 6] += (t0[6] >= tt) as u16;
        idx.0[ 7] += (t0[7] >= tt) as u16;
        idx.0[ 8] += (t1[0] >= tt) as u16;
        idx.0[ 9] += (t1[1] >= tt) as u16;
        idx.0[10] += (t1[2] >= tt) as u16;
        idx.0[11] += (t1[3] >= tt) as u16;
        idx.0[12] += (t1[4] >= tt) as u16;
        idx.0[13] += (t1[5] >= tt) as u16;
        idx.0[14] += (t1[6] >= tt) as u16;
        idx.0[15] += (t1[7] >= tt) as u16;
    }
    gradient_lookup(ctx, &idx, t, &mut p.r, &mut p.g, &mut p.b, &mut p.a);

    p.next_stage();
}
fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) {
    let ctx = &p.ctx.evenly_spaced_2_stop_gradient;

    let t = join(&p.r, &p.g);
    round_f32_to_u16(
        mad(t, f32x16::splat(ctx.factor.r), f32x16::splat(ctx.bias.r)),
        mad(t, f32x16::splat(ctx.factor.g), f32x16::splat(ctx.bias.g)),
        mad(t, f32x16::splat(ctx.factor.b), f32x16::splat(ctx.bias.b)),
        mad(t, f32x16::splat(ctx.factor.a), f32x16::splat(ctx.bias.a)),
        &mut p.r, &mut p.g, &mut p.b, &mut p.a,
    );

    p.next_stage();
}

fn xy_to_radius(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let y = join(&p.b, &p.a);
    let x = (x*x + y*y).sqrt();
    split(&x, &mut p.r, &mut p.g);
    split(&y, &mut p.b, &mut p.a);

    p.next_stage();
}
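// Gathers the factor/bias of every lane's gradient stop, evaluates `t * factor + bias`
// per channel, and rounds the result back to 0..=255 u16 values.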
fn gradient_lookup(
    ctx: &super::GradientCtx, idx: &u16x16, t: f32x16,
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    macro_rules! gather {
        ($d:expr, $c:ident) => {
            f32x16(
                f32x8::from([
                    $d[idx.0[ 0] as usize].$c,
                    $d[idx.0[ 1] as usize].$c,
                    $d[idx.0[ 2] as usize].$c,
                    $d[idx.0[ 3] as usize].$c,
                    $d[idx.0[ 4] as usize].$c,
                    $d[idx.0[ 5] as usize].$c,
                    $d[idx.0[ 6] as usize].$c,
                    $d[idx.0[ 7] as usize].$c,
                ]),
                f32x8::from([
                    $d[idx.0[ 8] as usize].$c,
                    $d[idx.0[ 9] as usize].$c,
                    $d[idx.0[10] as usize].$c,
                    $d[idx.0[11] as usize].$c,
                    $d[idx.0[12] as usize].$c,
                    $d[idx.0[13] as usize].$c,
                    $d[idx.0[14] as usize].$c,
                    $d[idx.0[15] as usize].$c,
                ]),
            )
        };
    }

    let fr = gather!(&ctx.factors, r);
    let fg = gather!(&ctx.factors, g);
    let fb = gather!(&ctx.factors, b);
    let fa = gather!(&ctx.factors, a);

    let br = gather!(&ctx.biases, r);
    let bg = gather!(&ctx.biases, g);
    let bb = gather!(&ctx.biases, b);
    let ba = gather!(&ctx.biases, a);

    round_f32_to_u16(
        mad(t, fr, br),
        mad(t, fg, bg),
        mad(t, fb, bb),
        mad(t, fa, ba),
        r, g, b, a,
    );
}
#[inline(always)]
fn round_f32_to_u16(
    rf: f32x16, gf: f32x16, bf: f32x16, af: f32x16,
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    // RGB channels go through `normalize` before scaling to 0..=255; alpha does not.
    let rf = rf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let gf = gf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let bf = bf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let af = af * f32x16::splat(255.0) + f32x16::splat(0.5);

    rf.save_to_u16x16(r);
    gf.save_to_u16x16(g);
    bf.save_to_u16x16(b);
    af.save_to_u16x16(a);
}

pub fn just_return(_: &mut Pipeline) {
    // Intentionally empty: not calling `next_stage` terminates the pipeline.
}

pub fn null_fn(_: &mut Pipeline) {
    // Placeholder for stages that this pipeline does not implement.
}
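// Low-level helpers that move pixel data between `PremultipliedColorU8`/`u8` slices and
// the per-channel u16x16 registers.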
#[inline(always)]
fn load_8888(
    data: &[PremultipliedColorU8; STAGE_WIDTH],
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    *r = u16x16([
        data[ 0].red() as u16, data[ 1].red() as u16, data[ 2].red() as u16, data[ 3].red() as u16,
        data[ 4].red() as u16, data[ 5].red() as u16, data[ 6].red() as u16, data[ 7].red() as u16,
        data[ 8].red() as u16, data[ 9].red() as u16, data[10].red() as u16, data[11].red() as u16,
        data[12].red() as u16, data[13].red() as u16, data[14].red() as u16, data[15].red() as u16,
    ]);

    *g = u16x16([
        data[ 0].green() as u16, data[ 1].green() as u16, data[ 2].green() as u16, data[ 3].green() as u16,
        data[ 4].green() as u16, data[ 5].green() as u16, data[ 6].green() as u16, data[ 7].green() as u16,
        data[ 8].green() as u16, data[ 9].green() as u16, data[10].green() as u16, data[11].green() as u16,
        data[12].green() as u16, data[13].green() as u16, data[14].green() as u16, data[15].green() as u16,
    ]);

    *b = u16x16([
        data[ 0].blue() as u16, data[ 1].blue() as u16, data[ 2].blue() as u16, data[ 3].blue() as u16,
        data[ 4].blue() as u16, data[ 5].blue() as u16, data[ 6].blue() as u16, data[ 7].blue() as u16,
        data[ 8].blue() as u16, data[ 9].blue() as u16, data[10].blue() as u16, data[11].blue() as u16,
        data[12].blue() as u16, data[13].blue() as u16, data[14].blue() as u16, data[15].blue() as u16,
    ]);

    *a = u16x16([
        data[ 0].alpha() as u16, data[ 1].alpha() as u16, data[ 2].alpha() as u16, data[ 3].alpha() as u16,
        data[ 4].alpha() as u16, data[ 5].alpha() as u16, data[ 6].alpha() as u16, data[ 7].alpha() as u16,
        data[ 8].alpha() as u16, data[ 9].alpha() as u16, data[10].alpha() as u16, data[11].alpha() as u16,
        data[12].alpha() as u16, data[13].alpha() as u16, data[14].alpha() as u16, data[15].alpha() as u16,
    ]);
}

#[inline(always)]
fn load_8888_tail(
    tail: usize, data: &[PremultipliedColorU8],
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    let mut tmp = [PremultipliedColorU8::TRANSPARENT; STAGE_WIDTH];
    tmp[0..tail].copy_from_slice(&data[0..tail]);
    load_8888(&tmp, r, g, b, a);
}
#[inline(always)]
fn store_8888(
    r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16,
    data: &mut [PremultipliedColorU8; STAGE_WIDTH],
) {
    let r = r.as_slice();
    let g = g.as_slice();
    let b = b.as_slice();
    let a = a.as_slice();

    data[ 0] = PremultipliedColorU8::from_rgba_unchecked(r[ 0] as u8, g[ 0] as u8, b[ 0] as u8, a[ 0] as u8);
    data[ 1] = PremultipliedColorU8::from_rgba_unchecked(r[ 1] as u8, g[ 1] as u8, b[ 1] as u8, a[ 1] as u8);
    data[ 2] = PremultipliedColorU8::from_rgba_unchecked(r[ 2] as u8, g[ 2] as u8, b[ 2] as u8, a[ 2] as u8);
    data[ 3] = PremultipliedColorU8::from_rgba_unchecked(r[ 3] as u8, g[ 3] as u8, b[ 3] as u8, a[ 3] as u8);
    data[ 4] = PremultipliedColorU8::from_rgba_unchecked(r[ 4] as u8, g[ 4] as u8, b[ 4] as u8, a[ 4] as u8);
    data[ 5] = PremultipliedColorU8::from_rgba_unchecked(r[ 5] as u8, g[ 5] as u8, b[ 5] as u8, a[ 5] as u8);
    data[ 6] = PremultipliedColorU8::from_rgba_unchecked(r[ 6] as u8, g[ 6] as u8, b[ 6] as u8, a[ 6] as u8);
    data[ 7] = PremultipliedColorU8::from_rgba_unchecked(r[ 7] as u8, g[ 7] as u8, b[ 7] as u8, a[ 7] as u8);
    data[ 8] = PremultipliedColorU8::from_rgba_unchecked(r[ 8] as u8, g[ 8] as u8, b[ 8] as u8, a[ 8] as u8);
    data[ 9] = PremultipliedColorU8::from_rgba_unchecked(r[ 9] as u8, g[ 9] as u8, b[ 9] as u8, a[ 9] as u8);
    data[10] = PremultipliedColorU8::from_rgba_unchecked(r[10] as u8, g[10] as u8, b[10] as u8, a[10] as u8);
    data[11] = PremultipliedColorU8::from_rgba_unchecked(r[11] as u8, g[11] as u8, b[11] as u8, a[11] as u8);
    data[12] = PremultipliedColorU8::from_rgba_unchecked(r[12] as u8, g[12] as u8, b[12] as u8, a[12] as u8);
    data[13] = PremultipliedColorU8::from_rgba_unchecked(r[13] as u8, g[13] as u8, b[13] as u8, a[13] as u8);
    data[14] = PremultipliedColorU8::from_rgba_unchecked(r[14] as u8, g[14] as u8, b[14] as u8, a[14] as u8);
    data[15] = PremultipliedColorU8::from_rgba_unchecked(r[15] as u8, g[15] as u8, b[15] as u8, a[15] as u8);
}

#[inline(always)]
fn store_8888_tail(
    r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16,
    tail: usize, data: &mut [PremultipliedColorU8],
) {
    let r = r.as_slice();
    let g = g.as_slice();
    let b = b.as_slice();
    let a = a.as_slice();

    for i in 0..STAGE_WIDTH {
        data[i] = PremultipliedColorU8::from_rgba_unchecked(
            r[i] as u8, g[i] as u8, b[i] as u8, a[i] as u8,
        );

        if i + 1 == tail {
            break;
        }
    }
}
#[inline(always)]
fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) {
    *a = u16x16([
        data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16,
        data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16,
        data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16,
        data[12] as u16, data[13] as u16, data[14] as u16, data[15] as u16,
    ]);
}
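// 8-bit fixed-point helpers. Colors are kept in 0..=255, so multiplying two of them
// produces a value in 0..=255*255 that has to be rescaled: `div255` approximates
// `v / 255` as `(v + 255) >> 8`, `inv(v)` computes `255 - v` (i.e. `1 - v` in fixed
// point), and `lerp(from, to, t)` blends with an 8-bit weight `t`.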
#[inline(always)]
fn div255(v: u16x16) -> u16x16 {
    (v + u16x16::splat(255)) >> u16x16::splat(8)
}

#[inline(always)]
fn inv(v: u16x16) -> u16x16 {
    u16x16::splat(255) - v
}

#[inline(always)]
fn from_float(f: f32) -> u16x16 {
    u16x16::splat((f * 255.0 + 0.5) as u16)
}

#[inline(always)]
fn lerp(from: u16x16, to: u16x16, t: u16x16) -> u16x16 {
    div255(from * inv(t) + to * t)
}
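// `split` and `join` reinterpret bits rather than convert values: an f32x16 (64 bytes)
// is stored across two u16x16 registers and reassembled later, which is how the shader
// stages carry f32 coordinates in the integer color registers.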
#[inline(always)]
fn split(v: &f32x16, lo: &mut u16x16, hi: &mut u16x16) {
    let data: [u8; 64] = bytemuck::cast(*v);

    let d0: &mut [u8; 32] = bytemuck::cast_mut(&mut lo.0);
    let d1: &mut [u8; 32] = bytemuck::cast_mut(&mut hi.0);

    d0.copy_from_slice(&data[0..32]);
    d1.copy_from_slice(&data[32..64]);
}

#[inline(always)]
fn join(lo: &u16x16, hi: &u16x16) -> f32x16 {
    let d0: [u8; 32] = bytemuck::cast(lo.0);
    let d1: [u8; 32] = bytemuck::cast(hi.0);

    let mut v = f32x16::default();
    let data: &mut [u8; 64] = bytemuck::cast_mut(&mut v);

    data[0..32].copy_from_slice(&d0);
    data[32..64].copy_from_slice(&d1);

    v
}
#[inline(always)]
fn mad(f: f32x16, m: f32x16, a: f32x16) -> f32x16 {
    f * m + a
}