1use crate::{PremultipliedColorU8, SpreadMode, PixmapRef};
19
20use crate::geom::ScreenIntRect;
21use crate::pixmap::SubPixmapMut;
22use crate::wide::{f32x8, i32x8, u32x8};
23
/// Number of pixels (vector lanes) handled by one full pipeline invocation.
pub const STAGE_WIDTH: usize = 8;
25
26pub type StageFn = fn(p: &mut Pipeline);
27
28pub struct Pipeline<'a, 'b: 'a> {
29    index: usize,
30    functions: &'a [StageFn],
31    pixmap_src: PixmapRef<'a>,
32    pixmap_dst: &'a mut SubPixmapMut<'b>,
33    ctx: &'a mut super::Context, mask_ctx: super::MaskCtx<'a>,
35    aa_mask_ctx: super::AAMaskCtx,
36    r: f32x8,
37    g: f32x8,
38    b: f32x8,
39    a: f32x8,
40    dr: f32x8,
41    dg: f32x8,
42    db: f32x8,
43    da: f32x8,
44    tail: usize,
45    dx: usize,
46    dy: usize,
47}
48
49impl Pipeline<'_, '_> {
50    #[inline(always)]
51    fn next_stage(&mut self) {
52        let next: fn(&mut Self) = self.functions[self.index];
53        self.index += 1;
54        next(self);
55    }
56}
57
58pub const STAGES: &[StageFn; super::STAGES_COUNT] = &[
60    move_source_to_destination,
61    move_destination_to_source,
62    clamp_0,
63    clamp_a,
64    premultiply,
65    uniform_color,
66    seed_shader,
67    load_dst,
68    store,
69    load_dst_u8,
70    store_u8,
71    gather,
72    load_mask_u8,
73    mask_u8,
74    scale_u8,
75    lerp_u8,
76    scale_1_float,
77    lerp_1_float,
78    destination_atop,
79    destination_in,
80    destination_out,
81    destination_over,
82    source_atop,
83    source_in,
84    source_out,
85    source_over,
86    clear,
87    modulate,
88    multiply,
89    plus,
90    screen,
91    xor,
92    color_burn,
93    color_dodge,
94    darken,
95    difference,
96    exclusion,
97    hard_light,
98    lighten,
99    overlay,
100    soft_light,
101    hue,
102    saturation,
103    color,
104    luminosity,
105    source_over_rgba,
106    transform,
107    reflect,
108    repeat,
109    bilinear,
110    bicubic,
111    pad_x1,
112    reflect_x1,
113    repeat_x1,
114    gradient,
115    evenly_spaced_2_stop_gradient,
116    xy_to_radius,
117    xy_to_2pt_conical_focal_on_circle,
118    xy_to_2pt_conical_well_behaved,
119    xy_to_2pt_conical_greater,
120    mask_2pt_conical_degenerates,
121    apply_vector_mask,
122];
123
124pub fn fn_ptr(f: StageFn) -> *const () {
125    f as *const ()
126}
127
128#[inline(never)]
129pub fn start(
130    functions: &[StageFn],
131    functions_tail: &[StageFn],
132    rect: &ScreenIntRect,
133    aa_mask_ctx: super::AAMaskCtx,
134    mask_ctx: super::MaskCtx,
135    ctx: &mut super::Context,
136    pixmap_src: PixmapRef,
137    pixmap_dst: &mut SubPixmapMut,
138) {
139    let mut p = Pipeline {
140        index: 0,
141        functions: &[],
142        pixmap_src,
143        pixmap_dst,
144        mask_ctx,
145        aa_mask_ctx,
146        ctx,
147        r: f32x8::default(),
148        g: f32x8::default(),
149        b: f32x8::default(),
150        a: f32x8::default(),
151        dr: f32x8::default(),
152        dg: f32x8::default(),
153        db: f32x8::default(),
154        da: f32x8::default(),
155        tail: 0,
156        dx: 0,
157        dy: 0,
158    };
159
160    for y in rect.y()..rect.bottom() {
161        let mut x = rect.x() as usize;
162        let end = rect.right() as usize;
163
164        p.functions = functions;
165        while x + STAGE_WIDTH <= end {
166            p.index = 0;
167            p.dx = x;
168            p.dy = y as usize;
169            p.tail = STAGE_WIDTH;
170            p.next_stage();
171            x += STAGE_WIDTH;
172        }
173
174        if x != end {
175            p.index = 0;
176            p.functions = functions_tail;
177            p.dx = x;
178            p.dy = y as usize;
179            p.tail = end - x;
180            p.next_stage();
181        }
182    }
183}
184
185fn move_source_to_destination(p: &mut Pipeline) {
186    p.dr = p.r;
187    p.dg = p.g;
188    p.db = p.b;
189    p.da = p.a;
190
191    p.next_stage();
192}
193
194fn premultiply(p: &mut Pipeline) {
195    p.r *= p.a;
196    p.g *= p.a;
197    p.b *= p.a;
198
199    p.next_stage();
200}
201
202fn move_destination_to_source(p: &mut Pipeline) {
203    p.r = p.dr;
204    p.g = p.dg;
205    p.b = p.db;
206    p.a = p.da;
207
208    p.next_stage();
209}
210
211fn clamp_0(p: &mut Pipeline) {
212    p.r = p.r.max(f32x8::default());
213    p.g = p.g.max(f32x8::default());
214    p.b = p.b.max(f32x8::default());
215    p.a = p.a.max(f32x8::default());
216
217    p.next_stage();
218}
219
220fn clamp_a(p: &mut Pipeline) {
221    p.r = p.r.min(f32x8::splat(1.0));
222    p.g = p.g.min(f32x8::splat(1.0));
223    p.b = p.b.min(f32x8::splat(1.0));
224    p.a = p.a.min(f32x8::splat(1.0));
225
226    p.next_stage();
227}
228
229fn uniform_color(p: &mut Pipeline) {
230    let ctx = &p.ctx.uniform_color;
231    p.r = f32x8::splat(ctx.r);
232    p.g = f32x8::splat(ctx.g);
233    p.b = f32x8::splat(ctx.b);
234    p.a = f32x8::splat(ctx.a);
235
236    p.next_stage();
237}
238
239fn seed_shader(p: &mut Pipeline) {
240    let iota = f32x8::from([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]);
241
242    p.r = f32x8::splat(p.dx as f32) + iota;
243    p.g = f32x8::splat(p.dy as f32 + 0.5);
244    p.b = f32x8::splat(1.0);
245    p.a = f32x8::default();
246
247    p.dr = f32x8::default();
248    p.dg = f32x8::default();
249    p.db = f32x8::default();
250    p.da = f32x8::default();
251
252    p.next_stage();
253}
254
255pub fn load_dst(p: &mut Pipeline) {
256    load_8888(p.pixmap_dst.slice4_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
257    p.next_stage();
258}
259
260pub fn load_dst_tail(p: &mut Pipeline) {
261    load_8888_tail(p.tail, p.pixmap_dst.slice_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
262    p.next_stage();
263}
264
265pub fn store(p: &mut Pipeline) {
266    store_8888(&p.r, &p.g, &p.b, &p.a, p.pixmap_dst.slice4_at_xy(p.dx, p.dy));
267    p.next_stage();
268}
269
270pub fn store_tail(p: &mut Pipeline) {
271    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, p.pixmap_dst.slice_at_xy(p.dx, p.dy));
272    p.next_stage();
273}
274
275pub fn load_dst_u8(_: &mut Pipeline) {
277    }
279
280pub fn load_dst_u8_tail(_: &mut Pipeline) {
281    }
283
284pub fn store_u8(_: &mut Pipeline) {
285    }
287
288pub fn store_u8_tail(_: &mut Pipeline) {
289    }
291
292pub fn gather(p: &mut Pipeline) {
293    let ix = gather_ix(p.pixmap_src, p.r, p.g);
294    load_8888(&p.pixmap_src.gather(ix), &mut p.r, &mut p.g, &mut p.b, &mut p.a);
295
296    p.next_stage();
297}
298
299#[inline(always)]
300fn gather_ix(pixmap: PixmapRef, mut x: f32x8, mut y: f32x8) -> u32x8 {
301    let w = ulp_sub(pixmap.width() as f32);
303    let h = ulp_sub(pixmap.height() as f32);
304    x = x.max(f32x8::default()).min(f32x8::splat(w));
305    y = y.max(f32x8::default()).min(f32x8::splat(h));
306
307    (y.trunc_int() * i32x8::splat(pixmap.width() as i32) + x.trunc_int()).to_u32x8_bitcast()
308}
309
310#[inline(always)]
311fn ulp_sub(v: f32) -> f32 {
312    bytemuck::cast::<u32, f32>(bytemuck::cast::<f32, u32>(v) - 1)
314}
315
316fn load_mask_u8(_: &mut Pipeline) {
317    }
319
320fn mask_u8(p: &mut Pipeline) {
321    let offset = p.mask_ctx.offset(p.dx, p.dy);
322    let mut c = [0.0; 8];
323    for i in 0..p.tail {
324        c[i] = p.mask_ctx.data[offset + i] as f32;
325    }
326    let c = f32x8::from(c) / f32x8::splat(255.0);
327
328    if c == f32x8::default() {
329        return;
330    }
331
332    p.r *= c;
333    p.g *= c;
334    p.b *= c;
335    p.a *= c;
336
337    p.next_stage();
338}
339
340fn scale_u8(p: &mut Pipeline) {
341    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
343    let c = f32x8::from([data[0] as f32, data[1] as f32, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
344    let c = c / f32x8::splat(255.0);
345
346    p.r *= c;
347    p.g *= c;
348    p.b *= c;
349    p.a *= c;
350
351    p.next_stage();
352}
353
354fn lerp_u8(p: &mut Pipeline) {
355    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
357    let c = f32x8::from([data[0] as f32, data[1] as f32, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
358    let c = c / f32x8::splat(255.0);
359
360    p.r = lerp(p.dr, p.r, c);
361    p.g = lerp(p.dg, p.g, c);
362    p.b = lerp(p.db, p.b, c);
363    p.a = lerp(p.da, p.a, c);
364
365    p.next_stage();
366}
367
368fn scale_1_float(p: &mut Pipeline) {
369    let c = f32x8::splat(p.ctx.current_coverage);
370    p.r *= c;
371    p.g *= c;
372    p.b *= c;
373    p.a *= c;
374
375    p.next_stage();
376}
377
378fn lerp_1_float(p: &mut Pipeline) {
379    let c = f32x8::splat(p.ctx.current_coverage);
380    p.r = lerp(p.dr, p.r, c);
381    p.g = lerp(p.dg, p.g, c);
382    p.b = lerp(p.db, p.b, c);
383    p.a = lerp(p.da, p.a, c);
384
385    p.next_stage();
386}
387
// Defines a blend stage that applies the same formula to all four channels.
//
// `$f` is called as `$f(source, destination, source_alpha, destination_alpha)`.
// The alphas are captured before any channel is overwritten, so the alpha
// channel itself is also blended with the original alpha values.
macro_rules! blend_fn {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            let (sa, da) = (p.a, p.da);

            p.r = $f(p.r, p.dr, sa, da);
            p.g = $f(p.g, p.dg, sa, da);
            p.b = $f(p.b, p.db, sa, da);
            p.a = $f(sa, da, sa, da);

            p.next_stage();
        }
    };
}
400
401blend_fn!(clear,            |_, _,  _,  _| f32x8::default());
402blend_fn!(source_atop,      |s, d, sa, da| s * da + d * inv(sa));
403blend_fn!(destination_atop, |s, d, sa, da| d * sa + s * inv(da));
404blend_fn!(source_in,        |s, _,  _, da| s * da);
405blend_fn!(destination_in,   |_, d, sa,  _| d * sa);
406blend_fn!(source_out,       |s, _,  _, da| s * inv(da));
407blend_fn!(destination_out,  |_, d, sa,  _| d * inv(sa));
408blend_fn!(source_over,      |s, d, sa,  _| mad(d, inv(sa), s));
409blend_fn!(destination_over, |s, d,  _, da| mad(s, inv(da), d));
410blend_fn!(modulate,         |s, d,  _,  _| s * d);
411blend_fn!(multiply,         |s, d, sa, da| s * inv(da) + d * inv(sa) + s * d);
412blend_fn!(screen,           |s, d,  _,  _| s + d - s * d);
413blend_fn!(xor,              |s, d, sa, da| s * inv(da) + d * inv(sa));
414
415blend_fn!(plus, |s: f32x8, d: f32x8, _, _| (s + d).min(f32x8::splat(1.0)));
417
// Defines a blend stage that applies `$f` to the color channels only.
//
// `$f` is called as `$f(source, destination, source_alpha, destination_alpha)`
// for r, g and b; the alpha channel always uses the source-over formula
// `da * (1 - sa) + sa`.
macro_rules! blend_fn2 {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            let (sa, da) = (p.a, p.da);

            p.r = $f(p.r, p.dr, sa, da);
            p.g = $f(p.g, p.dg, sa, da);
            p.b = $f(p.b, p.db, sa, da);
            p.a = mad(da, inv(sa), sa);

            p.next_stage();
        }
    };
}
431
432blend_fn2!(darken,      |s: f32x8, d, sa, da: f32x8| s + d - (s * da).max(d * sa));
433blend_fn2!(lighten,     |s: f32x8, d, sa, da: f32x8| s + d - (s * da).min(d * sa));
434blend_fn2!(difference,  |s: f32x8, d, sa, da: f32x8| s + d - two((s * da).min(d * sa)));
435blend_fn2!(exclusion,   |s: f32x8, d,  _,  _| s + d - two(s * d));
436
437blend_fn2!(color_burn, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
438    d.cmp_eq(da).blend(
439        d + s * inv(da),
440        s.cmp_eq(f32x8::default()).blend(
441            d * inv(sa),
442            sa * (da - da.min((da - d) * sa * s.recip_fast())) + s * inv(da) + d * inv(sa)
443        )
444    )
445);
446
447blend_fn2!(color_dodge, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
448    d.cmp_eq(f32x8::default()).blend(
449        s * inv(da),
450        s.cmp_eq(sa).blend(
451            s + d * inv(sa),
452            sa * da.min((d * sa) * (sa - s).recip_fast()) + s * inv(da) + d * inv(sa)
453        )
454    )
455);
456
457blend_fn2!(hard_light, |s: f32x8, d: f32x8, sa, da|
458    s * inv(da) + d * inv(sa) + two(s).cmp_le(sa).blend(
459        two(s * d),
460        sa * da - two((da - d) * (sa - s))
461    )
462);
463
464blend_fn2!(overlay, |s: f32x8, d: f32x8, sa, da|
465    s * inv(da) + d * inv(sa) + two(d).cmp_le(da).blend(
466        two(s * d),
467        sa * da - two((da - d) * (sa - s))
468    )
469);
470
471blend_fn2!(soft_light, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| {
472    let m  = da.cmp_gt(f32x8::default()).blend(d / da, f32x8::default());
473    let s2 = two(s);
474    let m4 = two(two(m));
475
476    let dark_src = d * (sa + (s2 - sa) * (f32x8::splat(1.0) - m));
481    let dark_dst = (m4 * m4 + m4) * (m - f32x8::splat(1.0)) + f32x8::splat(7.0) * m;
482    let lite_dst = m.sqrt() - m;
483    let lite_src = d * sa + da * (s2 - sa)
484        * two(two(d)).cmp_le(da).blend(dark_dst, lite_dst); s * inv(da) + d * inv(sa) + s2.cmp_le(sa).blend(dark_src, lite_src) });
488
// Defines a blend stage from a kernel that needs all channels at once.
//
// `$f` receives (r, g, b, a, dr, dg, db, da) and returns the new (r, g, b, a).
macro_rules! blend_fn3 {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            let blended = $f(p.r, p.g, p.b, p.a, p.dr, p.dg, p.db, p.da);
            p.r = blended.0;
            p.g = blended.1;
            p.b = blended.2;
            p.a = blended.3;

            p.next_stage();
        }
    };
}
510
511blend_fn3!(hue, hue_k);
512
513#[inline(always)]
514fn hue_k(
515    r: f32x8, g: f32x8, b: f32x8, a: f32x8,
516    dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
517) -> (f32x8, f32x8, f32x8, f32x8) {
518    let rr = &mut (r * a);
519    let gg = &mut (g * a);
520    let bb = &mut (b * a);
521
522    set_sat(rr, gg, bb, sat(dr, dg, db) * a);
523    set_lum(rr, gg, bb, lum(dr, dg, db) * a);
524    clip_color(rr, gg, bb, a * da);
525
526    let r = r * inv(da) + dr * inv(a) + *rr;
527    let g = g * inv(da) + dg * inv(a) + *gg;
528    let b = b * inv(da) + db * inv(a) + *bb;
529    let a = a + da - a * da;
530
531    (r, g, b, a)
532}
533
534blend_fn3!(saturation, saturation_k);
535
536#[inline(always)]
537fn saturation_k(
538    r: f32x8, g: f32x8, b: f32x8, a: f32x8,
539    dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
540) -> (f32x8, f32x8, f32x8, f32x8) {
541    let rr = &mut (dr * a);
542    let gg = &mut (dg * a);
543    let bb = &mut (db * a);
544
545    set_sat(rr, gg, bb, sat(r, g, b) * da);
546    set_lum(rr, gg, bb, lum(dr, dg, db) * a); clip_color(rr, gg, bb, a * da);
548
549    let r = r * inv(da) + dr * inv(a) + *rr;
550    let g = g * inv(da) + dg * inv(a) + *gg;
551    let b = b * inv(da) + db * inv(a) + *bb;
552    let a = a + da - a * da;
553
554    (r, g, b, a)
555}
556
557blend_fn3!(color, color_k);
558
559#[inline(always)]
560fn color_k(
561    r: f32x8, g: f32x8, b: f32x8, a: f32x8,
562    dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
563) -> (f32x8, f32x8, f32x8, f32x8) {
564    let rr = &mut (r * da);
565    let gg = &mut (g * da);
566    let bb = &mut (b * da);
567
568    set_lum(rr, gg, bb, lum(dr, dg, db) * a);
569    clip_color(rr, gg, bb, a * da);
570
571    let r = r * inv(da) + dr * inv(a) + *rr;
572    let g = g * inv(da) + dg * inv(a) + *gg;
573    let b = b * inv(da) + db * inv(a) + *bb;
574    let a = a + da - a * da;
575
576    (r, g, b, a)
577}
578
579blend_fn3!(luminosity, luminosity_k);
580
581#[inline(always)]
582fn luminosity_k(
583    r: f32x8, g: f32x8, b: f32x8, a: f32x8,
584    dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
585) -> (f32x8, f32x8, f32x8, f32x8) {
586    let rr = &mut (dr * a);
587    let gg = &mut (dg * a);
588    let bb = &mut (db * a);
589
590    set_lum(rr, gg, bb, lum(r, g, b) * da);
591    clip_color(rr, gg, bb, a * da);
592
593    let r = r * inv(da) + dr * inv(a) + *rr;
594    let g = g * inv(da) + dg * inv(a) + *gg;
595    let b = b * inv(da) + db * inv(a) + *bb;
596    let a = a + da - a * da;
597
598    (r, g, b, a)
599}
600
601#[inline(always)]
602fn sat(r: f32x8, g: f32x8, b: f32x8) -> f32x8 {
603    r.max(g.max(b)) - r.min(g.min(b))
604}
605
606#[inline(always)]
607fn lum(r: f32x8, g: f32x8, b: f32x8) -> f32x8 {
608    r * f32x8::splat(0.30) + g * f32x8::splat(0.59) + b * f32x8::splat(0.11)
609}
610
611#[inline(always)]
612fn set_sat(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, s: f32x8) {
613    let mn  = r.min(g.min(*b));
614    let mx  = r.max(g.max(*b));
615    let sat = mx - mn;
616
617    let scale = |c| sat.cmp_eq(f32x8::default())
619                       .blend(f32x8::default(), (c - mn) * s / sat);
620
621    *r = scale(*r);
622    *g = scale(*g);
623    *b = scale(*b);
624}
625
626#[inline(always)]
627fn set_lum(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, l: f32x8) {
628    let diff = l - lum(*r, *g, *b);
629    *r += diff;
630    *g += diff;
631    *b += diff;
632}
633
634#[inline(always)]
635fn clip_color(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: f32x8) {
636    let mn = r.min(g.min(*b));
637    let mx = r.max(g.max(*b));
638    let l  = lum(*r, *g, *b);
639
640    let clip = |mut c| {
641        c = mx.cmp_ge(f32x8::default()).blend(c, l + (c - l) * l / (l - mn));
642        c = mx.cmp_gt(a).blend(l + (c - l) * (a - l) / (mx - l), c);
643        c = c.max(f32x8::default()); c
645    };
646
647    *r = clip(*r);
648    *g = clip(*g);
649    *b = clip(*b);
650}
651
652pub fn source_over_rgba(p: &mut Pipeline) {
653    let pixels = p.pixmap_dst.slice4_at_xy(p.dx, p.dy);
654    load_8888(pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
655    p.r = mad(p.dr, inv(p.a), p.r);
656    p.g = mad(p.dg, inv(p.a), p.g);
657    p.b = mad(p.db, inv(p.a), p.b);
658    p.a = mad(p.da, inv(p.a), p.a);
659    store_8888(&p.r, &p.g, &p.b, &p.a, pixels);
660
661    p.next_stage();
662}
663
664pub fn source_over_rgba_tail(p: &mut Pipeline) {
665    let pixels = p.pixmap_dst.slice_at_xy(p.dx, p.dy);
666    load_8888_tail(p.tail, pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
667    p.r = mad(p.dr, inv(p.a), p.r);
668    p.g = mad(p.dg, inv(p.a), p.g);
669    p.b = mad(p.db, inv(p.a), p.b);
670    p.a = mad(p.da, inv(p.a), p.a);
671    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, pixels);
672
673    p.next_stage();
674}
675
676fn transform(p: &mut Pipeline) {
677    let ts = &p.ctx.transform;
678
679    let tr = mad(p.r, f32x8::splat(ts.sx), mad(p.g, f32x8::splat(ts.kx), f32x8::splat(ts.tx)));
680    let tg = mad(p.r, f32x8::splat(ts.ky), mad(p.g, f32x8::splat(ts.sy), f32x8::splat(ts.ty)));
681    p.r = tr;
682    p.g = tg;
683
684    p.next_stage();
685}
686
687fn reflect(p: &mut Pipeline) {
692    let ctx = &p.ctx.limit_x;
693    p.r = exclusive_reflect(p.r, ctx.scale, ctx.inv_scale);
694
695    let ctx = &p.ctx.limit_y;
696    p.g = exclusive_reflect(p.g, ctx.scale, ctx.inv_scale);
697
698    p.next_stage();
699}
700
701#[inline(always)]
702fn exclusive_reflect(v: f32x8, limit: f32, inv_limit: f32) -> f32x8 {
703    let limit = f32x8::splat(limit);
704    let inv_limit = f32x8::splat(inv_limit);
705    ((v - limit) - (limit + limit)
706        * ((v - limit) * (inv_limit * f32x8::splat(0.5))).floor() - limit).abs()
707}
708
709fn repeat(p: &mut Pipeline) {
710    let ctx = &p.ctx.limit_x;
711    p.r = exclusive_repeat(p.r, ctx.scale, ctx.inv_scale);
712
713    let ctx = &p.ctx.limit_y;
714    p.g = exclusive_repeat(p.g, ctx.scale, ctx.inv_scale);
715
716    p.next_stage();
717}
718
719#[inline(always)]
720fn exclusive_repeat(v: f32x8, limit: f32, inv_limit: f32) -> f32x8 {
721    v - (v * f32x8::splat(inv_limit)).floor() * f32x8::splat(limit)
722}
723
724fn bilinear(p: &mut Pipeline) {
725    let x = p.r;
726    let fx = (x + f32x8::splat(0.5)).fract();
727    let y = p.g;
728    let fy = (y + f32x8::splat(0.5)).fract();
729    let one = f32x8::splat(1.0);
730    let wx = [one - fx, fx];
731    let wy = [one - fy, fy];
732
733    sampler_2x2(p.pixmap_src, &p.ctx.sampler, x, y, &wx, &wy, &mut p.r, &mut p.g, &mut p.b, &mut p.a);
734
735    p.next_stage();
736}
737
738fn bicubic(p: &mut Pipeline) {
739    let x = p.r;
740    let fx = (x + f32x8::splat(0.5)).fract();
741    let y = p.g;
742    let fy = (y + f32x8::splat(0.5)).fract();
743    let one = f32x8::splat(1.0);
744    let wx = [bicubic_far(one - fx), bicubic_near(one - fx), bicubic_near(fx), bicubic_far(fx)];
745    let wy = [bicubic_far(one - fy), bicubic_near(one - fy), bicubic_near(fy), bicubic_far(fy)];
746
747    sampler_4x4(p.pixmap_src, &p.ctx.sampler, x, y, &wx, &wy, &mut p.r, &mut p.g, &mut p.b, &mut p.a);
748
749    p.next_stage();
750}
751
752#[inline(always)]
758fn bicubic_near(t: f32x8) -> f32x8 {
759    mad(
761        t,
762        mad(t,
763            mad(
764                f32x8::splat(-21.0/18.0),
765                t,
766                f32x8::splat(27.0/18.0),
767            ),
768            f32x8::splat(9.0/18.0),
769        ),
770        f32x8::splat(1.0/18.0),
771    )
772}
773
774#[inline(always)]
775fn bicubic_far(t: f32x8) -> f32x8 {
776    (t * t) * mad(f32x8::splat(7.0/18.0), t, f32x8::splat(-6.0/18.0))
778}
779
780#[inline(always)]
781fn sampler_2x2(
782    pixmap: PixmapRef,
783    ctx: &super::SamplerCtx,
784    cx: f32x8, cy: f32x8,
785    wx: &[f32x8; 2], wy: &[f32x8; 2],
786    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
787) {
788    *r = f32x8::default();
789    *g = f32x8::default();
790    *b = f32x8::default();
791    *a = f32x8::default();
792
793    let one = f32x8::splat(1.0);
794    let start = -0.5;
795    let mut y = cy + f32x8::splat(start);
796    for j in 0..2 {
797        let mut x = cx + f32x8::splat(start);
798        for i in 0..2 {
799            let mut rr = f32x8::default();
800            let mut gg = f32x8::default();
801            let mut bb = f32x8::default();
802            let mut aa = f32x8::default();
803            sample(pixmap, ctx, x,y, &mut rr, &mut gg, &mut bb, &mut aa);
804
805            let w = wx[i] * wy[j];
806            *r = mad(w, rr, *r);
807            *g = mad(w, gg, *g);
808            *b = mad(w, bb, *b);
809            *a = mad(w, aa, *a);
810
811            x += one;
812        }
813
814        y += one;
815    }
816}
817
818#[inline(always)]
819fn sampler_4x4(
820    pixmap: PixmapRef,
821    ctx: &super::SamplerCtx,
822    cx: f32x8, cy: f32x8,
823    wx: &[f32x8; 4], wy: &[f32x8; 4],
824    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
825) {
826    *r = f32x8::default();
827    *g = f32x8::default();
828    *b = f32x8::default();
829    *a = f32x8::default();
830
831    let one = f32x8::splat(1.0);
832    let start = -1.5;
833    let mut y = cy + f32x8::splat(start);
834    for j in 0..4 {
835        let mut x = cx + f32x8::splat(start);
836        for i in 0..4 {
837            let mut rr = f32x8::default();
838            let mut gg = f32x8::default();
839            let mut bb = f32x8::default();
840            let mut aa = f32x8::default();
841            sample(pixmap, ctx, x,y, &mut rr, &mut gg, &mut bb, &mut aa);
842
843            let w = wx[i] * wy[j];
844            *r = mad(w, rr, *r);
845            *g = mad(w, gg, *g);
846            *b = mad(w, bb, *b);
847            *a = mad(w, aa, *a);
848
849            x += one;
850        }
851
852        y += one;
853    }
854}
855
856#[inline(always)]
857fn sample(
858    pixmap: PixmapRef, ctx: &super::SamplerCtx, mut x: f32x8, mut y: f32x8,
859    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
860) {
861    x = tile(x, ctx.spread_mode, pixmap.width() as f32, ctx.inv_width);
862    y = tile(y, ctx.spread_mode, pixmap.height() as f32, ctx.inv_height);
863
864    let ix = gather_ix(pixmap, x, y);
865    load_8888(&pixmap.gather(ix), r, g, b, a);
866}
867
868#[inline(always)]
869fn tile(v: f32x8, mode: SpreadMode, limit: f32, inv_limit: f32) -> f32x8 {
870    match mode {
871        SpreadMode::Pad => v,
872        SpreadMode::Repeat => exclusive_repeat(v, limit, inv_limit),
873        SpreadMode::Reflect => exclusive_reflect(v, limit, inv_limit),
874    }
875}
876
877fn pad_x1(p: &mut Pipeline) {
878    p.r = p.r.normalize();
879
880    p.next_stage();
881}
882
883fn reflect_x1(p: &mut Pipeline) {
884    p.r = (
885        (p.r - f32x8::splat(1.0))
886            - two(((p.r - f32x8::splat(1.0)) * f32x8::splat(0.5)).floor())
887            - f32x8::splat(1.0)
888    ).abs().normalize();
889
890    p.next_stage();
891}
892
893fn repeat_x1(p: &mut Pipeline) {
894    p.r = (p.r - p.r.floor()).normalize();
895
896    p.next_stage();
897}
898
899fn gradient(p: &mut Pipeline) {
900    let ctx = &p.ctx.gradient;
901
902    let t: [f32; 8] = p.r.into();
904    let mut idx = u32x8::default();
905    for i in 1..ctx.len {
906        let tt = ctx.t_values[i].get();
907        let n: u32x8 = bytemuck::cast([
908            (t[0] >= tt) as u32,
909            (t[1] >= tt) as u32,
910            (t[2] >= tt) as u32,
911            (t[3] >= tt) as u32,
912            (t[4] >= tt) as u32,
913            (t[5] >= tt) as u32,
914            (t[6] >= tt) as u32,
915            (t[7] >= tt) as u32,
916        ]);
917        idx = idx + n;
918    }
919    gradient_lookup(ctx, &idx, p.r, &mut p.r, &mut p.g, &mut p.b, &mut p.a);
920
921    p.next_stage();
922}
923
924fn gradient_lookup(
925    ctx: &super::GradientCtx, idx: &u32x8, t: f32x8,
926    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
927) {
928    let idx: [u32; 8] = bytemuck::cast(*idx);
929
930    macro_rules! gather {
931        ($d:expr, $c:ident) => {
932            f32x8::from([
935                $d[idx[0] as usize].$c,
936                $d[idx[1] as usize].$c,
937                $d[idx[2] as usize].$c,
938                $d[idx[3] as usize].$c,
939                $d[idx[4] as usize].$c,
940                $d[idx[5] as usize].$c,
941                $d[idx[6] as usize].$c,
942                $d[idx[7] as usize].$c,
943            ])
944        };
945    }
946
947    let fr = gather!(&ctx.factors, r);
948    let fg = gather!(&ctx.factors, g);
949    let fb = gather!(&ctx.factors, b);
950    let fa = gather!(&ctx.factors, a);
951
952    let br = gather!(&ctx.biases, r);
953    let bg = gather!(&ctx.biases, g);
954    let bb = gather!(&ctx.biases, b);
955    let ba = gather!(&ctx.biases, a);
956
957    *r = mad(t, fr, br);
958    *g = mad(t, fg, bg);
959    *b = mad(t, fb, bb);
960    *a = mad(t, fa, ba);
961}
962
963fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) {
964    let ctx = &p.ctx.evenly_spaced_2_stop_gradient;
965
966    let t = p.r;
967    p.r = mad(t, f32x8::splat(ctx.factor.r), f32x8::splat(ctx.bias.r));
968    p.g = mad(t, f32x8::splat(ctx.factor.g), f32x8::splat(ctx.bias.g));
969    p.b = mad(t, f32x8::splat(ctx.factor.b), f32x8::splat(ctx.bias.b));
970    p.a = mad(t, f32x8::splat(ctx.factor.a), f32x8::splat(ctx.bias.a));
971
972    p.next_stage();
973}
974
975fn xy_to_radius(p: &mut Pipeline) {
976    let x2 = p.r * p.r;
977    let y2 = p.g * p.g;
978    p.r = (x2 + y2).sqrt();
979
980    p.next_stage();
981}
982
983fn xy_to_2pt_conical_focal_on_circle(p: &mut Pipeline) {
984    let x = p.r;
985    let y = p.g;
986    p.r = x + y * y / x;
987
988    p.next_stage();
989}
990
991fn xy_to_2pt_conical_well_behaved(p: &mut Pipeline) {
992    let ctx = &p.ctx.two_point_conical_gradient;
993
994    let x = p.r;
995    let y = p.g;
996    p.r = (x * x + y * y).sqrt() - x * f32x8::splat(ctx.p0);
997
998    p.next_stage();
999}
1000
1001fn xy_to_2pt_conical_greater(p: &mut Pipeline) {
1002    let ctx = &p.ctx.two_point_conical_gradient;
1003
1004    let x = p.r;
1005    let y = p.g;
1006    p.r = (x * x - y * y).sqrt() - x * f32x8::splat(ctx.p0);
1007
1008    p.next_stage();
1009}
1010
1011fn mask_2pt_conical_degenerates(p: &mut Pipeline) {
1012    let ctx = &mut p.ctx.two_point_conical_gradient;
1013
1014    let t = p.r;
1015    let is_degenerate = t.cmp_le(f32x8::default()) | t.cmp_ne(t);
1016    p.r = is_degenerate.blend(f32x8::default(), t);
1017
1018    let is_not_degenerate = !is_degenerate.to_u32x8_bitcast();
1019    let is_not_degenerate: [u32; 8] = bytemuck::cast(is_not_degenerate);
1020    ctx.mask = bytemuck::cast([
1021        if is_not_degenerate[0] != 0 { !0 } else { 0 },
1022        if is_not_degenerate[1] != 0 { !0 } else { 0 },
1023        if is_not_degenerate[2] != 0 { !0 } else { 0 },
1024        if is_not_degenerate[3] != 0 { !0 } else { 0 },
1025        if is_not_degenerate[4] != 0 { !0 } else { 0 },
1026        if is_not_degenerate[5] != 0 { !0 } else { 0 },
1027        if is_not_degenerate[6] != 0 { !0 } else { 0 },
1028        if is_not_degenerate[7] != 0 { !0 } else { 0 },
1029    ]);
1030
1031    p.next_stage();
1032}
1033
1034fn apply_vector_mask(p: &mut Pipeline) {
1035    let ctx = &p.ctx.two_point_conical_gradient;
1036
1037    p.r = (p.r.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
1038    p.g = (p.g.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
1039    p.b = (p.b.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
1040    p.a = (p.a.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
1041
1042    p.next_stage();
1043}
1044
1045pub fn just_return(_: &mut Pipeline) {
1046    }
1048
1049#[inline(always)]
1050fn load_8888(
1051    data: &[PremultipliedColorU8; STAGE_WIDTH],
1052    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
1053) {
1054    const FACTOR: f32 = 1.0 / 255.0;
1057
1058    *r = f32x8::from([
1059        data[0].red() as f32 * FACTOR, data[1].red() as f32 * FACTOR,
1060        data[2].red() as f32 * FACTOR, data[3].red() as f32 * FACTOR,
1061        data[4].red() as f32 * FACTOR, data[5].red() as f32 * FACTOR,
1062        data[6].red() as f32 * FACTOR, data[7].red() as f32 * FACTOR,
1063    ]);
1064
1065    *g = f32x8::from([
1066        data[0].green() as f32 * FACTOR, data[1].green() as f32 * FACTOR,
1067        data[2].green() as f32 * FACTOR, data[3].green() as f32 * FACTOR,
1068        data[4].green() as f32 * FACTOR, data[5].green() as f32 * FACTOR,
1069        data[6].green() as f32 * FACTOR, data[7].green() as f32 * FACTOR,
1070    ]);
1071
1072    *b = f32x8::from([
1073        data[0].blue() as f32 * FACTOR, data[1].blue() as f32 * FACTOR,
1074        data[2].blue() as f32 * FACTOR, data[3].blue() as f32 * FACTOR,
1075        data[4].blue() as f32 * FACTOR, data[5].blue() as f32 * FACTOR,
1076        data[6].blue() as f32 * FACTOR, data[7].blue() as f32 * FACTOR,
1077    ]);
1078
1079    *a = f32x8::from([
1080        data[0].alpha() as f32 * FACTOR, data[1].alpha() as f32 * FACTOR,
1081        data[2].alpha() as f32 * FACTOR, data[3].alpha() as f32 * FACTOR,
1082        data[4].alpha() as f32 * FACTOR, data[5].alpha() as f32 * FACTOR,
1083        data[6].alpha() as f32 * FACTOR, data[7].alpha() as f32 * FACTOR,
1084    ]);
1085}
1086
1087#[inline(always)]
1088fn load_8888_tail(
1089    tail: usize, data: &[PremultipliedColorU8],
1090    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
1091) {
1092    let mut tmp = [PremultipliedColorU8::TRANSPARENT; STAGE_WIDTH];
1095    tmp[0..tail].copy_from_slice(&data[0..tail]);
1096    load_8888(&tmp, r, g, b, a);
1097}
1098
1099#[inline(always)]
1100fn store_8888(
1101    r: &f32x8, g: &f32x8, b: &f32x8, a: &f32x8,
1102    data: &mut [PremultipliedColorU8; STAGE_WIDTH],
1103) {
1104    let r: [i32; 8] = unnorm(r).into();
1105    let g: [i32; 8] = unnorm(g).into();
1106    let b: [i32; 8] = unnorm(b).into();
1107    let a: [i32; 8] = unnorm(a).into();
1108
1109    let conv = |rr, gg, bb, aa|
1110        PremultipliedColorU8::from_rgba_unchecked(rr as u8, gg as u8, bb as u8, aa as u8);
1111
1112    data[0] = conv(r[0], g[0], b[0], a[0]);
1113    data[1] = conv(r[1], g[1], b[1], a[1]);
1114    data[2] = conv(r[2], g[2], b[2], a[2]);
1115    data[3] = conv(r[3], g[3], b[3], a[3]);
1116    data[4] = conv(r[4], g[4], b[4], a[4]);
1117    data[5] = conv(r[5], g[5], b[5], a[5]);
1118    data[6] = conv(r[6], g[6], b[6], a[6]);
1119    data[7] = conv(r[7], g[7], b[7], a[7]);
1120}
1121
1122#[inline(always)]
1123fn store_8888_tail(
1124    r: &f32x8, g: &f32x8, b: &f32x8, a: &f32x8,
1125    tail: usize, data: &mut [PremultipliedColorU8],
1126) {
1127    let r: [i32; 8] = unnorm(r).into();
1128    let g: [i32; 8] = unnorm(g).into();
1129    let b: [i32; 8] = unnorm(b).into();
1130    let a: [i32; 8] = unnorm(a).into();
1131
1132    for i in 0..STAGE_WIDTH {
1136        data[i] = PremultipliedColorU8::from_rgba_unchecked(
1137            r[i] as u8, g[i] as u8, b[i] as u8, a[i] as u8,
1138        );
1139
1140        if i + 1 == tail {
1141            break;
1142        }
1143    }
1144}
1145
1146#[inline(always)]
1147fn unnorm(v: &f32x8) -> i32x8 {
1148    (v.max(f32x8::default()).min(f32x8::splat(1.0)) * f32x8::splat(255.0)).round_int()
1149}
1150
1151#[inline(always)]
1152fn inv(v: f32x8) -> f32x8 {
1153    f32x8::splat(1.0) - v
1154}
1155
1156#[inline(always)]
1157fn two(v: f32x8) -> f32x8 {
1158    v + v
1159}
1160
1161#[inline(always)]
1162fn mad(f: f32x8, m: f32x8, a: f32x8) -> f32x8 {
1163    f * m + a
1164}
1165
1166#[inline(always)]
1167fn lerp(from: f32x8, to: f32x8, t: f32x8) -> f32x8 {
1168    mad(to - from, t, from)
1169}