// tiny-skia — src/pipeline/highp.rs

1// Copyright 2018 Google Inc.
2// Copyright 2020 Yevhenii Reizner
3//
4// Use of this source code is governed by a BSD-style license that can be
5// found in the LICENSE file.
6
7/*!
8A high precision raster pipeline implementation.
9
10Unlike lowp, this one implements all stages.
11
12Just like Skia, this pipeline is implemented using f32x8.
13
14For some reason, we are almost 2x slower. Maybe because Skia uses clang's vector extensions
15and we're using a manual implementation.
16*/
17
18use crate::{PremultipliedColorU8, SpreadMode, PixmapRef};
19
20use crate::geom::ScreenIntRect;
21use crate::pixmap::SubPixmapMut;
22use crate::wide::{f32x8, i32x8, u32x8};
23
24pub const STAGE_WIDTH: usize = 8;
25
26pub type StageFn = fn(p: &mut Pipeline);
27
/// Execution state for one run of the high-precision pipeline.
///
/// Color is processed eight pixels at a time, one `f32` lane per pixel:
/// `r/g/b/a` hold the source color and `dr/dg/db/da` the destination color.
pub struct Pipeline<'a, 'b: 'a> {
    // Index of the next stage function to invoke (see `next_stage`).
    index: usize,
    // The stage chain for the current batch (full or tail variant).
    functions: &'a [StageFn],
    pixmap_src: PixmapRef<'a>,
    pixmap_dst: &'a mut SubPixmapMut<'b>,
    ctx: &'a mut super::Context, // TODO: remove mut
    mask_ctx: super::MaskCtx<'a>,
    aa_mask_ctx: super::AAMaskCtx,
    r: f32x8,
    g: f32x8,
    b: f32x8,
    a: f32x8,
    dr: f32x8,
    dg: f32x8,
    db: f32x8,
    da: f32x8,
    // Number of valid pixels in the current batch:
    // STAGE_WIDTH for full batches, fewer for the row's trailing batch.
    tail: usize,
    // Current pixel position in the destination pixmap.
    dx: usize,
    dy: usize,
}

impl Pipeline<'_, '_> {
    /// Invokes the next stage in `functions`.
    ///
    /// Each stage calls this at the end of its own body, so a batch runs as a
    /// chain of nested calls rather than an explicit loop; a stage that does
    /// NOT call it (e.g. `mask_u8` on zero coverage) aborts the batch.
    #[inline(always)]
    fn next_stage(&mut self) {
        let next: fn(&mut Self) = self.functions[self.index];
        self.index += 1;
        next(self);
    }
}
57
58// Must be in the same order as raster_pipeline::Stage
// Must be in the same order as raster_pipeline::Stage
/// Dispatch table mapping each `raster_pipeline::Stage` discriminant to its
/// high-precision implementation. Indexed by the stage enum's value, so the
/// order here must exactly mirror the enum declaration.
pub const STAGES: &[StageFn; super::STAGES_COUNT] = &[
    move_source_to_destination,
    move_destination_to_source,
    clamp_0,
    clamp_a,
    premultiply,
    uniform_color,
    seed_shader,
    load_dst,
    store,
    load_dst_u8,
    store_u8,
    gather,
    load_mask_u8,
    mask_u8,
    scale_u8,
    lerp_u8,
    scale_1_float,
    lerp_1_float,
    destination_atop,
    destination_in,
    destination_out,
    destination_over,
    source_atop,
    source_in,
    source_out,
    source_over,
    clear,
    modulate,
    multiply,
    plus,
    screen,
    xor,
    color_burn,
    color_dodge,
    darken,
    difference,
    exclusion,
    hard_light,
    lighten,
    overlay,
    soft_light,
    hue,
    saturation,
    color,
    luminosity,
    source_over_rgba,
    transform,
    reflect,
    repeat,
    bilinear,
    bicubic,
    pad_x1,
    reflect_x1,
    repeat_x1,
    gradient,
    evenly_spaced_2_stop_gradient,
    xy_to_unit_angle,
    xy_to_radius,
    xy_to_2pt_conical_focal_on_circle,
    xy_to_2pt_conical_well_behaved,
    xy_to_2pt_conical_smaller,
    xy_to_2pt_conical_greater,
    xy_to_2pt_conical_strip,
    mask_2pt_conical_nan,
    mask_2pt_conical_degenerates,
    apply_vector_mask,
    alter_2pt_conical_compensate_focal,
    alter_2pt_conical_unswap,
    negate_x,
    apply_concentric_scale_bias,
    gamma_expand_2,
    gamma_expand_dst_2,
    gamma_compress_2,
    gamma_expand_22,
    gamma_expand_dst_22,
    gamma_compress_22,
    gamma_expand_srgb,
    gamma_expand_dst_srgb,
    gamma_compress_srgb,
];
140
/// Returns the address of a stage function as an untyped pointer.
///
/// NOTE(review): presumably used by callers to compare/identify stages by
/// pointer identity (fn items themselves don't compare) — confirm at call sites.
pub fn fn_ptr(f: StageFn) -> *const () {
    f as *const ()
}
144
/// Entry point: runs the pipeline over every pixel in `rect`.
///
/// Each row is processed in batches of `STAGE_WIDTH` (8) pixels using the
/// `functions` stage chain; the leftover `end - x` pixels at the end of a row
/// run the `functions_tail` variants, which honor `p.tail`.
#[inline(never)]
pub fn start(
    functions: &[StageFn],
    functions_tail: &[StageFn],
    rect: &ScreenIntRect,
    aa_mask_ctx: super::AAMaskCtx,
    mask_ctx: super::MaskCtx,
    ctx: &mut super::Context,
    pixmap_src: PixmapRef,
    pixmap_dst: &mut SubPixmapMut,
) {
    let mut p = Pipeline {
        index: 0,
        functions: &[],
        pixmap_src,
        pixmap_dst,
        mask_ctx,
        aa_mask_ctx,
        ctx,
        r: f32x8::default(),
        g: f32x8::default(),
        b: f32x8::default(),
        a: f32x8::default(),
        dr: f32x8::default(),
        dg: f32x8::default(),
        db: f32x8::default(),
        da: f32x8::default(),
        tail: 0,
        dx: 0,
        dy: 0,
    };

    for y in rect.y()..rect.bottom() {
        let mut x = rect.x() as usize;
        let end = rect.right() as usize;

        // Full 8-pixel batches for this row.
        p.functions = functions;
        while x + STAGE_WIDTH <= end {
            p.index = 0;
            p.dx = x;
            p.dy = y as usize;
            p.tail = STAGE_WIDTH;
            p.next_stage();
            x += STAGE_WIDTH;
        }

        // Trailing partial batch (1..STAGE_WIDTH pixels).
        if x != end {
            p.index = 0;
            p.functions = functions_tail;
            p.dx = x;
            p.dy = y as usize;
            p.tail = end - x;
            p.next_stage();
        }
    }
}
201
/// Copies the source color registers into the destination registers.
fn move_source_to_destination(p: &mut Pipeline) {
    p.dr = p.r;
    p.dg = p.g;
    p.db = p.b;
    p.da = p.a;

    p.next_stage();
}

/// Converts the source color to premultiplied alpha (alpha itself untouched).
fn premultiply(p: &mut Pipeline) {
    p.r *= p.a;
    p.g *= p.a;
    p.b *= p.a;

    p.next_stage();
}

/// Copies the destination color registers into the source registers.
fn move_destination_to_source(p: &mut Pipeline) {
    p.r = p.dr;
    p.g = p.dg;
    p.b = p.db;
    p.a = p.da;

    p.next_stage();
}

/// Clamps every source channel to be >= 0.
fn clamp_0(p: &mut Pipeline) {
    p.r = p.r.max(f32x8::default());
    p.g = p.g.max(f32x8::default());
    p.b = p.b.max(f32x8::default());
    p.a = p.a.max(f32x8::default());

    p.next_stage();
}

/// Clamps every source channel (not just alpha) to be <= 1.
fn clamp_a(p: &mut Pipeline) {
    p.r = p.r.min(f32x8::splat(1.0));
    p.g = p.g.min(f32x8::splat(1.0));
    p.b = p.b.min(f32x8::splat(1.0));
    p.a = p.a.min(f32x8::splat(1.0));

    p.next_stage();
}
245
/// Fills all eight lanes of the source color with a single solid color
/// taken from the context.
fn uniform_color(p: &mut Pipeline) {
    let ctx = &p.ctx.uniform_color;
    p.r = f32x8::splat(ctx.r);
    p.g = f32x8::splat(ctx.g);
    p.b = f32x8::splat(ctx.b);
    p.a = f32x8::splat(ctx.a);

    p.next_stage();
}

/// Seeds the registers with per-lane pixel-center coordinates for shaders:
/// r = x centers of the 8 lanes, g = y center, b = 1, everything else = 0.
fn seed_shader(p: &mut Pipeline) {
    // +0.5 offsets per lane, so each lane addresses its pixel's center.
    let iota = f32x8::from([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]);

    p.r = f32x8::splat(p.dx as f32) + iota;
    p.g = f32x8::splat(p.dy as f32 + 0.5);
    p.b = f32x8::splat(1.0);
    p.a = f32x8::default();

    p.dr = f32x8::default();
    p.dg = f32x8::default();
    p.db = f32x8::default();
    p.da = f32x8::default();

    p.next_stage();
}
271
/// Loads 8 destination RGBA8888 pixels into `dr/dg/db/da`.
pub fn load_dst(p: &mut Pipeline) {
    load_8888(p.pixmap_dst.slice4_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}

/// Tail variant of `load_dst`: loads only `tail` pixels.
pub fn load_dst_tail(p: &mut Pipeline) {
    load_8888_tail(p.tail, p.pixmap_dst.slice_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}

/// Stores the source color as 8 RGBA8888 pixels into the destination.
pub fn store(p: &mut Pipeline) {
    store_8888(&p.r, &p.g, &p.b, &p.a, p.pixmap_dst.slice4_at_xy(p.dx, p.dy));
    p.next_stage();
}

/// Tail variant of `store`: writes only `tail` pixels.
pub fn store_tail(p: &mut Pipeline) {
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, p.pixmap_dst.slice_at_xy(p.dx, p.dy));
    p.next_stage();
}
291
// Currently, all mask/A8 pixmaps are handled by lowp.
// These stages never execute in the highp pipeline; they exist only so the
// stage tables keep the same layout as lowp's.
pub fn load_dst_u8(_: &mut Pipeline) {
    // unreachable
}

pub fn load_dst_u8_tail(_: &mut Pipeline) {
    // unreachable
}

pub fn store_u8(_: &mut Pipeline) {
    // unreachable
}

pub fn store_u8_tail(_: &mut Pipeline) {
    // unreachable
}
308
/// Samples the source pixmap at the per-lane coordinates held in `r` (x) and
/// `g` (y), unpacking the fetched pixels into `r/g/b/a`.
pub fn gather(p: &mut Pipeline) {
    let ix = gather_ix(p.pixmap_src, p.r, p.g);
    load_8888(&p.pixmap_src.gather(ix), &mut p.r, &mut p.g, &mut p.b, &mut p.a);

    p.next_stage();
}

/// Converts per-lane (x, y) into clamped linear pixel indices into `pixmap`.
#[inline(always)]
fn gather_ix(pixmap: PixmapRef, mut x: f32x8, mut y: f32x8) -> u32x8 {
    // Exclusive -> inclusive.
    // Clamping to [0, dim - 1 ulp] guarantees truncation below yields an
    // in-bounds index even for coordinates exactly at the right/bottom edge.
    let w = ulp_sub(pixmap.width() as f32);
    let h = ulp_sub(pixmap.height() as f32);
    x = x.max(f32x8::default()).min(f32x8::splat(w));
    y = y.max(f32x8::default()).min(f32x8::splat(h));

    (y.trunc_int() * i32x8::splat(pixmap.width() as i32) + x.trunc_int()).to_u32x8_bitcast()
}
326
/// Returns the largest f32 strictly below `v` (one ULP down).
///
/// Used by `gather_ix` to turn an exclusive upper bound (pixmap width/height)
/// into an inclusive clamp limit.
///
/// Like the original bit-twiddling version, this is only meaningful for
/// finite, positive `v`; pixmap dimensions are always >= 1, so that holds.
#[inline(always)]
fn ulp_sub(v: f32) -> f32 {
    // Somewhat similar to v - f32::EPSILON, but exact at any magnitude:
    // stepping the bit pattern down by one yields the previous representable
    // float. `to_bits`/`from_bits` are the std equivalents of the
    // `bytemuck::cast` pair used previously and compile to the same code.
    f32::from_bits(v.to_bits() - 1)
}
332
// Clip-mask stages. Loading is handled by lowp; only application lives here.
fn load_mask_u8(_: &mut Pipeline) {
    // unreachable
}

/// Multiplies the source color by the u8 clip-mask coverage at (dx, dy).
///
/// If coverage is zero for the whole batch, the stage returns WITHOUT calling
/// `next_stage()`, deliberately aborting the rest of the pipeline for this
/// batch — nothing would be drawn anyway.
fn mask_u8(p: &mut Pipeline) {
    let offset = p.mask_ctx.offset(p.dx, p.dy);
    let mut c = [0.0; 8];
    for i in 0..p.tail {
        c[i] = p.mask_ctx.data[offset + i] as f32;
    }
    // Normalize u8 coverage into [0, 1].
    let c = f32x8::from(c) / f32x8::splat(255.0);

    if c == f32x8::default() {
        return;
    }

    p.r *= c;
    p.g *= c;
    p.b *= c;
    p.a *= c;

    p.next_stage();
}
356
/// Scales the source color by anti-aliasing coverage loaded from the AA mask.
///
/// NOTE(review): only lanes 0 and 1 are populated — `aa_mask_ctx` appears to
/// supply at most two coverage bytes per call; confirm against AAMaskCtx.
fn scale_u8(p: &mut Pipeline) {
    // Load u8xTail and cast it to f32x8.
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = f32x8::from([data[0] as f32, data[1] as f32, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
    let c = c / f32x8::splat(255.0);

    p.r *= c;
    p.g *= c;
    p.b *= c;
    p.a *= c;

    p.next_stage();
}

/// Lerps between destination and source by AA coverage (see `scale_u8` for
/// the two-lane loading pattern).
fn lerp_u8(p: &mut Pipeline) {
    // Load u8xTail and cast it to f32x8.
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = f32x8::from([data[0] as f32, data[1] as f32, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
    let c = c / f32x8::splat(255.0);

    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}
384
/// Scales the source color by a single uniform coverage value.
fn scale_1_float(p: &mut Pipeline) {
    let c = f32x8::splat(p.ctx.current_coverage);
    p.r *= c;
    p.g *= c;
    p.b *= c;
    p.a *= c;

    p.next_stage();
}

/// Lerps between destination and source by a single uniform coverage value.
fn lerp_1_float(p: &mut Pipeline) {
    let c = f32x8::splat(p.ctx.current_coverage);
    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}
404
/// Generates a separable (Porter-Duff style) blend stage.
///
/// `$f` receives `(src_channel, dst_channel, src_alpha, dst_alpha)` and is
/// applied uniformly to r, g, b, and to alpha itself.
macro_rules! blend_fn {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = $f(p.a, p.da, p.a, p.da);

            p.next_stage();
        }
    };
}

blend_fn!(clear,            |_, _,  _,  _| f32x8::default());
blend_fn!(source_atop,      |s, d, sa, da| s * da + d * inv(sa));
blend_fn!(destination_atop, |s, d, sa, da| d * sa + s * inv(da));
blend_fn!(source_in,        |s, _,  _, da| s * da);
blend_fn!(destination_in,   |_, d, sa,  _| d * sa);
blend_fn!(source_out,       |s, _,  _, da| s * inv(da));
blend_fn!(destination_out,  |_, d, sa,  _| d * inv(sa));
blend_fn!(source_over,      |s, d, sa,  _| mad(d, inv(sa), s));
blend_fn!(destination_over, |s, d,  _, da| mad(s, inv(da), d));
blend_fn!(modulate,         |s, d,  _,  _| s * d);
blend_fn!(multiply,         |s, d, sa, da| s * inv(da) + d * inv(sa) + s * d);
blend_fn!(screen,           |s, d,  _,  _| s + d - s * d);
blend_fn!(xor,              |s, d, sa, da| s * inv(da) + d * inv(sa));

// Wants a type for some reason.
blend_fn!(plus, |s: f32x8, d: f32x8, _, _| (s + d).min(f32x8::splat(1.0)));
434
/// Generates a blend stage where `$f` is applied to the color channels only;
/// alpha always uses source-over (`da*(1-a) + a`).
macro_rules! blend_fn2 {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            // The same logic applied to color, and source_over for alpha.
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = mad(p.da, inv(p.a), p.a);

            p.next_stage();
        }
    };
}

blend_fn2!(darken,      |s: f32x8, d, sa, da: f32x8| s + d - (s * da).max(d * sa));
blend_fn2!(lighten,     |s: f32x8, d, sa, da: f32x8| s + d - (s * da).min(d * sa));
blend_fn2!(difference,  |s: f32x8, d, sa, da: f32x8| s + d - two((s * da).min(d * sa)));
blend_fn2!(exclusion,   |s: f32x8, d,  _,  _| s + d - two(s * d));

// Premultiplied color-burn; branches: d == da, s == 0, else the general case.
blend_fn2!(color_burn, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
    d.cmp_eq(da).blend(
        d + s * inv(da),
        s.cmp_eq(f32x8::default()).blend(
            d * inv(sa),
            sa * (da - da.min((da - d) * sa * s.recip_fast())) + s * inv(da) + d * inv(sa)
        )
    )
);

// Premultiplied color-dodge; branches: d == 0, s == sa, else the general case.
blend_fn2!(color_dodge, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
    d.cmp_eq(f32x8::default()).blend(
        s * inv(da),
        s.cmp_eq(sa).blend(
            s + d * inv(sa),
            sa * da.min((d * sa) * (sa - s).recip_fast()) + s * inv(da) + d * inv(sa)
        )
    )
);

// hard-light: multiply for dark source, screen for light source.
blend_fn2!(hard_light, |s: f32x8, d: f32x8, sa, da|
    s * inv(da) + d * inv(sa) + two(s).cmp_le(sa).blend(
        two(s * d),
        sa * da - two((da - d) * (sa - s))
    )
);

// overlay is hard-light with source and destination roles swapped in the test.
blend_fn2!(overlay, |s: f32x8, d: f32x8, sa, da|
    s * inv(da) + d * inv(sa) + two(d).cmp_le(da).blend(
        two(s * d),
        sa * da - two((da - d) * (sa - s))
    )
);

blend_fn2!(soft_light, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| {
    // m is the unpremultiplied destination channel (0 when da == 0).
    let m  = da.cmp_gt(f32x8::default()).blend(d / da, f32x8::default());
    let s2 = two(s);
    let m4 = two(two(m));

    // The logic forks three ways:
    //    1. dark src?
    //    2. light src, dark dst?
    //    3. light src, light dst?
    let dark_src = d * (sa + (s2 - sa) * (f32x8::splat(1.0) - m));
    let dark_dst = (m4 * m4 + m4) * (m - f32x8::splat(1.0)) + f32x8::splat(7.0) * m;
    let lite_dst = m.sqrt() - m;
    let lite_src = d * sa + da * (s2 - sa)
        * two(two(d)).cmp_le(da).blend(dark_dst, lite_dst); // 2 or 3?

    s * inv(da) + d * inv(sa) + s2.cmp_le(sa).blend(dark_src, lite_src) // 1 or (2 or 3)?
});
505
506// We're basing our implementation of non-separable blend modes on
507//   https://www.w3.org/TR/compositing-1/#blendingnonseparable.
508// and
509//   https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
510// They're equivalent, but ES' math has been better simplified.
511//
512// Anything extra we add beyond that is to make the math work with premul inputs.
513
/// Generates a non-separable blend stage: `$f` computes all four output
/// channels at once from the full source and destination colors.
macro_rules! blend_fn3 {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            let (tr, tg, tb, ta) = $f(p.r, p.g, p.b, p.a, p.dr, p.dg, p.db, p.da);
            p.r = tr;
            p.g = tg;
            p.b = tb;
            p.a = ta;

            p.next_stage();
        }
    };
}
527
blend_fn3!(hue, hue_k);

/// Hue blend: takes the hue of the source, saturation and luminosity of the
/// destination. The trailing `* inv(..)` terms adapt the math to premul inputs.
#[inline(always)]
fn hue_k(
    r: f32x8, g: f32x8, b: f32x8, a: f32x8,
    dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
) -> (f32x8, f32x8, f32x8, f32x8) {
    let rr = &mut (r * a);
    let gg = &mut (g * a);
    let bb = &mut (b * a);

    set_sat(rr, gg, bb, sat(dr, dg, db) * a);
    set_lum(rr, gg, bb, lum(dr, dg, db) * a);
    clip_color(rr, gg, bb, a * da);

    let r = r * inv(da) + dr * inv(a) + *rr;
    let g = g * inv(da) + dg * inv(a) + *gg;
    let b = b * inv(da) + db * inv(a) + *bb;
    let a = a + da - a * da;

    (r, g, b, a)
}

blend_fn3!(saturation, saturation_k);

/// Saturation blend: saturation of the source, hue and luminosity of the
/// destination.
#[inline(always)]
fn saturation_k(
    r: f32x8, g: f32x8, b: f32x8, a: f32x8,
    dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
) -> (f32x8, f32x8, f32x8, f32x8) {
    let rr = &mut (dr * a);
    let gg = &mut (dg * a);
    let bb = &mut (db * a);

    set_sat(rr, gg, bb, sat(r, g, b) * da);
    set_lum(rr, gg, bb, lum(dr, dg, db) * a); // (This is not redundant.)
    clip_color(rr, gg, bb, a * da);

    let r = r * inv(da) + dr * inv(a) + *rr;
    let g = g * inv(da) + dg * inv(a) + *gg;
    let b = b * inv(da) + db * inv(a) + *bb;
    let a = a + da - a * da;

    (r, g, b, a)
}

blend_fn3!(color, color_k);

/// Color blend: hue and saturation of the source, luminosity of the
/// destination.
#[inline(always)]
fn color_k(
    r: f32x8, g: f32x8, b: f32x8, a: f32x8,
    dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
) -> (f32x8, f32x8, f32x8, f32x8) {
    let rr = &mut (r * da);
    let gg = &mut (g * da);
    let bb = &mut (b * da);

    set_lum(rr, gg, bb, lum(dr, dg, db) * a);
    clip_color(rr, gg, bb, a * da);

    let r = r * inv(da) + dr * inv(a) + *rr;
    let g = g * inv(da) + dg * inv(a) + *gg;
    let b = b * inv(da) + db * inv(a) + *bb;
    let a = a + da - a * da;

    (r, g, b, a)
}

blend_fn3!(luminosity, luminosity_k);

/// Luminosity blend: luminosity of the source, hue and saturation of the
/// destination.
#[inline(always)]
fn luminosity_k(
    r: f32x8, g: f32x8, b: f32x8, a: f32x8,
    dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
) -> (f32x8, f32x8, f32x8, f32x8) {
    let rr = &mut (dr * a);
    let gg = &mut (dg * a);
    let bb = &mut (db * a);

    set_lum(rr, gg, bb, lum(r, g, b) * da);
    clip_color(rr, gg, bb, a * da);

    let r = r * inv(da) + dr * inv(a) + *rr;
    let g = g * inv(da) + dg * inv(a) + *gg;
    let b = b * inv(da) + db * inv(a) + *bb;
    let a = a + da - a * da;

    (r, g, b, a)
}
617
/// Saturation of a color: max channel minus min channel.
#[inline(always)]
fn sat(r: f32x8, g: f32x8, b: f32x8) -> f32x8 {
    r.max(g.max(b)) - r.min(g.min(b))
}

/// Luminosity of a color (Rec. 601-style weights, as in the W3C blending spec).
#[inline(always)]
fn lum(r: f32x8, g: f32x8, b: f32x8) -> f32x8 {
    r * f32x8::splat(0.30) + g * f32x8::splat(0.59) + b * f32x8::splat(0.11)
}

/// Rescales the color in place so its saturation becomes `s`.
#[inline(always)]
fn set_sat(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, s: f32x8) {
    let mn  = r.min(g.min(*b));
    let mx  = r.max(g.max(*b));
    let sat = mx - mn;

    // Map min channel to 0, max channel to s, and scale the middle proportionally.
    let scale = |c| sat.cmp_eq(f32x8::default())
                       .blend(f32x8::default(), (c - mn) * s / sat);

    *r = scale(*r);
    *g = scale(*g);
    *b = scale(*b);
}

/// Shifts the color in place so its luminosity becomes `l`.
#[inline(always)]
fn set_lum(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, l: f32x8) {
    let diff = l - lum(*r, *g, *b);
    *r += diff;
    *g += diff;
    *b += diff;
}

/// Clamps an out-of-gamut color back into [0, a] while preserving luminosity.
#[inline(always)]
fn clip_color(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: f32x8) {
    let mn = r.min(g.min(*b));
    let mx = r.max(g.max(*b));
    let l  = lum(*r, *g, *b);

    let clip = |mut c| {
        // Pull channels up toward the luminosity when the color dips negative,
        // and down when it exceeds alpha.
        c = mx.cmp_ge(f32x8::default()).blend(c, l + (c - l) * l / (l - mn));
        c = mx.cmp_gt(a).blend(l + (c - l) * (a - l) / (mx - l), c);
        c = c.max(f32x8::default()); // Sometimes without this we may dip just a little negative.
        c
    };

    *r = clip(*r);
    *g = clip(*g);
    *b = clip(*b);
}
668
/// Fused load + source-over + store for RGBA8888 destinations.
/// Equivalent to `load_dst; source_over; store` but with one pixel-slice pass.
pub fn source_over_rgba(p: &mut Pipeline) {
    let pixels = p.pixmap_dst.slice4_at_xy(p.dx, p.dy);
    load_8888(pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.r = mad(p.dr, inv(p.a), p.r);
    p.g = mad(p.dg, inv(p.a), p.g);
    p.b = mad(p.db, inv(p.a), p.b);
    p.a = mad(p.da, inv(p.a), p.a);
    store_8888(&p.r, &p.g, &p.b, &p.a, pixels);

    p.next_stage();
}

/// Tail variant of `source_over_rgba`: handles only `tail` pixels.
pub fn source_over_rgba_tail(p: &mut Pipeline) {
    let pixels = p.pixmap_dst.slice_at_xy(p.dx, p.dy);
    load_8888_tail(p.tail, pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.r = mad(p.dr, inv(p.a), p.r);
    p.g = mad(p.dg, inv(p.a), p.g);
    p.b = mad(p.db, inv(p.a), p.b);
    p.a = mad(p.da, inv(p.a), p.a);
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, pixels);

    p.next_stage();
}
692
/// Applies the context's 2D affine transform to the (x, y) coordinates
/// carried in `r`/`g`.
fn transform(p: &mut Pipeline) {
    let ts = &p.ctx.transform;

    // [x'] = [sx kx tx] [x]
    // [y']   [ky sy ty] [y]
    let tr = mad(p.r, f32x8::splat(ts.sx), mad(p.g, f32x8::splat(ts.kx), f32x8::splat(ts.tx)));
    let tg = mad(p.r, f32x8::splat(ts.ky), mad(p.g, f32x8::splat(ts.sy), f32x8::splat(ts.ty)));
    p.r = tr;
    p.g = tg;

    p.next_stage();
}
703
// Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images).
// The gather stages will hard clamp the output of these stages to [0,limit)...
// we just need to do the basic repeat or mirroring.

/// Mirror-tiles both coordinates into [0, limit).
fn reflect(p: &mut Pipeline) {
    let ctx = &p.ctx.limit_x;
    p.r = exclusive_reflect(p.r, ctx.scale, ctx.inv_scale);

    let ctx = &p.ctx.limit_y;
    p.g = exclusive_reflect(p.g, ctx.scale, ctx.inv_scale);

    p.next_stage();
}

/// Mirrored tiling: maps `v` into [0, limit) with alternating direction,
/// using a precomputed reciprocal to avoid a per-lane division.
#[inline(always)]
fn exclusive_reflect(v: f32x8, limit: f32, inv_limit: f32) -> f32x8 {
    let limit = f32x8::splat(limit);
    let inv_limit = f32x8::splat(inv_limit);
    ((v - limit) - (limit + limit)
        * ((v - limit) * (inv_limit * f32x8::splat(0.5))).floor() - limit).abs()
}

/// Repeat-tiles both coordinates into [0, limit).
fn repeat(p: &mut Pipeline) {
    let ctx = &p.ctx.limit_x;
    p.r = exclusive_repeat(p.r, ctx.scale, ctx.inv_scale);

    let ctx = &p.ctx.limit_y;
    p.g = exclusive_repeat(p.g, ctx.scale, ctx.inv_scale);

    p.next_stage();
}

/// Wrapping tiling: v mod limit, via a precomputed reciprocal.
#[inline(always)]
fn exclusive_repeat(v: f32x8, limit: f32, inv_limit: f32) -> f32x8 {
    v - (v * f32x8::splat(inv_limit)).floor() * f32x8::splat(limit)
}
740
/// Bilinear filtering: samples a 2x2 neighborhood around each coordinate with
/// linear weights derived from the fractional offset from the pixel center.
fn bilinear(p: &mut Pipeline) {
    let x = p.r;
    let fx = (x + f32x8::splat(0.5)).fract();
    let y = p.g;
    let fy = (y + f32x8::splat(0.5)).fract();
    let one = f32x8::splat(1.0);
    let wx = [one - fx, fx];
    let wy = [one - fy, fy];

    sampler_2x2(p.pixmap_src, &p.ctx.sampler, x, y, &wx, &wy, &mut p.r, &mut p.g, &mut p.b, &mut p.a);

    p.next_stage();
}

/// Bicubic filtering: samples a 4x4 neighborhood with cubic weights
/// (see `bicubic_near`/`bicubic_far` below).
fn bicubic(p: &mut Pipeline) {
    let x = p.r;
    let fx = (x + f32x8::splat(0.5)).fract();
    let y = p.g;
    let fy = (y + f32x8::splat(0.5)).fract();
    let one = f32x8::splat(1.0);
    let wx = [bicubic_far(one - fx), bicubic_near(one - fx), bicubic_near(fx), bicubic_far(fx)];
    let wy = [bicubic_far(one - fy), bicubic_near(one - fy), bicubic_near(fy), bicubic_far(fy)];

    sampler_4x4(p.pixmap_src, &p.ctx.sampler, x, y, &wx, &wy, &mut p.r, &mut p.g, &mut p.b, &mut p.a);

    p.next_stage();
}
768
// In bicubic interpolation, the 16 pixels and +/- 0.5 and +/- 1.5 offsets from the sample
// pixel center are combined with a non-uniform cubic filter, with higher values near the center.
//
// We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets.

/// Cubic filter weight for the two taps nearest the sample point.
#[inline(always)]
fn bicubic_near(t: f32x8) -> f32x8 {
    // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18
    mad(
        t,
        mad(t,
            mad(
                f32x8::splat(-21.0/18.0),
                t,
                f32x8::splat(27.0/18.0),
            ),
            f32x8::splat(9.0/18.0),
        ),
        f32x8::splat(1.0/18.0),
    )
}

/// Cubic filter weight for the two outer taps.
#[inline(always)]
fn bicubic_far(t: f32x8) -> f32x8 {
    // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18)
    (t * t) * mad(f32x8::splat(7.0/18.0), t, f32x8::splat(-6.0/18.0))
}
796
/// Accumulates a weighted 2x2 neighborhood around (cx, cy) into r/g/b/a.
/// `wx[i] * wy[j]` is the weight for the sample at offset (i, j) from the
/// top-left corner of the neighborhood.
#[inline(always)]
fn sampler_2x2(
    pixmap: PixmapRef,
    ctx: &super::SamplerCtx,
    cx: f32x8, cy: f32x8,
    wx: &[f32x8; 2], wy: &[f32x8; 2],
    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
) {
    *r = f32x8::default();
    *g = f32x8::default();
    *b = f32x8::default();
    *a = f32x8::default();

    let one = f32x8::splat(1.0);
    // Start half a pixel up/left of the center; step one pixel per tap.
    let start = -0.5;
    let mut y = cy + f32x8::splat(start);
    for j in 0..2 {
        let mut x = cx + f32x8::splat(start);
        for i in 0..2 {
            let mut rr = f32x8::default();
            let mut gg = f32x8::default();
            let mut bb = f32x8::default();
            let mut aa = f32x8::default();
            sample(pixmap, ctx, x,y, &mut rr, &mut gg, &mut bb, &mut aa);

            let w = wx[i] * wy[j];
            *r = mad(w, rr, *r);
            *g = mad(w, gg, *g);
            *b = mad(w, bb, *b);
            *a = mad(w, aa, *a);

            x += one;
        }

        y += one;
    }
}

/// Accumulates a weighted 4x4 neighborhood around (cx, cy) into r/g/b/a.
/// Same structure as `sampler_2x2`, but starting 1.5 pixels out.
#[inline(always)]
fn sampler_4x4(
    pixmap: PixmapRef,
    ctx: &super::SamplerCtx,
    cx: f32x8, cy: f32x8,
    wx: &[f32x8; 4], wy: &[f32x8; 4],
    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
) {
    *r = f32x8::default();
    *g = f32x8::default();
    *b = f32x8::default();
    *a = f32x8::default();

    let one = f32x8::splat(1.0);
    let start = -1.5;
    let mut y = cy + f32x8::splat(start);
    for j in 0..4 {
        let mut x = cx + f32x8::splat(start);
        for i in 0..4 {
            let mut rr = f32x8::default();
            let mut gg = f32x8::default();
            let mut bb = f32x8::default();
            let mut aa = f32x8::default();
            sample(pixmap, ctx, x,y, &mut rr, &mut gg, &mut bb, &mut aa);

            let w = wx[i] * wy[j];
            *r = mad(w, rr, *r);
            *g = mad(w, gg, *g);
            *b = mad(w, bb, *b);
            *a = mad(w, aa, *a);

            x += one;
        }

        y += one;
    }
}
872
/// Fetches one texel per lane at (x, y), applying the sampler's spread mode
/// before the gather clamps indices in-bounds.
#[inline(always)]
fn sample(
    pixmap: PixmapRef, ctx: &super::SamplerCtx, mut x: f32x8, mut y: f32x8,
    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
) {
    x = tile(x, ctx.spread_mode, pixmap.width() as f32, ctx.inv_width);
    y = tile(y, ctx.spread_mode, pixmap.height() as f32, ctx.inv_height);

    let ix = gather_ix(pixmap, x, y);
    load_8888(&pixmap.gather(ix), r, g, b, a);
}

/// Maps a coordinate into [0, limit) according to the spread mode.
/// Pad relies on `gather_ix`'s hard clamp.
#[inline(always)]
fn tile(v: f32x8, mode: SpreadMode, limit: f32, inv_limit: f32) -> f32x8 {
    match mode {
        SpreadMode::Pad => v,
        SpreadMode::Repeat => exclusive_repeat(v, limit, inv_limit),
        SpreadMode::Reflect => exclusive_reflect(v, limit, inv_limit),
    }
}
893
// Gradient t-value tiling in unit space. `normalize` constrains t to the unit
// range (see f32x8::normalize in crate::wide).

/// Pad spread for gradients: just normalizes t.
fn pad_x1(p: &mut Pipeline) {
    p.r = p.r.normalize();

    p.next_stage();
}

/// Reflect spread for gradients: mirrors t around 1 with period 2, then
/// normalizes.
fn reflect_x1(p: &mut Pipeline) {
    p.r = (
        (p.r - f32x8::splat(1.0))
            - two(((p.r - f32x8::splat(1.0)) * f32x8::splat(0.5)).floor())
            - f32x8::splat(1.0)
    ).abs().normalize();

    p.next_stage();
}

/// Repeat spread for gradients: takes the fractional part of t, then
/// normalizes.
fn repeat_x1(p: &mut Pipeline) {
    p.r = (p.r - p.r.floor()).normalize();

    p.next_stage();
}
915
916fn gradient(p: &mut Pipeline) {
917    let ctx = &p.ctx.gradient;
918
919    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
920    let t: [f32; 8] = p.r.into();
921    let mut idx = u32x8::default();
922    for i in 1..ctx.len {
923        let tt = ctx.t_values[i].get();
924        let n: u32x8 = bytemuck::cast([
925            (t[0] >= tt) as u32,
926            (t[1] >= tt) as u32,
927            (t[2] >= tt) as u32,
928            (t[3] >= tt) as u32,
929            (t[4] >= tt) as u32,
930            (t[5] >= tt) as u32,
931            (t[6] >= tt) as u32,
932            (t[7] >= tt) as u32,
933        ]);
934        idx = idx + n;
935    }
936    gradient_lookup(ctx, &idx, p.r, &mut p.r, &mut p.g, &mut p.b, &mut p.a);
937
938    p.next_stage();
939}
940
/// Resolves per-lane gradient colors as `color = t * factor[idx] + bias[idx]`,
/// gathering factors and biases by each lane's stop-interval index.
fn gradient_lookup(
    ctx: &super::GradientCtx, idx: &u32x8, t: f32x8,
    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
) {
    let idx: [u32; 8] = bytemuck::cast(*idx);

    macro_rules! gather {
        ($d:expr, $c:ident) => {
            // Surprisingly, but bound checking doesn't affect the performance.
            // And since `idx` can contain any number, we should leave it in place.
            f32x8::from([
                $d[idx[0] as usize].$c,
                $d[idx[1] as usize].$c,
                $d[idx[2] as usize].$c,
                $d[idx[3] as usize].$c,
                $d[idx[4] as usize].$c,
                $d[idx[5] as usize].$c,
                $d[idx[6] as usize].$c,
                $d[idx[7] as usize].$c,
            ])
        };
    }

    let fr = gather!(&ctx.factors, r);
    let fg = gather!(&ctx.factors, g);
    let fb = gather!(&ctx.factors, b);
    let fa = gather!(&ctx.factors, a);

    let br = gather!(&ctx.biases, r);
    let bg = gather!(&ctx.biases, g);
    let bb = gather!(&ctx.biases, b);
    let ba = gather!(&ctx.biases, a);

    *r = mad(t, fr, br);
    *g = mad(t, fg, bg);
    *b = mad(t, fb, bb);
    *a = mad(t, fa, ba);
}
979
/// Fast path for a two-stop gradient: a single linear interpolation
/// `color = t * factor + bias` per channel, no stop search needed.
fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) {
    let ctx = &p.ctx.evenly_spaced_2_stop_gradient;

    let t = p.r;
    p.r = mad(t, f32x8::splat(ctx.factor.r), f32x8::splat(ctx.bias.r));
    p.g = mad(t, f32x8::splat(ctx.factor.g), f32x8::splat(ctx.bias.g));
    p.b = mad(t, f32x8::splat(ctx.factor.b), f32x8::splat(ctx.bias.b));
    p.a = mad(t, f32x8::splat(ctx.factor.a), f32x8::splat(ctx.bias.a));

    p.next_stage();
}
991
/// Sweep-gradient helper: converts (x, y) in `r`/`g` to an angle in [0, 1)
/// turns (full circle == 1.0), written back into `r`.
fn xy_to_unit_angle(p: &mut Pipeline) {
    let x = p.r;
    let y = p.g;
    let x_abs = x.abs();
    let y_abs = y.abs();
    // First-octant slope in [0, 1]; the quadrant fixups below restore the rest.
    let slope = x_abs.min(y_abs) / x_abs.max(y_abs);
    let s = slope * slope;
    // Use a 7th degree polynomial to approximate atan.
    // This was generated using sollya.gforge.inria.fr.
    // A float optimized polynomial was generated using the following command.
    // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
    let phi = slope
        * (f32x8::splat(0.15912117063999176025390625)
           + s * (f32x8::splat(-5.185396969318389892578125e-2)
                  + s * (f32x8::splat(2.476101927459239959716796875e-2)
                         + s * (f32x8::splat(-7.0547382347285747528076171875e-3)))));
    // Octant/quadrant corrections, then NaN (from 0/0 at the origin) -> 0.
    let phi = x_abs.cmp_lt(y_abs).blend(f32x8::splat(0.25) - phi, phi);
    let phi = x
        .cmp_lt(f32x8::splat(0.0))
        .blend(f32x8::splat(0.5) - phi, phi);
    let phi = y
        .cmp_lt(f32x8::splat(0.0))
        .blend(f32x8::splat(1.0) - phi, phi);
    let phi = phi.cmp_ne(phi).blend(f32x8::splat(0.0), phi);
    p.r = phi;
    p.next_stage();
}

/// Radial-gradient helper: replaces `r` with the distance sqrt(x^2 + y^2).
fn xy_to_radius(p: &mut Pipeline) {
    let x2 = p.r * p.r;
    let y2 = p.g * p.g;
    p.r = (x2 + y2).sqrt();

    p.next_stage();
}
1027
// Two-point conical gradient t computations. Which variant runs is chosen when
// the pipeline is built, based on the circles' geometry; `ctx.p0`/`ctx.p1` are
// precomputed parameters of that geometry.

/// Focal point lies on the end circle.
fn xy_to_2pt_conical_focal_on_circle(p: &mut Pipeline) {
    let x = p.r;
    let y = p.g;
    p.r = x + y * y / x;

    p.next_stage();
}

/// "Well behaved" case: focal point strictly inside the end circle.
fn xy_to_2pt_conical_well_behaved(p: &mut Pipeline) {
    let ctx = &p.ctx.two_point_conical_gradient;

    let x = p.r;
    let y = p.g;
    p.r = (x * x + y * y).sqrt() - x * f32x8::splat(ctx.p0);

    p.next_stage();
}

/// Focal point outside the end circle, greater-root branch.
fn xy_to_2pt_conical_greater(p: &mut Pipeline) {
    let ctx = &p.ctx.two_point_conical_gradient;

    let x = p.r;
    let y = p.g;
    p.r = (x * x - y * y).sqrt() - x * f32x8::splat(ctx.p0);

    p.next_stage();
}

/// Focal point outside the end circle, smaller-root branch.
fn xy_to_2pt_conical_smaller(p: &mut Pipeline) {
    let ctx = &p.ctx.two_point_conical_gradient;

    let x = p.r;
    let y = p.g;
    p.r = -(x * x - y * y).sqrt() - x * f32x8::splat(ctx.p0);

    p.next_stage();
}

/// Degenerate "strip" case: the two circles have equal radii.
fn xy_to_2pt_conical_strip(p: &mut Pipeline) {
    let ctx = &p.ctx.two_point_conical_gradient;

    let x = p.r;
    let y = p.g;
    p.r = x + (f32x8::splat(ctx.p0) - y * y).sqrt();

    p.next_stage();
}
1075
/// Zeroes NaN t-values (degenerate conical samples) and records a per-lane
/// validity mask in the context for `apply_vector_mask` to consume later.
fn mask_2pt_conical_nan(p: &mut Pipeline) {
    let ctx = &mut p.ctx.two_point_conical_gradient;

    let t = p.r;
    // `t != t` is the standard NaN test.
    let is_degenerate = t.cmp_ne(t);
    p.r = is_degenerate.blend(f32x8::default(), t);
    ctx.mask = cond_to_mask(!is_degenerate.to_u32x8_bitcast());

    p.next_stage();
}

/// Like `mask_2pt_conical_nan`, but also treats t <= 0 as degenerate.
fn mask_2pt_conical_degenerates(p: &mut Pipeline) {
    let ctx = &mut p.ctx.two_point_conical_gradient;

    let t = p.r;
    let is_degenerate = t.cmp_le(f32x8::default()) | t.cmp_ne(t);
    p.r = is_degenerate.blend(f32x8::default(), t);
    ctx.mask = cond_to_mask(!is_degenerate.to_u32x8_bitcast());

    p.next_stage();
}

/// Zeroes the color of lanes flagged degenerate by the mask stages above,
/// via a bitwise AND on the raw f32 bit patterns.
fn apply_vector_mask(p: &mut Pipeline) {
    let ctx = &p.ctx.two_point_conical_gradient;

    p.r = (p.r.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
    p.g = (p.g.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
    p.b = (p.b.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
    p.a = (p.a.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();

    p.next_stage();
}
1108
1109fn alter_2pt_conical_compensate_focal(p: &mut Pipeline) {
1110    let ctx = &p.ctx.two_point_conical_gradient;
1111
1112    p.r = p.r + f32x8::splat(ctx.p1);
1113
1114    p.next_stage();
1115}
1116
1117fn alter_2pt_conical_unswap(p: &mut Pipeline) {
1118    p.r = f32x8::splat(1.0) - p.r;
1119
1120    p.next_stage();
1121}
1122
1123fn negate_x(p: &mut Pipeline) {
1124    p.r = -p.r;
1125
1126    p.next_stage();
1127}
1128
1129fn apply_concentric_scale_bias(p: &mut Pipeline) {
1130    let ctx = &p.ctx.two_point_conical_gradient;
1131
1132    // Apply t = t * scale + bias for concentric gradients
1133    let x = p.r;
1134    p.r = x * f32x8::splat(ctx.p0) + f32x8::splat(ctx.p1);
1135
1136    p.next_stage();
1137}
1138
1139fn gamma_expand_2(p: &mut Pipeline) {
1140    p.r = p.r * p.r;
1141    p.g = p.g * p.g;
1142    p.b = p.b * p.b;
1143
1144    p.next_stage();
1145}
1146
1147fn gamma_expand_dst_2(p: &mut Pipeline) {
1148    p.dr = p.dr * p.dr;
1149    p.dg = p.dg * p.dg;
1150    p.db = p.db * p.db;
1151
1152    p.next_stage();
1153}
1154
1155fn gamma_compress_2(p: &mut Pipeline) {
1156    p.r = p.r.sqrt();
1157    p.g = p.g.sqrt();
1158    p.b = p.b.sqrt();
1159
1160    p.next_stage();
1161}
1162
1163fn gamma_expand_22(p: &mut Pipeline) {
1164    p.r = p.r.powf(2.2);
1165    p.g = p.g.powf(2.2);
1166    p.b = p.b.powf(2.2);
1167
1168    p.next_stage();
1169}
1170
1171fn gamma_expand_dst_22(p: &mut Pipeline) {
1172    p.dr = p.dr.powf(2.2);
1173    p.dg = p.dg.powf(2.2);
1174    p.db = p.db.powf(2.2);
1175
1176    p.next_stage();
1177}
1178
1179fn gamma_compress_22(p: &mut Pipeline) {
1180    p.r = p.r.powf(0.45454545);
1181    p.g = p.g.powf(0.45454545);
1182    p.b = p.b.powf(0.45454545);
1183
1184    p.next_stage();
1185}
1186
1187fn srgb_expand(x: f32x8) -> f32x8 {
1188    let small = x.cmp_le(f32x8::splat(0.04045));
1189    let linear = x / f32x8::splat(12.92);
1190    let exp = ((x + f32x8::splat(0.055)) / f32x8::splat(1.055)).powf(2.4);
1191    small.blend(linear, exp)
1192}
1193
1194fn srgb_compress(x: f32x8) -> f32x8 {
1195    let small = x.cmp_le(f32x8::splat(0.0031308));
1196    let linear = x * f32x8::splat(12.92);
1197    let exp = x.powf(0.416666666) * f32x8::splat(1.055) - f32x8::splat(0.055);
1198    small.blend(linear, exp)
1199}
1200
1201fn gamma_expand_srgb(p: &mut Pipeline) {
1202    p.r = srgb_expand(p.r);
1203    p.g = srgb_expand(p.g);
1204    p.b = srgb_expand(p.b);
1205
1206    p.next_stage();
1207}
1208
1209fn gamma_expand_dst_srgb(p: &mut Pipeline) {
1210    p.dr = srgb_expand(p.dr);
1211    p.dg = srgb_expand(p.dg);
1212    p.db = srgb_expand(p.db);
1213
1214    p.next_stage();
1215}
1216
1217fn gamma_compress_srgb(p: &mut Pipeline) {
1218    p.r = srgb_compress(p.r);
1219    p.g = srgb_compress(p.g);
1220    p.b = srgb_compress(p.b);
1221
1222    p.next_stage();
1223}
1224
pub fn just_return(_: &mut Pipeline) {
    // Intentionally empty: this stage does not call `next_stage()`,
    // so the chained stage execution stops here. Ends the loop.
}
1228
1229#[inline(always)]
1230fn cond_to_mask(cond: u32x8) -> u32x8 {
1231    let cond: [u32; 8] = bytemuck::cast(cond);
1232    bytemuck::cast([
1233        if cond[0] != 0 { !0 } else { 0 },
1234        if cond[1] != 0 { !0 } else { 0 },
1235        if cond[2] != 0 { !0 } else { 0 },
1236        if cond[3] != 0 { !0 } else { 0 },
1237        if cond[4] != 0 { !0 } else { 0 },
1238        if cond[5] != 0 { !0 } else { 0 },
1239        if cond[6] != 0 { !0 } else { 0 },
1240        if cond[7] != 0 { !0 } else { 0 },
1241    ])
1242}
1243
1244#[inline(always)]
1245fn load_8888(
1246    data: &[PremultipliedColorU8; STAGE_WIDTH],
1247    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
1248) {
1249    // Surprisingly, `f32 * FACTOR` is way faster than `f32x8 * f32x8::splat(FACTOR)`.
1250
1251    const FACTOR: f32 = 1.0 / 255.0;
1252
1253    *r = f32x8::from([
1254        data[0].red() as f32 * FACTOR, data[1].red() as f32 * FACTOR,
1255        data[2].red() as f32 * FACTOR, data[3].red() as f32 * FACTOR,
1256        data[4].red() as f32 * FACTOR, data[5].red() as f32 * FACTOR,
1257        data[6].red() as f32 * FACTOR, data[7].red() as f32 * FACTOR,
1258    ]);
1259
1260    *g = f32x8::from([
1261        data[0].green() as f32 * FACTOR, data[1].green() as f32 * FACTOR,
1262        data[2].green() as f32 * FACTOR, data[3].green() as f32 * FACTOR,
1263        data[4].green() as f32 * FACTOR, data[5].green() as f32 * FACTOR,
1264        data[6].green() as f32 * FACTOR, data[7].green() as f32 * FACTOR,
1265    ]);
1266
1267    *b = f32x8::from([
1268        data[0].blue() as f32 * FACTOR, data[1].blue() as f32 * FACTOR,
1269        data[2].blue() as f32 * FACTOR, data[3].blue() as f32 * FACTOR,
1270        data[4].blue() as f32 * FACTOR, data[5].blue() as f32 * FACTOR,
1271        data[6].blue() as f32 * FACTOR, data[7].blue() as f32 * FACTOR,
1272    ]);
1273
1274    *a = f32x8::from([
1275        data[0].alpha() as f32 * FACTOR, data[1].alpha() as f32 * FACTOR,
1276        data[2].alpha() as f32 * FACTOR, data[3].alpha() as f32 * FACTOR,
1277        data[4].alpha() as f32 * FACTOR, data[5].alpha() as f32 * FACTOR,
1278        data[6].alpha() as f32 * FACTOR, data[7].alpha() as f32 * FACTOR,
1279    ]);
1280}
1281
1282#[inline(always)]
1283fn load_8888_tail(
1284    tail: usize, data: &[PremultipliedColorU8],
1285    r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
1286) {
1287    // Fill a dummy array with `tail` values. `tail` is always in a 1..STAGE_WIDTH-1 range.
1288    // This way we can reuse the `load_8888_` method and remove any branches.
1289    let mut tmp = [PremultipliedColorU8::TRANSPARENT; STAGE_WIDTH];
1290    tmp[0..tail].copy_from_slice(&data[0..tail]);
1291    load_8888(&tmp, r, g, b, a);
1292}
1293
1294#[inline(always)]
1295fn store_8888(
1296    r: &f32x8, g: &f32x8, b: &f32x8, a: &f32x8,
1297    data: &mut [PremultipliedColorU8; STAGE_WIDTH],
1298) {
1299    let r: [i32; 8] = unnorm(r).into();
1300    let g: [i32; 8] = unnorm(g).into();
1301    let b: [i32; 8] = unnorm(b).into();
1302    let a: [i32; 8] = unnorm(a).into();
1303
1304    let conv = |rr, gg, bb, aa|
1305        PremultipliedColorU8::from_rgba_unchecked(rr as u8, gg as u8, bb as u8, aa as u8);
1306
1307    data[0] = conv(r[0], g[0], b[0], a[0]);
1308    data[1] = conv(r[1], g[1], b[1], a[1]);
1309    data[2] = conv(r[2], g[2], b[2], a[2]);
1310    data[3] = conv(r[3], g[3], b[3], a[3]);
1311    data[4] = conv(r[4], g[4], b[4], a[4]);
1312    data[5] = conv(r[5], g[5], b[5], a[5]);
1313    data[6] = conv(r[6], g[6], b[6], a[6]);
1314    data[7] = conv(r[7], g[7], b[7], a[7]);
1315}
1316
#[inline(always)]
fn store_8888_tail(
    r: &f32x8, g: &f32x8, b: &f32x8, a: &f32x8,
    tail: usize, data: &mut [PremultipliedColorU8],
) {
    // Stores only the first `tail` lanes into `data`. Used for the final,
    // partially-filled chunk of a row.
    let r: [i32; 8] = unnorm(r).into();
    let g: [i32; 8] = unnorm(g).into();
    let b: [i32; 8] = unnorm(b).into();
    let a: [i32; 8] = unnorm(a).into();

    // This is better than `for i in 0..tail`, because this way the compiler
    // knows that we have at most STAGE_WIDTH (8) steps and slices access is
    // guarantee to be valid. This removes bounds checking and a possible
    // panic call.
    // NOTE(review): with tail == 0 this loop would store all 8 lanes;
    // assumes callers only pass 1..=STAGE_WIDTH — confirm at the call sites.
    for i in 0..STAGE_WIDTH {
        data[i] = PremultipliedColorU8::from_rgba_unchecked(
            r[i] as u8, g[i] as u8, b[i] as u8, a[i] as u8,
        );

        if i + 1 == tail {
            break;
        }
    }
}
1340
1341#[inline(always)]
1342fn unnorm(v: &f32x8) -> i32x8 {
1343    (v.max(f32x8::default()).min(f32x8::splat(1.0)) * f32x8::splat(255.0)).round_int()
1344}
1345
1346#[inline(always)]
1347fn inv(v: f32x8) -> f32x8 {
1348    f32x8::splat(1.0) - v
1349}
1350
1351#[inline(always)]
1352fn two(v: f32x8) -> f32x8 {
1353    v + v
1354}
1355
1356#[inline(always)]
1357fn mad(f: f32x8, m: f32x8, a: f32x8) -> f32x8 {
1358    f * m + a
1359}
1360
1361#[inline(always)]
1362fn lerp(from: f32x8, to: f32x8, t: f32x8) -> f32x8 {
1363    mad(to - from, t, from)
1364}