
// Copyright 2018 Google Inc.
// Copyright 2020 Yevhenii Reizner
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

/*!
A low precision raster pipeline implementation.

A lowp pipeline uses u16 instead of f32 for math.
Because of that, it doesn't implement stages that require high precision.
The pipeline compiler will automatically decide which variant to use.

Skia uses u16x8 (128-bit) types for a generic CPU and u16x16 (256-bit) for modern x86 CPUs.
But instead of explicit SIMD instructions, it mainly relies on clang's vector extensions.
And since those are unavailable in Rust, we have to do everything manually.

According to our benchmarks, a SIMD-accelerated u16x8 in Rust is almost 2x slower than in Skia.
Not sure why. For example, there is no div instruction for u16x8, so we have to fall back to
a scalar version, which means unnecessary loads/stores. No idea what clang does in this case.
Surprisingly, a SIMD-accelerated u16x8 is even slower than a scalar one. Again, not sure why.

Therefore we use scalar u16x16 by default and rely on rustc/LLVM auto-vectorization instead.
When targeting a generic CPU, we're just 5-10% slower than Skia, while u16x8 is 30-40% slower.
And while `-C target-cpu=haswell` boosts our performance by around 25%,
we are still 40-60% behind Skia built for Haswell.

On ARM AArch64 the story is different and explicit SIMD makes our code up to 2-3x faster.
*/

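// A note on the fixed-point convention used throughout this file: a color
// channel of 1.0 is stored as the integer 255, so the product of two channels
// lands in 0..=65025 and must be renormalized with `div255` (defined below).
// E.g. div255(255 * 128) == 128, i.e. 1.0 * ~0.5 == ~0.5.
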
use crate::PremultipliedColorU8;

use crate::pixmap::SubPixmapMut;
use crate::wide::{f32x8, u16x16, f32x16};
use crate::geom::ScreenIntRect;

pub const STAGE_WIDTH: usize = 16;

pub type StageFn = fn(p: &mut Pipeline);

pub struct Pipeline<'a, 'b: 'a> {
    index: usize,
    functions: &'a [StageFn],
    pixmap: &'a mut SubPixmapMut<'b>,
    mask_ctx: super::MaskCtx<'a>,
    aa_mask_ctx: super::AAMaskCtx,
    ctx: &'a mut super::Context,
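    // Source color: 16 pixels per batch, one u16 lane per pixel, 0..=255.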
    r: u16x16,
    g: u16x16,
    b: u16x16,
    a: u16x16,
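    // Destination color, loaded from the pixmap by the load_dst* stages.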
    dr: u16x16,
    dg: u16x16,
    db: u16x16,
    da: u16x16,
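    // Number of valid pixels in the current batch and the batch position.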
    tail: usize,
    dx: usize,
    dy: usize,
}

impl Pipeline<'_, '_> {
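    // Stages don't return to a driver loop; each one finishes by calling the
    // next function in the table, so a pipeline run is one long call chain.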
    #[inline(always)]
    fn next_stage(&mut self) {
        let next: fn(&mut Self) = self.functions[self.index];
        self.index += 1;
        next(self);
    }
}


// Must be in the same order as raster_pipeline::Stage.
pub const STAGES: &[StageFn; super::STAGES_COUNT] = &[
    move_source_to_destination,
    move_destination_to_source,
    null_fn, // Clamp0
    null_fn, // ClampA
    premultiply,
    uniform_color,
    seed_shader,
    load_dst,
    store,
    load_dst_u8,
    store_u8,
    null_fn, // Gather
    load_mask_u8,
    mask_u8,
    scale_u8,
    lerp_u8,
    scale_1_float,
    lerp_1_float,
    destination_atop,
    destination_in,
    destination_out,
    destination_over,
    source_atop,
    source_in,
    source_out,
    source_over,
    clear,
    modulate,
    multiply,
    plus,
    screen,
    xor,
    null_fn, // ColorBurn
    null_fn, // ColorDodge
    darken,
    difference,
    exclusion,
    hard_light,
    lighten,
    overlay,
    null_fn, // SoftLight
    null_fn, // Hue
    null_fn, // Saturation
    null_fn, // Color
    null_fn, // Luminosity
    source_over_rgba,
    transform,
    null_fn, // Reflect
    null_fn, // Repeat
    null_fn, // Bilinear
    null_fn, // Bicubic
    pad_x1,
    reflect_x1,
    repeat_x1,
    gradient,
    evenly_spaced_2_stop_gradient,
    // TODO: can be implemented for lowp as well; the implementation is very
    //       similar to its highp variant.
    null_fn, // XYToUnitAngle
    xy_to_radius,
    null_fn, // XYTo2PtConicalFocalOnCircle
    null_fn, // XYTo2PtConicalWellBehaved
    null_fn, // XYTo2PtConicalSmaller
    null_fn, // XYTo2PtConicalGreater
    null_fn, // XYTo2PtConicalStrip
    null_fn, // Mask2PtConicalNan
    null_fn, // Mask2PtConicalDegenerates
    null_fn, // ApplyVectorMask
    null_fn, // Alter2PtConicalCompensateFocal
    null_fn, // Alter2PtConicalUnswap
    null_fn, // NegateX
    null_fn, // ApplyConcentricScaleBias
    null_fn, // GammaExpand2
    null_fn, // GammaExpandDestination2
    null_fn, // GammaCompress2
    null_fn, // GammaExpand22
    null_fn, // GammaExpandDestination22
    null_fn, // GammaCompress22
    null_fn, // GammaExpandSrgb
    null_fn, // GammaExpandDestinationSrgb
    null_fn, // GammaCompressSrgb
];

pub fn fn_ptr(f: StageFn) -> *const () {
    f as *const ()
}

pub fn fn_ptr_eq(f1: StageFn, f2: StageFn) -> bool {
    core::ptr::eq(f1 as *const (), f2 as *const ())
}

#[inline(never)]
pub fn start(
    functions: &[StageFn],
    functions_tail: &[StageFn],
    rect: &ScreenIntRect,
    aa_mask_ctx: super::AAMaskCtx,
    mask_ctx: super::MaskCtx,
    ctx: &mut super::Context,
    pixmap: &mut SubPixmapMut,
) {
    let mut p = Pipeline {
        index: 0,
        functions: &[],
        pixmap,
        mask_ctx,
        aa_mask_ctx,
        ctx,
        r: u16x16::default(),
        g: u16x16::default(),
        b: u16x16::default(),
        a: u16x16::default(),
        dr: u16x16::default(),
        dg: u16x16::default(),
        db: u16x16::default(),
        da: u16x16::default(),
        tail: 0,
        dx: 0,
        dy: 0,
    };

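    // Each row is processed in STAGE_WIDTH batches; whatever remains
    // (1..STAGE_WIDTH pixels) goes through the `functions_tail` variant.
    // E.g. a 20px-wide rect runs the full pipeline once (x = 0..16) and
    // then the tail pipeline once with tail = 4.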
    for y in rect.y()..rect.bottom() {
        let mut x = rect.x() as usize;
        let end = rect.right() as usize;

        p.functions = functions;
        while x + STAGE_WIDTH <= end {
            p.index = 0;
            p.dx = x;
            p.dy = y as usize;
            p.tail = STAGE_WIDTH;
            p.next_stage();
            x += STAGE_WIDTH;
        }

        if x != end {
            p.index = 0;
            p.functions = functions_tail;
            p.dx = x;
            p.dy = y as usize;
            p.tail = end - x;
            p.next_stage();
        }
    }
}

fn move_source_to_destination(p: &mut Pipeline) {
    p.dr = p.r;
    p.dg = p.g;
    p.db = p.b;
    p.da = p.a;

    p.next_stage();
}

fn move_destination_to_source(p: &mut Pipeline) {
    p.r = p.dr;
    p.g = p.dg;
    p.b = p.db;
    p.a = p.da;

    p.next_stage();
}

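// Premultiplies the color channels by alpha, e.g. r = 255 with a = 128
// becomes div255(255 * 128) == 128.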
fn premultiply(p: &mut Pipeline) {
    p.r = div255(p.r * p.a);
    p.g = div255(p.g * p.a);
    p.b = div255(p.b * p.a);

    p.next_stage();
}

fn uniform_color(p: &mut Pipeline) {
    let ctx = p.ctx.uniform_color;
    p.r = u16x16::splat(ctx.rgba[0]);
    p.g = u16x16::splat(ctx.rgba[1]);
    p.b = u16x16::splat(ctx.rgba[2]);
    p.a = u16x16::splat(ctx.rgba[3]);

    p.next_stage();
}

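// Seeds the pipeline with pixel-center coordinates: x = dx + 0.5, dx + 1.5, ...
// for the 16 pixels of the batch, and a constant y = dy + 0.5. The coordinates
// are packed into the r/g and b/a registers via `split`.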
fn seed_shader(p: &mut Pipeline) {
    let iota = f32x16(
        f32x8::from([0.5,  1.5,  2.5,  3.5,  4.5,  5.5,  6.5,  7.5]),
        f32x8::from([8.5,  9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5]),
    );

    let x = f32x16::splat(p.dx as f32) + iota;
    let y = f32x16::splat(p.dy as f32 + 0.5);
    split(&x, &mut p.r, &mut p.g);
    split(&y, &mut p.b, &mut p.a);

    p.next_stage();
}

pub fn load_dst(p: &mut Pipeline) {
    load_8888(p.pixmap.slice16_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}

pub fn load_dst_tail(p: &mut Pipeline) {
    load_8888_tail(p.tail, p.pixmap.slice_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}

pub fn store(p: &mut Pipeline) {
    store_8888(&p.r, &p.g, &p.b, &p.a, p.pixmap.slice16_at_xy(p.dx, p.dy));
    p.next_stage();
}

pub fn store_tail(p: &mut Pipeline) {
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, p.pixmap.slice_at_xy(p.dx, p.dy));
    p.next_stage();
}

pub fn load_dst_u8(p: &mut Pipeline) {
    load_8(p.pixmap.slice16_mask_at_xy(p.dx, p.dy), &mut p.da);
    p.next_stage();
}

pub fn load_dst_u8_tail(p: &mut Pipeline) {
    // Fill a dummy array with `tail` values. `tail` is always in the 1..STAGE_WIDTH range.
    // This way we can reuse `load_8` and avoid any branches.
    let data = p.pixmap.slice_mask_at_xy(p.dx, p.dy);
    let mut tmp = [0u8; STAGE_WIDTH];
    tmp[0..p.tail].copy_from_slice(&data[0..p.tail]);
    load_8(&tmp, &mut p.da);

    p.next_stage();
}

pub fn store_u8(p: &mut Pipeline) {
    let data = p.pixmap.slice16_mask_at_xy(p.dx, p.dy);
    let a = p.a.as_slice();

    data[ 0] = a[ 0] as u8;
    data[ 1] = a[ 1] as u8;
    data[ 2] = a[ 2] as u8;
    data[ 3] = a[ 3] as u8;
    data[ 4] = a[ 4] as u8;
    data[ 5] = a[ 5] as u8;
    data[ 6] = a[ 6] as u8;
    data[ 7] = a[ 7] as u8;
    data[ 8] = a[ 8] as u8;
    data[ 9] = a[ 9] as u8;
    data[10] = a[10] as u8;
    data[11] = a[11] as u8;
    data[12] = a[12] as u8;
    data[13] = a[13] as u8;
    data[14] = a[14] as u8;
    data[15] = a[15] as u8;

    p.next_stage();
}

pub fn store_u8_tail(p: &mut Pipeline) {
    let data = p.pixmap.slice_mask_at_xy(p.dx, p.dy);
    let a = p.a.as_slice();

    // This is better than `for i in 0..tail`, because this way the compiler
    // knows that we have at most 16 steps and slice accesses are guaranteed
    // to be valid. This removes bounds checks and a possible panic.
    for i in 0..STAGE_WIDTH {
        data[i] = a[i] as u8;

        if i + 1 == p.tail {
            break;
        }
    }

    p.next_stage();
}

// Similar to mask_u8, but only loads the mask values without actually masking the pipeline.
fn load_mask_u8(p: &mut Pipeline) {
    let offset = p.mask_ctx.offset(p.dx, p.dy);

    let mut c = u16x16::default();
    for i in 0..p.tail {
        c.0[i] = u16::from(p.mask_ctx.data[offset + i]);
    }

    p.r = u16x16::splat(0);
    p.g = u16x16::splat(0);
    p.b = u16x16::splat(0);
    p.a = c;

    p.next_stage();
}

fn mask_u8(p: &mut Pipeline) {
    let offset = p.mask_ctx.offset(p.dx, p.dy);

    let mut c = u16x16::default();
    for i in 0..p.tail {
        c.0[i] = u16::from(p.mask_ctx.data[offset + i]);
    }

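    // A fully transparent mask: nothing to draw, so the rest of the pipeline
    // is skipped by returning without calling `next_stage`.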
    if c == u16x16::default() {
        return;
    }

    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}

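// Note: only the first two lanes are filled below, since `AAMaskCtx` provides
// at most two coverage values per batch.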
fn scale_u8(p: &mut Pipeline) {
    // Load u8xTail and cast it to u16x16.
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = u16x16([
        u16::from(data[0]), u16::from(data[1]), 0, 0,
        0, 0, 0, 0,
        0, 0, 0, 0,
        0, 0, 0, 0,
    ]);

    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}

fn lerp_u8(p: &mut Pipeline) {
    // Load u8xTail and cast it to u16x16.
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = u16x16([
        u16::from(data[0]), u16::from(data[1]), 0, 0,
        0, 0, 0, 0,
        0, 0, 0, 0,
        0, 0, 0, 0,
    ]);

    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}

fn scale_1_float(p: &mut Pipeline) {
    let c = from_float(p.ctx.current_coverage);
    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}

fn lerp_1_float(p: &mut Pipeline) {
    let c = from_float(p.ctx.current_coverage);
    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}

macro_rules! blend_fn {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = $f(p.a, p.da, p.a, p.da);

            p.next_stage();
        }
    };
}

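// Compositing and blend modes; each closure receives
// (source, dest, source_alpha, dest_alpha) for one channel.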
blend_fn!(clear,            |_, _,  _,  _| u16x16::splat(0));
blend_fn!(source_atop,      |s, d, sa, da| div255(s * da + d * inv(sa)));
blend_fn!(destination_atop, |s, d, sa, da| div255(d * sa + s * inv(da)));
blend_fn!(source_in,        |s, _,  _, da| div255(s * da));
blend_fn!(destination_in,   |_, d, sa,  _| div255(d * sa));
blend_fn!(source_out,       |s, _,  _, da| div255(s * inv(da)));
blend_fn!(destination_out,  |_, d, sa,  _| div255(d * inv(sa)));
blend_fn!(source_over,      |s, d, sa,  _| s + div255(d * inv(sa)));
blend_fn!(destination_over, |s, d,  _, da| d + div255(s * inv(da)));
blend_fn!(modulate,         |s, d,  _,  _| div255(s * d));
blend_fn!(multiply,         |s, d, sa, da| div255(s * inv(da) + d * inv(sa) + s * d));
blend_fn!(screen,           |s, d,  _,  _| s + d - div255(s * d));
blend_fn!(xor,              |s, d, sa, da| div255(s * inv(da) + d * inv(sa)));

// For some reason this closure needs an explicit argument type.
blend_fn!(plus, |s: u16x16, d, _, _| (s + d).min(&u16x16::splat(255)));


macro_rules! blend_fn2 {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            // `$f` is applied to the color channels, while alpha uses plain source_over.
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = p.a + div255(p.da * inv(p.a));

            p.next_stage();
        }
    };
}

blend_fn2!(darken,      |s: u16x16, d, sa, da| s + d - div255((s * da).max(&(d * sa))));
blend_fn2!(lighten,     |s: u16x16, d, sa, da| s + d - div255((s * da).min(&(d * sa))));
blend_fn2!(exclusion,   |s: u16x16, d,  _,  _| s + d - u16x16::splat(2) * div255(s * d));

blend_fn2!(difference,  |s: u16x16, d, sa, da|
    s + d - u16x16::splat(2) * div255((s * da).min(&(d * sa))));

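// hard_light and overlay pick a branch per lane: where the tested channel is
// at most half its alpha, the "multiply" half (2*s*d) is used, otherwise the
// "screen" half (sa*da - 2*(sa-s)*(da-d)); `blend` selects by the comparison mask.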
blend_fn2!(hard_light, |s: u16x16, d: u16x16, sa, da| {
    div255(s * inv(da) + d * inv(sa)
        + (s+s).cmp_le(&sa).blend(
            u16x16::splat(2) * s * d,
            sa * da - u16x16::splat(2) * (sa-s)*(da-d)
        )
    )
});

blend_fn2!(overlay, |s: u16x16, d: u16x16, sa, da| {
    div255(s * inv(da) + d * inv(sa)
        + (d+d).cmp_le(&da).blend(
            u16x16::splat(2) * s * d,
            sa * da - u16x16::splat(2) * (sa-s)*(da-d)
        )
    )
});

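// Source-over with the destination load and store fused into one stage,
// presumably so the pipeline builder can skip separate load_dst/store stages.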
pub fn source_over_rgba(p: &mut Pipeline) {
    let pixels = p.pixmap.slice16_at_xy(p.dx, p.dy);
    load_8888(pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.r = p.r + div255(p.dr * inv(p.a));
    p.g = p.g + div255(p.dg * inv(p.a));
    p.b = p.b + div255(p.db * inv(p.a));
    p.a = p.a + div255(p.da * inv(p.a));
    store_8888(&p.r, &p.g, &p.b, &p.a, pixels);

    p.next_stage();
}

pub fn source_over_rgba_tail(p: &mut Pipeline) {
    let pixels = p.pixmap.slice_at_xy(p.dx, p.dy);
    load_8888_tail(p.tail, pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.r = p.r + div255(p.dr * inv(p.a));
    p.g = p.g + div255(p.dg * inv(p.a));
    p.b = p.b + div255(p.db * inv(p.a));
    p.a = p.a + div255(p.da * inv(p.a));
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, pixels);

    p.next_stage();
}

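// Applies the affine transform to the packed coordinates:
// nx = sx*x + kx*y + tx, ny = ky*x + sy*y + ty.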
fn transform(p: &mut Pipeline) {
    let ts = &p.ctx.transform;

    let x = join(&p.r, &p.g);
    let y = join(&p.b, &p.a);

    let nx = mad(x, f32x16::splat(ts.sx), mad(y, f32x16::splat(ts.kx), f32x16::splat(ts.tx)));
    let ny = mad(x, f32x16::splat(ts.ky), mad(y, f32x16::splat(ts.sy), f32x16::splat(ts.ty)));

    split(&nx, &mut p.r, &mut p.g);
    split(&ny, &mut p.b, &mut p.a);

    p.next_stage();
}

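// Gradient tile modes: pad clamps t, reflect mirrors it around 1.0, and repeat
// keeps only the fractional part; all three finish with `normalize` to keep t
// in the valid range.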
fn pad_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let x = x.normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}

fn reflect_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let two = |x| x + x;
    let x = (
        (x - f32x16::splat(1.0))
        - two(((x - f32x16::splat(1.0)) * f32x16::splat(0.5)).floor())
        - f32x16::splat(1.0)
    ).abs().normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}

fn repeat_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let x = (x - x.floor()).normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}

fn gradient(p: &mut Pipeline) {
    let ctx = &p.ctx.gradient;

    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
    let t = join(&p.r, &p.g);
    let mut idx = u16x16::splat(0);
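    // A linear search over the stops: for every lane, idx ends up as the number
    // of stop positions that t has passed, i.e. the gradient segment to use.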
    for i in 1..ctx.len {
        let tt = ctx.t_values[i].get();
        let t0: [f32; 8] = t.0.into();
        let t1: [f32; 8] = t.1.into();
        idx.0[ 0] += (t0[0] >= tt) as u16;
        idx.0[ 1] += (t0[1] >= tt) as u16;
        idx.0[ 2] += (t0[2] >= tt) as u16;
        idx.0[ 3] += (t0[3] >= tt) as u16;
        idx.0[ 4] += (t0[4] >= tt) as u16;
        idx.0[ 5] += (t0[5] >= tt) as u16;
        idx.0[ 6] += (t0[6] >= tt) as u16;
        idx.0[ 7] += (t0[7] >= tt) as u16;
        idx.0[ 8] += (t1[0] >= tt) as u16;
        idx.0[ 9] += (t1[1] >= tt) as u16;
        idx.0[10] += (t1[2] >= tt) as u16;
        idx.0[11] += (t1[3] >= tt) as u16;
        idx.0[12] += (t1[4] >= tt) as u16;
        idx.0[13] += (t1[5] >= tt) as u16;
        idx.0[14] += (t1[6] >= tt) as u16;
        idx.0[15] += (t1[7] >= tt) as u16;
    }
    gradient_lookup(ctx, &idx, t, &mut p.r, &mut p.g, &mut p.b, &mut p.a);

    p.next_stage();
}

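// Evaluates a two-stop gradient as a single mad per channel:
// color = factor * t + bias, where (presumably) factor = c1 - c0 and
// bias = c0, precomputed at pipeline-build time.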
fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) {
    let ctx = &p.ctx.evenly_spaced_2_stop_gradient;

    let t = join(&p.r, &p.g);
    round_f32_to_u16(
        mad(t, f32x16::splat(ctx.factor.r), f32x16::splat(ctx.bias.r)),
        mad(t, f32x16::splat(ctx.factor.g), f32x16::splat(ctx.bias.g)),
        mad(t, f32x16::splat(ctx.factor.b), f32x16::splat(ctx.bias.b)),
        mad(t, f32x16::splat(ctx.factor.a), f32x16::splat(ctx.bias.a)),
        &mut p.r, &mut p.g, &mut p.b, &mut p.a,
    );

    p.next_stage();
}

fn xy_to_radius(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let y = join(&p.b, &p.a);
    let x = (x*x + y*y).sqrt();
    split(&x, &mut p.r, &mut p.g);
    split(&y, &mut p.b, &mut p.a);

    p.next_stage();
}

// We use u16 for the index, not u32 like Skia, to simplify the code a bit.
// The gradient creation code will not allow that many stops anyway.
fn gradient_lookup(
    ctx: &super::GradientCtx, idx: &u16x16, t: f32x16,
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    macro_rules! gather {
        ($d:expr, $c:ident) => {
            // Surprisingly, bounds checking doesn't affect the performance here.
            // And since `idx` can contain any value, we should leave it in place.
            f32x16(
                f32x8::from([
                    $d[idx.0[ 0] as usize].$c,
                    $d[idx.0[ 1] as usize].$c,
                    $d[idx.0[ 2] as usize].$c,
                    $d[idx.0[ 3] as usize].$c,
                    $d[idx.0[ 4] as usize].$c,
                    $d[idx.0[ 5] as usize].$c,
                    $d[idx.0[ 6] as usize].$c,
                    $d[idx.0[ 7] as usize].$c,
                ]),
                f32x8::from([
                    $d[idx.0[ 8] as usize].$c,
                    $d[idx.0[ 9] as usize].$c,
                    $d[idx.0[10] as usize].$c,
                    $d[idx.0[11] as usize].$c,
                    $d[idx.0[12] as usize].$c,
                    $d[idx.0[13] as usize].$c,
                    $d[idx.0[14] as usize].$c,
                    $d[idx.0[15] as usize].$c,
                ]),
            )
        };
    }

    let fr = gather!(&ctx.factors, r);
    let fg = gather!(&ctx.factors, g);
    let fb = gather!(&ctx.factors, b);
    let fa = gather!(&ctx.factors, a);

    let br = gather!(&ctx.biases, r);
    let bg = gather!(&ctx.biases, g);
    let bb = gather!(&ctx.biases, b);
    let ba = gather!(&ctx.biases, a);

    round_f32_to_u16(
        mad(t, fr, br),
        mad(t, fg, bg),
        mad(t, fb, bb),
        mad(t, fa, ba),
        r, g, b, a,
    );
}

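// Converts normalized f32 channels back into 0..=255 u16 lanes with
// round-to-nearest: scale by 255, add 0.5, truncate.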
#[inline(always)]
fn round_f32_to_u16(
    rf: f32x16, gf: f32x16, bf: f32x16, af: f32x16,
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    // TODO: may produce a slightly different result than Skia;
    //       affects the two_stops_linear_mirror test.

    let rf = rf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let gf = gf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let bf = bf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let af = af * f32x16::splat(255.0) + f32x16::splat(0.5);

    rf.save_to_u16x16(r);
    gf.save_to_u16x16(g);
    bf.save_to_u16x16(b);
    af.save_to_u16x16(a);
}

pub fn just_return(_: &mut Pipeline) {
    // Ends the stage chain by not calling `next_stage`.
}

pub fn null_fn(_: &mut Pipeline) {
    // A placeholder for unsupported stages in STAGES.
}

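// Deinterleaves 16 RGBA pixels into four planar u16x16 registers,
// one per channel.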
#[inline(always)]
fn load_8888(
    data: &[PremultipliedColorU8; STAGE_WIDTH],
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    *r = u16x16([
        data[ 0].red() as u16, data[ 1].red() as u16, data[ 2].red() as u16, data[ 3].red() as u16,
        data[ 4].red() as u16, data[ 5].red() as u16, data[ 6].red() as u16, data[ 7].red() as u16,
        data[ 8].red() as u16, data[ 9].red() as u16, data[10].red() as u16, data[11].red() as u16,
        data[12].red() as u16, data[13].red() as u16, data[14].red() as u16, data[15].red() as u16,
    ]);

    *g = u16x16([
        data[ 0].green() as u16, data[ 1].green() as u16, data[ 2].green() as u16, data[ 3].green() as u16,
        data[ 4].green() as u16, data[ 5].green() as u16, data[ 6].green() as u16, data[ 7].green() as u16,
        data[ 8].green() as u16, data[ 9].green() as u16, data[10].green() as u16, data[11].green() as u16,
        data[12].green() as u16, data[13].green() as u16, data[14].green() as u16, data[15].green() as u16,
    ]);

    *b = u16x16([
        data[ 0].blue() as u16, data[ 1].blue() as u16, data[ 2].blue() as u16, data[ 3].blue() as u16,
        data[ 4].blue() as u16, data[ 5].blue() as u16, data[ 6].blue() as u16, data[ 7].blue() as u16,
        data[ 8].blue() as u16, data[ 9].blue() as u16, data[10].blue() as u16, data[11].blue() as u16,
        data[12].blue() as u16, data[13].blue() as u16, data[14].blue() as u16, data[15].blue() as u16,
    ]);

    *a = u16x16([
        data[ 0].alpha() as u16, data[ 1].alpha() as u16, data[ 2].alpha() as u16, data[ 3].alpha() as u16,
        data[ 4].alpha() as u16, data[ 5].alpha() as u16, data[ 6].alpha() as u16, data[ 7].alpha() as u16,
        data[ 8].alpha() as u16, data[ 9].alpha() as u16, data[10].alpha() as u16, data[11].alpha() as u16,
        data[12].alpha() as u16, data[13].alpha() as u16, data[14].alpha() as u16, data[15].alpha() as u16,
    ]);
}

#[inline(always)]
fn load_8888_tail(
    tail: usize, data: &[PremultipliedColorU8],
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    // Fill a dummy array with `tail` values. `tail` is always in the 1..STAGE_WIDTH range.
    // This way we can reuse `load_8888` and avoid any branches.
    let mut tmp = [PremultipliedColorU8::TRANSPARENT; STAGE_WIDTH];
    tmp[0..tail].copy_from_slice(&data[0..tail]);
    load_8888(&tmp, r, g, b, a);
}

#[inline(always)]
fn store_8888(
    r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16,
    data: &mut [PremultipliedColorU8; STAGE_WIDTH],
) {
    let r = r.as_slice();
    let g = g.as_slice();
    let b = b.as_slice();
    let a = a.as_slice();

    data[ 0] = PremultipliedColorU8::from_rgba_unchecked(r[ 0] as u8, g[ 0] as u8, b[ 0] as u8, a[ 0] as u8);
    data[ 1] = PremultipliedColorU8::from_rgba_unchecked(r[ 1] as u8, g[ 1] as u8, b[ 1] as u8, a[ 1] as u8);
    data[ 2] = PremultipliedColorU8::from_rgba_unchecked(r[ 2] as u8, g[ 2] as u8, b[ 2] as u8, a[ 2] as u8);
    data[ 3] = PremultipliedColorU8::from_rgba_unchecked(r[ 3] as u8, g[ 3] as u8, b[ 3] as u8, a[ 3] as u8);
    data[ 4] = PremultipliedColorU8::from_rgba_unchecked(r[ 4] as u8, g[ 4] as u8, b[ 4] as u8, a[ 4] as u8);
    data[ 5] = PremultipliedColorU8::from_rgba_unchecked(r[ 5] as u8, g[ 5] as u8, b[ 5] as u8, a[ 5] as u8);
    data[ 6] = PremultipliedColorU8::from_rgba_unchecked(r[ 6] as u8, g[ 6] as u8, b[ 6] as u8, a[ 6] as u8);
    data[ 7] = PremultipliedColorU8::from_rgba_unchecked(r[ 7] as u8, g[ 7] as u8, b[ 7] as u8, a[ 7] as u8);
    data[ 8] = PremultipliedColorU8::from_rgba_unchecked(r[ 8] as u8, g[ 8] as u8, b[ 8] as u8, a[ 8] as u8);
    data[ 9] = PremultipliedColorU8::from_rgba_unchecked(r[ 9] as u8, g[ 9] as u8, b[ 9] as u8, a[ 9] as u8);
    data[10] = PremultipliedColorU8::from_rgba_unchecked(r[10] as u8, g[10] as u8, b[10] as u8, a[10] as u8);
    data[11] = PremultipliedColorU8::from_rgba_unchecked(r[11] as u8, g[11] as u8, b[11] as u8, a[11] as u8);
    data[12] = PremultipliedColorU8::from_rgba_unchecked(r[12] as u8, g[12] as u8, b[12] as u8, a[12] as u8);
    data[13] = PremultipliedColorU8::from_rgba_unchecked(r[13] as u8, g[13] as u8, b[13] as u8, a[13] as u8);
    data[14] = PremultipliedColorU8::from_rgba_unchecked(r[14] as u8, g[14] as u8, b[14] as u8, a[14] as u8);
    data[15] = PremultipliedColorU8::from_rgba_unchecked(r[15] as u8, g[15] as u8, b[15] as u8, a[15] as u8);
}

#[inline(always)]
fn store_8888_tail(
    r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16,
    tail: usize, data: &mut [PremultipliedColorU8],
) {
    let r = r.as_slice();
    let g = g.as_slice();
    let b = b.as_slice();
    let a = a.as_slice();

    // This is better than `for i in 0..tail`, because this way the compiler
    // knows that we have at most 16 steps and slice accesses are guaranteed
    // to be valid. This removes bounds checks and a possible panic.
    for i in 0..STAGE_WIDTH {
        data[i] = PremultipliedColorU8::from_rgba_unchecked(
            r[i] as u8, g[i] as u8, b[i] as u8, a[i] as u8,
        );

        if i + 1 == tail {
            break;
        }
    }
}

#[inline(always)]
fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) {
    *a = u16x16([
        data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16,
        data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16,
        data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16,
        data[12] as u16, data[13] as u16, data[14] as u16, data[15] as u16,
    ]);
}

#[inline(always)]
fn div255(v: u16x16) -> u16x16 {
    // Skia uses `vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8)` here when NEON is available,
    // but it doesn't affect performance much and breaks reproducibility, so we skip it.
    // NOTE: the compiler does not replace the division with a shift on its own.
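    // (v + 255) >> 8 closely approximates v / 255 for v in 0..=65025:
    // e.g. div255(255 * 255) == (65025 + 255) >> 8 == 255 and div255(0) == 0.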
    (v + u16x16::splat(255)) >> u16x16::splat(8) // / u16x16::splat(256)
}

#[inline(always)]
fn inv(v: u16x16) -> u16x16 {
    u16x16::splat(255) - v
}

#[inline(always)]
fn from_float(f: f32) -> u16x16 {
    u16x16::splat((f * 255.0 + 0.5) as u16)
}

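// Fixed-point lerp: from * (255 - t) / 255 + to * t / 255.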
#[inline(always)]
fn lerp(from: u16x16, to: u16x16, t: u16x16) -> u16x16 {
    div255(from * inv(t) + to * t)
}

#[inline(always)]
fn split(v: &f32x16, lo: &mut u16x16, hi: &mut u16x16) {
    // We're splitting an f32x16 (512-bit) into two u16x16 (256-bit) via a plain bitcast.
    let data: [u8; 64] = bytemuck::cast(*v);
    let d0: &mut [u8; 32] = bytemuck::cast_mut(&mut lo.0);
    let d1: &mut [u8; 32] = bytemuck::cast_mut(&mut hi.0);

    d0.copy_from_slice(&data[0..32]);
    d1.copy_from_slice(&data[32..64]);
}

#[inline(always)]
fn join(lo: &u16x16, hi: &u16x16) -> f32x16 {
    // We're joining two u16x16 (256-bit) into an f32x16 (512-bit).

    let d0: [u8; 32] = bytemuck::cast(lo.0);
    let d1: [u8; 32] = bytemuck::cast(hi.0);

    let mut v = f32x16::default();
    let data: &mut [u8; 64] = bytemuck::cast_mut(&mut v);

    data[0..32].copy_from_slice(&d0);
    data[32..64].copy_from_slice(&d1);

    v
}

#[inline(always)]
fn mad(f: f32x16, m: f32x16, a: f32x16) -> f32x16 {
    // NEON vmlaq_f32 doesn't seem to affect performance in any way. Ignore it.
    f * m + a
}