// moxcms/conversions/avx/interpolator_q0_15.rs
1/*
2 * // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29#![cfg(feature = "avx_luts")]
30use crate::conversions::interpolator::BarycentricWeight;
31use crate::math::FusedMultiplyAdd;
32use std::arch::x86_64::*;
33use std::ops::{Add, Mul, Sub};
34
/// One LUT entry: four `i16` Q0.15 lanes (RGB + one spare lane), aligned to
/// 8 bytes so a whole entry can be pulled in with a single 64-bit load.
#[repr(align(8), C)]
pub(crate) struct AvxAlignedI16(pub(crate) [i16; 4]);

/// Marker type for single-table tetrahedral interpolation.
/// Zero-sized; the cube dimension travels in the const parameter.
#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxQ0_15<const GRID_SIZE: usize> {}

/// Marker type for single-table pyramidal interpolation.
#[cfg(feature = "options")]
pub(crate) struct PyramidalAvxQ0_15<const GRID_SIZE: usize> {}

/// Marker type for single-table prismatic interpolation.
#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxQ0_15<const GRID_SIZE: usize> {}

/// Marker type for single-table trilinear interpolation (always compiled).
pub(crate) struct TrilinearAvxQ0_15<const GRID_SIZE: usize> {}

/// Marker type for prismatic interpolation over two tables at once.
#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxQ0_15Double<const GRID_SIZE: usize> {}

/// Marker type for trilinear interpolation over two tables at once.
pub(crate) struct TrilinearAvxQ0_15Double<const GRID_SIZE: usize> {}

/// Marker type for pyramidal interpolation over two tables at once.
#[cfg(feature = "options")]
pub(crate) struct PyramidAvxFmaQ0_15Double<const GRID_SIZE: usize> {}

/// Marker type for tetrahedral interpolation over two tables at once.
#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxQ0_15Double<const GRID_SIZE: usize> {}
59
/// 3D interpolation of the same (r, g, b) position in two LUT tables at
/// once, returning one 128-bit Q0.15 result per table.
///
/// `in_r`/`in_g`/`in_b` index into `lut`, which supplies the per-channel
/// grid coordinates and Q0.15 fractional weights.
pub(crate) trait AvxMdInterpolationQ0_15Double {
    fn inter3_sse(
        &self,
        table0: &[AvxAlignedI16],
        table1: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse);
}

/// Single-table 3D interpolation returning one 128-bit Q0.15 vector.
pub(crate) trait AvxMdInterpolationQ0_15 {
    fn inter3_sse(
        &self,
        table: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> AvxVectorQ0_15Sse;
}

/// Loads the LUT entry at integer grid coordinates (x, y, z).
/// Implementations below do the 3D -> linear offset math and the SIMD load.
trait Fetcher<T> {
    fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}
86
/// Q0.15 lanes held in a 128-bit SSE register (one LUT table's sample).
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorQ0_15Sse {
    pub(crate) v: __m128i,
}

/// Q0.15 lanes held in a 256-bit AVX register; low/high 128-bit halves carry
/// the two tables processed together by the `*Double` interpolators.
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorQ0_15 {
    pub(crate) v: __m256i,
}
98
impl AvxVectorQ0_15 {
    /// Packs two 128-bit vectors into one 256-bit vector:
    /// `lo` becomes the low half, `hi` the high half.
    #[inline(always)]
    pub(crate) fn from_sse(lo: AvxVectorQ0_15Sse, hi: AvxVectorQ0_15Sse) -> AvxVectorQ0_15 {
        unsafe {
            AvxVectorQ0_15 {
                v: _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(lo.v), hi.v),
            }
        }
    }

    /// Inverse of [`Self::from_sse`]: returns the (low, high) 128-bit halves.
    #[inline(always)]
    pub(crate) fn split(self) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        unsafe {
            (
                AvxVectorQ0_15Sse {
                    v: _mm256_castsi256_si128(self.v),
                },
                AvxVectorQ0_15Sse {
                    v: _mm256_extracti128_si256::<1>(self.v),
                },
            )
        }
    }
}
123
/// Broadcasts one Q0.15 scalar into all 8 lanes of a 128-bit vector.
impl From<i16> for AvxVectorQ0_15Sse {
    #[inline(always)]
    fn from(v: i16) -> Self {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_set1_epi16(v) },
        }
    }
}

/// Broadcasts one Q0.15 scalar into all 16 lanes of a 256-bit vector.
impl From<i16> for AvxVectorQ0_15 {
    #[inline(always)]
    fn from(v: i16) -> Self {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_set1_epi16(v) },
        }
    }
}
141
/// Lane-wise wrapping subtraction (128-bit).
impl Sub<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
    type Output = Self;
    #[inline(always)]
    fn sub(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_sub_epi16(self.v, rhs.v) },
        }
    }
}

/// Lane-wise wrapping subtraction (256-bit).
impl Sub<AvxVectorQ0_15> for AvxVectorQ0_15 {
    type Output = Self;
    #[inline(always)]
    fn sub(self, rhs: AvxVectorQ0_15) -> Self::Output {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_sub_epi16(self.v, rhs.v) },
        }
    }
}

/// Lane-wise wrapping addition (128-bit).
impl Add<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_add_epi16(self.v, rhs.v) },
        }
    }
}

/// Lane-wise rounded Q0.15 multiply (128-bit):
/// `_mm_mulhrs_epi16` computes `(a * b + 0x4000) >> 15` per lane.
impl Mul<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
    type Output = Self;
    #[inline(always)]
    fn mul(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_mulhrs_epi16(self.v, rhs.v) },
        }
    }
}

/// Lane-wise wrapping addition (256-bit).
impl Add<AvxVectorQ0_15> for AvxVectorQ0_15 {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: AvxVectorQ0_15) -> Self::Output {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_add_epi16(self.v, rhs.v) },
        }
    }
}

/// Lane-wise rounded Q0.15 multiply (256-bit), see the 128-bit `Mul` above.
impl Mul<AvxVectorQ0_15> for AvxVectorQ0_15 {
    type Output = Self;
    #[inline(always)]
    fn mul(self, rhs: AvxVectorQ0_15) -> Self::Output {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_mulhrs_epi16(self.v, rhs.v) },
        }
    }
}
201
/// `self + b * c` with a rounded Q0.15 product (128-bit).
/// Not a true fused op: it is a `mulhrs` followed by an `add`.
impl FusedMultiplyAdd<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
    #[inline(always)]
    fn mla(&self, b: AvxVectorQ0_15Sse, c: AvxVectorQ0_15Sse) -> AvxVectorQ0_15Sse {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_add_epi16(_mm_mulhrs_epi16(b.v, c.v), self.v) },
        }
    }
}

/// `self + b * c` with a rounded Q0.15 product (256-bit).
impl FusedMultiplyAdd<AvxVectorQ0_15> for AvxVectorQ0_15 {
    #[inline(always)]
    fn mla(&self, b: AvxVectorQ0_15, c: AvxVectorQ0_15) -> AvxVectorQ0_15 {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_add_epi16(_mm256_mulhrs_epi16(b.v, c.v), self.v) },
        }
    }
}
219
/// Fetcher over a single LUT cube; yields 128-bit samples.
struct TetrahedralAvxSseFetchVector<'a, const GRID_SIZE: usize> {
    cube: &'a [AvxAlignedI16],
}

/// Fetcher over two LUT cubes; yields 256-bit samples with
/// cube0 in the low half and cube1 in the high half.
struct TetrahedralAvxFetchVector<'a, const GRID_SIZE: usize> {
    cube0: &'a [AvxAlignedI16],
    cube1: &'a [AvxAlignedI16],
}
228
/// LUT size here is always fixed size (GRID_SIZE^3) and its use
/// is hardened at [crate::conversions::avx::assert_barycentric_lut_size_precondition].
impl<const GRID_SIZE: usize> Fetcher<AvxVectorQ0_15> for TetrahedralAvxFetchVector<'_, GRID_SIZE> {
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorQ0_15 {
        // Row-major linearization: x strides by GRID_SIZE^2, y by GRID_SIZE.
        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
            + y as u32 * GRID_SIZE as u32
            + z as u32) as usize;
        // SAFETY: offset < GRID_SIZE^3 is guaranteed by the precondition
        // referenced in the impl-level doc above.
        let jx0 = unsafe { self.cube0.get_unchecked(offset..) };
        let jx1 = unsafe { self.cube1.get_unchecked(offset..) };
        AvxVectorQ0_15 {
            // One 64-bit load per table (one AvxAlignedI16 entry each),
            // packed as [cube0 | cube1] into a 256-bit register.
            v: unsafe {
                _mm256_inserti128_si256::<1>(
                    _mm256_castsi128_si256(_mm_loadu_si64(jx0.as_ptr() as *const _)),
                    _mm_loadu_si64(jx1.as_ptr() as *const _),
                )
            },
        }
    }
}
249
/// Single-cube fetch; same linearization and in-bounds precondition as the
/// two-cube fetcher above.
impl<const GRID_SIZE: usize> Fetcher<AvxVectorQ0_15Sse>
    for TetrahedralAvxSseFetchVector<'_, GRID_SIZE>
{
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorQ0_15Sse {
        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
            + y as u32 * GRID_SIZE as u32
            + z as u32) as usize;
        // SAFETY: offset < GRID_SIZE^3 per the hardened LUT-size precondition.
        let jx = unsafe { self.cube.get_unchecked(offset..) };
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_loadu_si64(jx.as_ptr() as *const _) },
        }
    }
}
264
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxQ0_15<GRID_SIZE> {
    /// Tetrahedral interpolation of one sample from a single LUT cube.
    ///
    /// `lut` maps each input channel index to a lower grid coordinate `x`,
    /// the neighboring coordinate `x_n` and a Q0.15 fractional weight `w`.
    /// The cube cell is split into six tetrahedra by the ordering of the
    /// weights; the branch chain picks the one containing the sample and
    /// builds three edge deltas c1..c3 so that the result is
    /// `c0 + c1*rx + c2*ry + c3*rz` in Q0.15.
    ///
    /// Safety: caller must guarantee `in_r`/`in_g`/`in_b` are in bounds of
    /// `lut` (`get_unchecked`) and that AVX2 is available.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        // Base corner of the cell.
        let c0 = r.fetch(x, y, z);

        let c2;
        let c1;
        let c3;
        if rx >= ry {
            if ry >= rz {
                //rx >= ry && ry >= rz
                c1 = r.fetch(x_n, y, z) - c0;
                c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
                c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
            } else if rx >= rz {
                //rx >= rz && rz >= ry
                c1 = r.fetch(x_n, y, z) - c0;
                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
                c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
            } else {
                //rz > rx && rx >= ry
                c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
                c3 = r.fetch(x, y, z_n) - c0;
            }
        } else if rx >= rz {
            //ry > rx && rx >= rz
            c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
            c2 = r.fetch(x, y_n, z) - c0;
            c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
        } else if ry >= rz {
            //ry >= rz && rz > rx
            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
            c2 = r.fetch(x, y_n, z) - c0;
            c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
        } else {
            //rz > ry && ry > rx
            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
            c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
            c3 = r.fetch(x, y, z_n) - c0;
        }
        // Accumulate c0 + c1*rx + c2*ry + c3*rz with rounded Q0.15 products.
        let s0 = c0.mla(c1, AvxVectorQ0_15Sse::from(rx));
        let s1 = s0.mla(c2, AvxVectorQ0_15Sse::from(ry));
        s1.mla(c3, AvxVectorQ0_15Sse::from(rz))
    }
}
335
/// Implements [`AvxMdInterpolationQ0_15`] for a single-table interpolator by
/// delegating to its inherent `interpolate` with an SSE fetcher over `table`.
macro_rules! define_interp_avx {
    ($interpolator: ident) => {
        impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15 for $interpolator<GRID_SIZE> {
            fn inter3_sse(
                &self,
                table: &[AvxAlignedI16],
                in_r: usize,
                in_g: usize,
                in_b: usize,
                lut: &[BarycentricWeight<i16>],
            ) -> AvxVectorQ0_15Sse {
                // SAFETY: `interpolate` is #[target_feature(enable = "avx2")];
                // availability is the caller's contract for this module.
                unsafe {
                    self.interpolate(
                        in_r,
                        in_g,
                        in_b,
                        lut,
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table },
                    )
                }
            }
        }
    };
}
360
/// Implements [`AvxMdInterpolationQ0_15Double`] for interpolators whose
/// inherent `interpolate` takes two separate SSE fetchers (one per table).
#[cfg(feature = "options")]
macro_rules! define_interp_avx_d {
    ($interpolator: ident) => {
        impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double for $interpolator<GRID_SIZE> {
            fn inter3_sse(
                &self,
                table0: &[AvxAlignedI16],
                table1: &[AvxAlignedI16],
                in_r: usize,
                in_g: usize,
                in_b: usize,
                lut: &[BarycentricWeight<i16>],
            ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
                // SAFETY: `interpolate` is #[target_feature(enable = "avx2")];
                // availability is the caller's contract for this module.
                unsafe {
                    self.interpolate(
                        in_r,
                        in_g,
                        in_b,
                        lut,
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table0 },
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table1 },
                    )
                }
            }
        }
    };
}
388
// Wire the trait impls for each interpolator via the macros above;
// the gated variants only exist with the "options" feature.
#[cfg(feature = "options")]
define_interp_avx!(TetrahedralAvxQ0_15);
#[cfg(feature = "options")]
define_interp_avx!(PyramidalAvxQ0_15);
#[cfg(feature = "options")]
define_interp_avx!(PrismaticAvxQ0_15);
define_interp_avx!(TrilinearAvxQ0_15);
#[cfg(feature = "options")]
define_interp_avx_d!(PrismaticAvxQ0_15Double);
#[cfg(feature = "options")]
define_interp_avx_d!(PyramidAvxFmaQ0_15Double);
400
/// Hand-written (not macro) impl: the tetrahedral double interpolator takes a
/// single two-cube fetcher rather than two separate SSE fetchers.
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double
    for TetrahedralAvxQ0_15Double<GRID_SIZE>
{
    fn inter3_sse(
        &self,
        table0: &[AvxAlignedI16],
        table1: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        // SAFETY: `interpolate` requires AVX2; availability is the caller's
        // contract for this module.
        unsafe {
            self.interpolate(
                in_r,
                in_g,
                in_b,
                lut,
                TetrahedralAvxFetchVector::<GRID_SIZE> {
                    cube0: table0,
                    cube1: table1,
                },
            )
        }
    }
}
428
/// Hand-written impl: the trilinear double interpolator also takes a single
/// two-cube fetcher, so the `define_interp_avx_d!` macro does not apply.
impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double for TrilinearAvxQ0_15Double<GRID_SIZE> {
    fn inter3_sse(
        &self,
        table0: &[AvxAlignedI16],
        table1: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        // SAFETY: `interpolate` requires AVX2; availability is the caller's
        // contract for this module.
        unsafe {
            self.interpolate(
                in_r,
                in_g,
                in_b,
                lut,
                TetrahedralAvxFetchVector::<GRID_SIZE> {
                    cube0: table0,
                    cube1: table1,
                },
            )
        }
    }
}
453
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidalAvxQ0_15<GRID_SIZE> {
    /// Pyramidal interpolation of one sample from a single LUT cube.
    ///
    /// The cell is split into three pyramids selected by comparing the
    /// fractional weights `dr`/`dg`/`db`; each branch evaluates
    /// `c0 + c1*db + c2*dr + c3*dg + c4*w3` where `w3` is the product of the
    /// two dominant weights, all in rounded Q0.15 arithmetic.
    ///
    /// Safety: indices must be in bounds of `lut` and AVX2 must be available.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        // Base corner of the cell.
        let c0 = r.fetch(x, y, z);

        let w0 = AvxVectorQ0_15Sse::from(db);
        let w1 = AvxVectorQ0_15Sse::from(dr);
        let w2 = AvxVectorQ0_15Sse::from(dg);

        if dr > db && dg > db {
            let w3 = AvxVectorQ0_15Sse::from(dr) * AvxVectorQ0_15Sse::from(dg);
            let x0 = r.fetch(x_n, y_n, z_n);
            let x1 = r.fetch(x_n, y_n, z);
            let x2 = r.fetch(x_n, y, z);
            let x3 = r.fetch(x, y_n, z);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        } else if db > dr && dg > dr {
            let w3 = AvxVectorQ0_15Sse::from(dg) * AvxVectorQ0_15Sse::from(db);

            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y_n, z_n);
            let x2 = r.fetch(x, y_n, z_n);
            let x3 = r.fetch(x, y_n, z);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        } else {
            let w3 = AvxVectorQ0_15Sse::from(db) * AvxVectorQ0_15Sse::from(dr);

            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y, z);
            let x2 = r.fetch(x_n, y, z_n);
            let x3 = r.fetch(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        }
    }
}
540
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxQ0_15<GRID_SIZE> {
    /// Prismatic interpolation of one sample from a single LUT cube.
    ///
    /// The cell is split into two prisms by comparing `db` and `dr`; each
    /// branch evaluates `c0 + c1*db + c2*dr + c3*dg + c4*(dg*db) + c5*(dr*dg)`
    /// in rounded Q0.15 arithmetic.
    ///
    /// Safety: indices must be in bounds of `lut` and AVX2 must be available.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        // Base corner of the cell.
        let c0 = r.fetch(x, y, z);

        let w0 = AvxVectorQ0_15Sse::from(db);
        let w1 = AvxVectorQ0_15Sse::from(dr);
        let w2 = AvxVectorQ0_15Sse::from(dg);
        let w3 = AvxVectorQ0_15Sse::from(dg) * AvxVectorQ0_15Sse::from(db);
        let w4 = AvxVectorQ0_15Sse::from(dr) * AvxVectorQ0_15Sse::from(dg);

        if db > dr {
            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y, z_n);
            let x2 = r.fetch(x, y_n, z);
            let x3 = r.fetch(x, y_n, z_n);
            let x4 = r.fetch(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4)
        } else {
            let x0 = r.fetch(x_n, y, z);
            let x1 = r.fetch(x_n, y, z_n);
            let x2 = r.fetch(x, y_n, z);
            let x3 = r.fetch(x_n, y_n, z);
            let x4 = r.fetch(x_n, y_n, z_n);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4)
        }
    }
}
615
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxQ0_15Double<GRID_SIZE> {
    /// Prismatic interpolation of the same (r, g, b) position in two LUT
    /// cubes at once.
    ///
    /// Per-table corners are fetched with `r0`/`r1` and packed as
    /// [table0 | table1] into 256-bit vectors, then one shared evaluation of
    /// `c0 + c1*db + c2*dr + c3*dg + c4*(dg*db) + c5*(dr*dg)` (rounded Q0.15)
    /// covers both tables; `split()` returns the per-table results.
    ///
    /// Safety: indices must be in bounds of `lut` and AVX2 must be available.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r0: impl Fetcher<AvxVectorQ0_15Sse>,
        r1: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        // Base corner of the cell, one fetch per table.
        // (The second fetch must come from `r1`, not `r0`, so the high half
        // of `c0` carries table1's base value.)
        let c0_0 = r0.fetch(x, y, z);
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVectorQ0_15::from(db);
        let w1 = AvxVectorQ0_15::from(dr);
        let w2 = AvxVectorQ0_15::from(dg);
        let w3 = AvxVectorQ0_15::from(dg) * AvxVectorQ0_15::from(db);
        let w4 = AvxVectorQ0_15::from(dr) * AvxVectorQ0_15::from(dg);

        let c0 = AvxVectorQ0_15::from_sse(c0_0, c0_1);

        if db > dr {
            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x, y_n, z_n);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x, y_n, z_n);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);
            let x4 = AvxVectorQ0_15::from_sse(x4_0, x4_1);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        } else {
            let x0_0 = r0.fetch(x_n, y, z);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x_n, y_n, z);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x_n, y, z);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x_n, y_n, z);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);
            let x4 = AvxVectorQ0_15::from_sse(x4_0, x4_1);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        }
    }
}
718
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidAvxFmaQ0_15Double<GRID_SIZE> {
    /// Pyramidal interpolation of the same (r, g, b) position in two LUT
    /// cubes at once.
    ///
    /// Corners are fetched per table with `r0`/`r1`, packed as
    /// [table0 | table1] into 256-bit vectors, evaluated once as
    /// `c0 + c1*db + c2*dr + c3*dg + c4*w3` (rounded Q0.15, `w3` being the
    /// product of the two dominant weights), then split back per table.
    ///
    /// Safety: indices must be in bounds of `lut` and AVX2 must be available.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r0: impl Fetcher<AvxVectorQ0_15Sse>,
        r1: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        // Base corner of the cell, one fetch per table.
        let c0_0 = r0.fetch(x, y, z);
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVectorQ0_15::from(db);
        let w1 = AvxVectorQ0_15::from(dr);
        let w2 = AvxVectorQ0_15::from(dg);

        let c0 = AvxVectorQ0_15::from_sse(c0_0, c0_1);

        if dr > db && dg > db {
            let w3 = AvxVectorQ0_15::from(dr) * AvxVectorQ0_15::from(dg);

            let x0_0 = r0.fetch(x_n, y_n, z_n);
            let x1_0 = r0.fetch(x_n, y_n, z);
            let x2_0 = r0.fetch(x_n, y, z);
            let x3_0 = r0.fetch(x, y_n, z);

            let x0_1 = r1.fetch(x_n, y_n, z_n);
            let x1_1 = r1.fetch(x_n, y_n, z);
            let x2_1 = r1.fetch(x_n, y, z);
            let x3_1 = r1.fetch(x, y_n, z);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        } else if db > dr && dg > dr {
            let w3 = AvxVectorQ0_15::from(dg) * AvxVectorQ0_15::from(db);

            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y_n, z_n);
            let x2_0 = r0.fetch(x, y_n, z_n);
            let x3_0 = r0.fetch(x, y_n, z);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y_n, z_n);
            let x2_1 = r1.fetch(x, y_n, z_n);
            let x3_1 = r1.fetch(x, y_n, z);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        } else {
            let w3 = AvxVectorQ0_15::from(db) * AvxVectorQ0_15::from(dr);

            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z);
            let x2_0 = r0.fetch(x_n, y, z_n);
            let x3_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z);
            let x2_1 = r1.fetch(x_n, y, z_n);
            let x3_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        }
    }
}
840
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxQ0_15Double<GRID_SIZE> {
    /// Tetrahedral interpolation over two LUT cubes at once.
    ///
    /// Unlike the other `*Double` interpolators, `rv` is a two-cube fetcher
    /// that already yields 256-bit [table0 | table1] samples, so the six-way
    /// tetrahedron selection and `c0 + c1*rx + c2*ry + c3*rz` evaluation run
    /// once for both tables; `split()` returns the per-table results.
    ///
    /// Safety: indices must be in bounds of `lut` and AVX2 must be available.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        rv: impl Fetcher<AvxVectorQ0_15>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        // Base corner of the cell (both tables).
        let c0 = rv.fetch(x, y, z);

        let w0 = AvxVectorQ0_15::from(rx);
        let w1 = AvxVectorQ0_15::from(ry);
        let w2 = AvxVectorQ0_15::from(rz);

        let c2;
        let c1;
        let c3;
        if rx >= ry {
            if ry >= rz {
                //rx >= ry && ry >= rz
                c1 = rv.fetch(x_n, y, z) - c0;
                c2 = rv.fetch(x_n, y_n, z) - rv.fetch(x_n, y, z);
                c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
            } else if rx >= rz {
                //rx >= rz && rz >= ry
                c1 = rv.fetch(x_n, y, z) - c0;
                c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
                c3 = rv.fetch(x_n, y, z_n) - rv.fetch(x_n, y, z);
            } else {
                //rz > rx && rx >= ry
                c1 = rv.fetch(x_n, y, z_n) - rv.fetch(x, y, z_n);
                c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
                c3 = rv.fetch(x, y, z_n) - c0;
            }
        } else if rx >= rz {
            //ry > rx && rx >= rz
            c1 = rv.fetch(x_n, y_n, z) - rv.fetch(x, y_n, z);
            c2 = rv.fetch(x, y_n, z) - c0;
            c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
        } else if ry >= rz {
            //ry >= rz && rz > rx
            c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
            c2 = rv.fetch(x, y_n, z) - c0;
            c3 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y_n, z);
        } else {
            //rz > ry && ry > rx
            c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
            c2 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y, z_n);
            c3 = rv.fetch(x, y, z_n) - c0;
        }
        // Accumulate c0 + c1*rx + c2*ry + c3*rz, then unpack per table.
        let s0 = c0.mla(c1, w0);
        let s1 = s0.mla(c2, w1);
        s1.mla(c3, w2).split()
    }
}
915
916impl<const GRID_SIZE: usize> TrilinearAvxQ0_15Double<GRID_SIZE> {
917    #[target_feature(enable = "avx2")]
918    unsafe fn interpolate(
919        &self,
920        in_r: usize,
921        in_g: usize,
922        in_b: usize,
923        lut: &[BarycentricWeight<i16>],
924        rv: impl Fetcher<AvxVectorQ0_15>,
925    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
926        let lut_r = unsafe { *lut.get_unchecked(in_r) };
927        let lut_g = unsafe { *lut.get_unchecked(in_g) };
928        let lut_b = unsafe { *lut.get_unchecked(in_b) };
929
930        let x: i32 = lut_r.x;
931        let y: i32 = lut_g.x;
932        let z: i32 = lut_b.x;
933
934        let x_n: i32 = lut_r.x_n;
935        let y_n: i32 = lut_g.x_n;
936        let z_n: i32 = lut_b.x_n;
937
938        let rx = lut_r.w;
939        let ry = lut_g.w;
940        let rz = lut_b.w;
941
942        const Q_MAX: i16 = ((1i32 << 15i32) - 1) as i16;
943
944        let q_max = AvxVectorQ0_15::from(Q_MAX);
945        let w0 = AvxVectorQ0_15::from(rx);
946        let w1 = AvxVectorQ0_15::from(ry);
947        let w2 = AvxVectorQ0_15::from(rz);
948        let dx = q_max - w0;
949        let dy = q_max - w1;
950        let dz = q_max - w2;
951
952        let c000 = rv.fetch(x, y, z);
953        let c100 = rv.fetch(x_n, y, z);
954        let c010 = rv.fetch(x, y_n, z);
955        let c110 = rv.fetch(x_n, y_n, z);
956        let c001 = rv.fetch(x, y, z_n);
957        let c101 = rv.fetch(x_n, y, z_n);
958        let c011 = rv.fetch(x, y_n, z_n);
959        let c111 = rv.fetch(x_n, y_n, z_n);
960
961        let c00 = (c000 * dx).mla(c100, w0);
962        let c10 = (c010 * dx).mla(c110, w0);
963        let c01 = (c001 * dx).mla(c101, w0);
964        let c11 = (c011 * dx).mla(c111, w0);
965
966        let c0 = (c00 * dy).mla(c10, w1);
967        let c1 = (c01 * dy).mla(c11, w1);
968
969        (c0 * dz).mla(c1, w2).split()
970    }
971}
972
973impl<const GRID_SIZE: usize> TrilinearAvxQ0_15<GRID_SIZE> {
974    #[target_feature(enable = "avx2")]
975    unsafe fn interpolate(
976        &self,
977        in_r: usize,
978        in_g: usize,
979        in_b: usize,
980        lut: &[BarycentricWeight<i16>],
981        r: impl Fetcher<AvxVectorQ0_15Sse>,
982    ) -> AvxVectorQ0_15Sse {
983        let lut_r = unsafe { *lut.get_unchecked(in_r) };
984        let lut_g = unsafe { *lut.get_unchecked(in_g) };
985        let lut_b = unsafe { *lut.get_unchecked(in_b) };
986
987        let x: i32 = lut_r.x;
988        let y: i32 = lut_g.x;
989        let z: i32 = lut_b.x;
990
991        let x_n: i32 = lut_r.x_n;
992        let y_n: i32 = lut_g.x_n;
993        let z_n: i32 = lut_b.x_n;
994
995        let dr = lut_r.w;
996        let dg = lut_g.w;
997        let db = lut_b.w;
998
999        const Q_MAX: i16 = ((1i32 << 15i32) - 1) as i16;
1000
1001        let q_max = AvxVectorQ0_15Sse::from(Q_MAX);
1002        let q_max_avx = AvxVectorQ0_15::from(Q_MAX);
1003        let w0 = AvxVectorQ0_15::from(dr);
1004        let w1 = AvxVectorQ0_15::from(dg);
1005        let w2 = AvxVectorQ0_15Sse::from(db);
1006        let dx = q_max_avx - w0;
1007        let dy = q_max_avx - w1;
1008        let dz = q_max - w2;
1009
1010        let c000 = r.fetch(x, y, z);
1011        let c100 = r.fetch(x_n, y, z);
1012        let c010 = r.fetch(x, y_n, z);
1013        let c110 = r.fetch(x_n, y_n, z);
1014        let c001 = r.fetch(x, y, z_n);
1015        let c101 = r.fetch(x_n, y, z_n);
1016        let c011 = r.fetch(x, y_n, z_n);
1017        let c111 = r.fetch(x_n, y_n, z_n);
1018
1019        let x000 = AvxVectorQ0_15::from_sse(c000, c001);
1020        let x010 = AvxVectorQ0_15::from_sse(c010, c011);
1021        let x011 = AvxVectorQ0_15::from_sse(c100, c101);
1022        let x111 = AvxVectorQ0_15::from_sse(c110, c111);
1023
1024        let c00 = (x000 * dx).mla(x011, w0);
1025        let c10 = (x010 * dx).mla(x111, w0);
1026
1027        let c0 = (c00 * dy).mla(c10, w1);
1028
1029        let (c0, c1) = c0.split();
1030
1031        (c0 * dz).mla(c1, w2)
1032    }
1033}