1#![cfg(feature = "avx_luts")]
30use crate::conversions::interpolator::BarycentricWeight;
31use crate::math::FusedMultiplyAdd;
32use std::arch::x86_64::*;
33use std::ops::{Add, Mul, Sub};
34
/// Four `i16` LUT samples padded/aligned to 8 bytes so one 64-bit load
/// (`_mm_loadu_si64`) fetches a whole entry; `repr(C)` fixes the array layout.
#[repr(align(8), C)]
pub(crate) struct AvxAlignedI16(pub(crate) [i16; 4]);
37
// Zero-sized marker types that select an interpolation strategy over a
// GRID_SIZE^3 LUT; the actual work lives in their `interpolate` impls below.
// `*Double` variants evaluate two LUTs at once (see the `…Double` trait).

#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxQ0_15<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PyramidalAvxQ0_15<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxQ0_15<const GRID_SIZE: usize> {}

pub(crate) struct TrilinearAvxQ0_15<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxQ0_15Double<const GRID_SIZE: usize> {}

pub(crate) struct TrilinearAvxQ0_15Double<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PyramidAvxFmaQ0_15Double<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxQ0_15Double<const GRID_SIZE: usize> {}
59
/// 3-D LUT interpolation over two tables simultaneously: the same
/// (r, g, b) position is evaluated against `table0` and `table1`,
/// returning one Q0.15 result vector per table.
pub(crate) trait AvxMdInterpolationQ0_15Double {
    fn inter3_sse(
        &self,
        table0: &[AvxAlignedI16],
        table1: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse);
}
71
/// 3-D LUT interpolation over a single table. `in_r`/`in_g`/`in_b` index
/// into `lut`, which maps each input to a grid cell and a Q0.15 weight.
pub(crate) trait AvxMdInterpolationQ0_15 {
    fn inter3_sse(
        &self,
        table: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> AvxVectorQ0_15Sse;
}
82
/// Abstracts loading one LUT sample at integer grid coordinates (x, y, z).
trait Fetcher<T> {
    fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}
86
/// 128-bit vector of Q0.15 fixed-point samples (eight `i16` lanes).
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorQ0_15Sse {
    pub(crate) v: __m128i,
}
92
/// 256-bit vector of Q0.15 fixed-point samples (sixteen `i16` lanes);
/// used to process two 128-bit results in one register.
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorQ0_15 {
    pub(crate) v: __m256i,
}
98
impl AvxVectorQ0_15 {
    /// Packs two 128-bit halves into one 256-bit vector
    /// (`lo` → low lane, `hi` → high lane).
    #[inline(always)]
    pub(crate) fn from_sse(lo: AvxVectorQ0_15Sse, hi: AvxVectorQ0_15Sse) -> AvxVectorQ0_15 {
        unsafe {
            AvxVectorQ0_15 {
                v: _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(lo.v), hi.v),
            }
        }
    }

    /// Splits the 256-bit vector back into (low lane, high lane);
    /// inverse of [`Self::from_sse`].
    #[inline(always)]
    pub(crate) fn split(self) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        unsafe {
            (
                AvxVectorQ0_15Sse {
                    v: _mm256_castsi256_si128(self.v),
                },
                AvxVectorQ0_15Sse {
                    v: _mm256_extracti128_si256::<1>(self.v),
                },
            )
        }
    }
}
123
impl From<i16> for AvxVectorQ0_15Sse {
    /// Broadcasts one Q0.15 value into all eight lanes.
    #[inline(always)]
    fn from(v: i16) -> Self {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_set1_epi16(v) },
        }
    }
}

impl From<i16> for AvxVectorQ0_15 {
    /// Broadcasts one Q0.15 value into all sixteen lanes.
    #[inline(always)]
    fn from(v: i16) -> Self {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_set1_epi16(v) },
        }
    }
}
141
impl Sub<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
    type Output = Self;
    /// Lane-wise `i16` subtraction (wrapping, per `_mm_sub_epi16`).
    #[inline(always)]
    fn sub(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_sub_epi16(self.v, rhs.v) },
        }
    }
}

impl Sub<AvxVectorQ0_15> for AvxVectorQ0_15 {
    type Output = Self;
    /// Lane-wise `i16` subtraction (wrapping, per `_mm256_sub_epi16`).
    #[inline(always)]
    fn sub(self, rhs: AvxVectorQ0_15) -> Self::Output {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_sub_epi16(self.v, rhs.v) },
        }
    }
}
161
impl Add<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
    type Output = Self;
    /// Lane-wise `i16` addition (wrapping, per `_mm_add_epi16`).
    #[inline(always)]
    fn add(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_add_epi16(self.v, rhs.v) },
        }
    }
}

impl Mul<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
    type Output = Self;
    /// Q0.15 fixed-point multiply: `_mm_mulhrs_epi16` computes
    /// `(a * b + 0x4000) >> 15` per lane (round-to-nearest).
    #[inline(always)]
    fn mul(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_mulhrs_epi16(self.v, rhs.v) },
        }
    }
}
181
impl Add<AvxVectorQ0_15> for AvxVectorQ0_15 {
    type Output = Self;
    /// Lane-wise `i16` addition (wrapping, per `_mm256_add_epi16`).
    #[inline(always)]
    fn add(self, rhs: AvxVectorQ0_15) -> Self::Output {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_add_epi16(self.v, rhs.v) },
        }
    }
}

impl Mul<AvxVectorQ0_15> for AvxVectorQ0_15 {
    type Output = Self;
    /// Q0.15 fixed-point multiply: `_mm256_mulhrs_epi16` computes
    /// `(a * b + 0x4000) >> 15` per lane (round-to-nearest).
    #[inline(always)]
    fn mul(self, rhs: AvxVectorQ0_15) -> Self::Output {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_mulhrs_epi16(self.v, rhs.v) },
        }
    }
}
201
impl FusedMultiplyAdd<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
    /// Multiply-accumulate: `self + b * c`, where the multiply is the
    /// rounded Q0.15 product and the add wraps (`_mm_add_epi16`).
    #[inline(always)]
    fn mla(&self, b: AvxVectorQ0_15Sse, c: AvxVectorQ0_15Sse) -> AvxVectorQ0_15Sse {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_add_epi16(_mm_mulhrs_epi16(b.v, c.v), self.v) },
        }
    }
}

impl FusedMultiplyAdd<AvxVectorQ0_15> for AvxVectorQ0_15 {
    /// Multiply-accumulate: `self + b * c`, where the multiply is the
    /// rounded Q0.15 product and the add wraps (`_mm256_add_epi16`).
    #[inline(always)]
    fn mla(&self, b: AvxVectorQ0_15, c: AvxVectorQ0_15) -> AvxVectorQ0_15 {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_add_epi16(_mm256_mulhrs_epi16(b.v, c.v), self.v) },
        }
    }
}
219
/// Fetches 4-sample entries from a single GRID_SIZE^3 cube.
struct TetrahedralAvxSseFetchVector<'a, const GRID_SIZE: usize> {
    cube: &'a [AvxAlignedI16],
}

/// Fetches matching entries from two cubes, packed into the low/high
/// 128-bit lanes of one AVX vector.
struct TetrahedralAvxFetchVector<'a, const GRID_SIZE: usize> {
    cube0: &'a [AvxAlignedI16],
    cube1: &'a [AvxAlignedI16],
}
228
impl<const GRID_SIZE: usize> Fetcher<AvxVectorQ0_15> for TetrahedralAvxFetchVector<'_, GRID_SIZE> {
    /// Loads the entry at (x, y, z) from both cubes: `cube0` fills the low
    /// 128-bit lane, `cube1` the high lane.
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorQ0_15 {
        // Row-major index into a GRID_SIZE^3 table.
        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
            + y as u32 * GRID_SIZE as u32
            + z as u32) as usize;
        // SAFETY: no bounds check here — callers must guarantee (x, y, z)
        // lie inside the grid so `offset` is in range for both cubes.
        let jx0 = unsafe { self.cube0.get_unchecked(offset..) };
        let jx1 = unsafe { self.cube1.get_unchecked(offset..) };
        AvxVectorQ0_15 {
            // Each 64-bit load picks up the four i16 samples of one entry.
            v: unsafe {
                _mm256_inserti128_si256::<1>(
                    _mm256_castsi128_si256(_mm_loadu_si64(jx0.as_ptr() as *const _)),
                    _mm_loadu_si64(jx1.as_ptr() as *const _),
                )
            },
        }
    }
}
249
impl<const GRID_SIZE: usize> Fetcher<AvxVectorQ0_15Sse>
    for TetrahedralAvxSseFetchVector<'_, GRID_SIZE>
{
    /// Loads the four i16 samples of the entry at (x, y, z) into the low
    /// 64 bits of an SSE register (upper bits zeroed by `_mm_loadu_si64`).
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorQ0_15Sse {
        // Row-major index into a GRID_SIZE^3 table.
        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
            + y as u32 * GRID_SIZE as u32
            + z as u32) as usize;
        // SAFETY: no bounds check here — callers must guarantee (x, y, z)
        // lie inside the grid so `offset` is in range for the cube.
        let jx = unsafe { self.cube.get_unchecked(offset..) };
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_loadu_si64(jx.as_ptr() as *const _) },
        }
    }
}
264
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxQ0_15<GRID_SIZE> {
    /// Tetrahedral interpolation of one sample: picks one of six tetrahedra
    /// of the enclosing grid cell from the ordering of the barycentric
    /// weights, then accumulates the corner deltas with Q0.15 arithmetic.
    ///
    /// # Safety
    /// Requires AVX2. `in_r`/`in_g`/`in_b` must index into `lut`, and the
    /// grid coordinates in `lut` must be valid for the fetcher's table.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        // SAFETY: indices assumed in range — upheld by the caller.
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower grid corner.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        // Upper grid corner.
        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Q0.15 fractional weights along each axis.
        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let c2;
        let c1;
        let c3;
        // Weight ordering selects which tetrahedron of the cube contains
        // the sample; c1/c2/c3 are the corner differences along its edges.
        if rx >= ry {
            if ry >= rz {
                c1 = r.fetch(x_n, y, z) - c0;
                c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
                c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
            } else if rx >= rz {
                c1 = r.fetch(x_n, y, z) - c0;
                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
                c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
            } else {
                c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
                c3 = r.fetch(x, y, z_n) - c0;
            }
        } else if rx >= rz {
            c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
            c2 = r.fetch(x, y_n, z) - c0;
            c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
        } else if ry >= rz {
            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
            c2 = r.fetch(x, y_n, z) - c0;
            c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
        } else {
            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
            c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
            c3 = r.fetch(x, y, z_n) - c0;
        }
        // result = c0 + c1*rx + c2*ry + c3*rz (Q0.15 rounded multiplies).
        let s0 = c0.mla(c1, AvxVectorQ0_15Sse::from(rx));
        let s1 = s0.mla(c2, AvxVectorQ0_15Sse::from(ry));
        s1.mla(c3, AvxVectorQ0_15Sse::from(rz))
    }
}
335
/// Implements `AvxMdInterpolationQ0_15` for a single-table interpolator by
/// forwarding to its inherent `interpolate` with a single-cube fetcher.
macro_rules! define_interp_avx {
    ($interpolator: ident) => {
        impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15 for $interpolator<GRID_SIZE> {
            fn inter3_sse(
                &self,
                table: &[AvxAlignedI16],
                in_r: usize,
                in_g: usize,
                in_b: usize,
                lut: &[BarycentricWeight<i16>],
            ) -> AvxVectorQ0_15Sse {
                // NOTE(review): `interpolate` is `#[target_feature(enable = "avx2")]`
                // and this safe wrapper does not check CPU support — callers are
                // presumably gated on AVX2 elsewhere; confirm at the call sites.
                unsafe {
                    self.interpolate(
                        in_r,
                        in_g,
                        in_b,
                        lut,
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table },
                    )
                }
            }
        }
    };
}
360
/// Implements `AvxMdInterpolationQ0_15Double` for a two-table interpolator
/// by forwarding to its inherent `interpolate` with one fetcher per table.
#[cfg(feature = "options")]
macro_rules! define_interp_avx_d {
    ($interpolator: ident) => {
        impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double for $interpolator<GRID_SIZE> {
            fn inter3_sse(
                &self,
                table0: &[AvxAlignedI16],
                table1: &[AvxAlignedI16],
                in_r: usize,
                in_g: usize,
                in_b: usize,
                lut: &[BarycentricWeight<i16>],
            ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
                // NOTE(review): same unchecked-AVX2 pattern as `define_interp_avx!`.
                unsafe {
                    self.interpolate(
                        in_r,
                        in_g,
                        in_b,
                        lut,
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table0 },
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table1 },
                    )
                }
            }
        }
    };
}
388
// Wire the trait impls: tetrahedral/pyramidal/prismatic variants are gated
// behind the `options` feature; trilinear is always available.
#[cfg(feature = "options")]
define_interp_avx!(TetrahedralAvxQ0_15);
#[cfg(feature = "options")]
define_interp_avx!(PyramidalAvxQ0_15);
#[cfg(feature = "options")]
define_interp_avx!(PrismaticAvxQ0_15);
define_interp_avx!(TrilinearAvxQ0_15);
#[cfg(feature = "options")]
define_interp_avx_d!(PrismaticAvxQ0_15Double);
#[cfg(feature = "options")]
define_interp_avx_d!(PyramidAvxFmaQ0_15Double);
400
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double
    for TetrahedralAvxQ0_15Double<GRID_SIZE>
{
    /// Evaluates both tables at once: a dual-cube fetcher packs matching
    /// entries from `table0`/`table1` into one AVX register so the
    /// tetrahedral arithmetic runs on both lanes simultaneously.
    fn inter3_sse(
        &self,
        table0: &[AvxAlignedI16],
        table1: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let fetcher = TetrahedralAvxFetchVector::<GRID_SIZE> {
            cube0: table0,
            cube1: table1,
        };
        // NOTE(review): `interpolate` requires AVX2; callers are presumably
        // gated on CPU support elsewhere — confirm at the call sites.
        unsafe { self.interpolate(in_r, in_g, in_b, lut, fetcher) }
    }
}
428
429impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double for TrilinearAvxQ0_15Double<GRID_SIZE> {
430 fn inter3_sse(
431 &self,
432 table0: &[AvxAlignedI16],
433 table1: &[AvxAlignedI16],
434 in_r: usize,
435 in_g: usize,
436 in_b: usize,
437 lut: &[BarycentricWeight<i16>],
438 ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
439 unsafe {
440 self.interpolate(
441 in_r,
442 in_g,
443 in_b,
444 lut,
445 TetrahedralAvxFetchVector::<GRID_SIZE> {
446 cube0: table0,
447 cube1: table1,
448 },
449 )
450 }
451 }
452}
453
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidalAvxQ0_15<GRID_SIZE> {
    /// Pyramidal interpolation of one sample: splits the grid cell into
    /// three pyramids (selected by the weight ordering) and accumulates
    /// corner differences plus one bilinear cross term in Q0.15.
    ///
    /// # Safety
    /// Requires AVX2. `in_r`/`in_g`/`in_b` must index into `lut`, and the
    /// grid coordinates in `lut` must be valid for the fetcher's table.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        // SAFETY: indices assumed in range — upheld by the caller.
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower grid corner.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        // Upper grid corner.
        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Q0.15 fractional weights along each axis.
        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let w0 = AvxVectorQ0_15Sse::from(db);
        let w1 = AvxVectorQ0_15Sse::from(dr);
        let w2 = AvxVectorQ0_15Sse::from(dg);

        // Pick the pyramid by which axis has the smallest weight; w3 is the
        // product weight of the quadratic cross term.
        if dr > db && dg > db {
            let w3 = AvxVectorQ0_15Sse::from(dr) * AvxVectorQ0_15Sse::from(dg);
            let x0 = r.fetch(x_n, y_n, z_n);
            let x1 = r.fetch(x_n, y_n, z);
            let x2 = r.fetch(x_n, y, z);
            let x3 = r.fetch(x, y_n, z);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            // c0 + c1*db + c2*dr + c3*dg + c4*dr*dg
            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        } else if db > dr && dg > dr {
            let w3 = AvxVectorQ0_15Sse::from(dg) * AvxVectorQ0_15Sse::from(db);

            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y_n, z_n);
            let x2 = r.fetch(x, y_n, z_n);
            let x3 = r.fetch(x, y_n, z);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        } else {
            let w3 = AvxVectorQ0_15Sse::from(db) * AvxVectorQ0_15Sse::from(dr);

            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y, z);
            let x2 = r.fetch(x_n, y, z_n);
            let x3 = r.fetch(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        }
    }
}
540
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxQ0_15<GRID_SIZE> {
    /// Prismatic interpolation of one sample: the cell is split into two
    /// triangular prisms by comparing `db` and `dr`; the result accumulates
    /// three linear terms and two bilinear cross terms in Q0.15.
    ///
    /// # Safety
    /// Requires AVX2. `in_r`/`in_g`/`in_b` must index into `lut`, and the
    /// grid coordinates in `lut` must be valid for the fetcher's table.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        // SAFETY: indices assumed in range — upheld by the caller.
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower grid corner.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        // Upper grid corner.
        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Q0.15 fractional weights along each axis.
        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let w0 = AvxVectorQ0_15Sse::from(db);
        let w1 = AvxVectorQ0_15Sse::from(dr);
        let w2 = AvxVectorQ0_15Sse::from(dg);
        let w3 = AvxVectorQ0_15Sse::from(dg) * AvxVectorQ0_15Sse::from(db);
        let w4 = AvxVectorQ0_15Sse::from(dr) * AvxVectorQ0_15Sse::from(dg);

        // db/dr comparison selects which prism the sample falls in.
        if db > dr {
            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y, z_n);
            let x2 = r.fetch(x, y_n, z);
            let x3 = r.fetch(x, y_n, z_n);
            let x4 = r.fetch(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            // c0 + c1*db + c2*dr + c3*dg + c4*dg*db + c5*dr*dg
            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4)
        } else {
            let x0 = r.fetch(x_n, y, z);
            let x1 = r.fetch(x_n, y, z_n);
            let x2 = r.fetch(x, y_n, z);
            let x3 = r.fetch(x_n, y_n, z);
            let x4 = r.fetch(x_n, y_n, z_n);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4)
        }
    }
}
615
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxQ0_15Double<GRID_SIZE> {
    /// Prismatic interpolation evaluated against two LUTs at once: samples
    /// from `r0`/`r1` are packed into the low/high 128-bit lanes of one AVX
    /// register so both interpolations share the Q0.15 arithmetic, then the
    /// lanes are split back into one SSE result per table.
    ///
    /// # Safety
    /// Requires AVX2. `in_r`/`in_g`/`in_b` must index into `lut`, and the
    /// grid coordinates in `lut` must be valid for both fetchers' tables.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r0: impl Fetcher<AvxVectorQ0_15Sse>,
        r1: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        // SAFETY: indices assumed in range — upheld by the caller.
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower grid corner.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        // Upper grid corner.
        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Q0.15 fractional weights along each axis.
        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0_0 = r0.fetch(x, y, z);
        // BUG FIX: the base corner for the second table must come from `r1`;
        // this previously re-fetched from `r0`, corrupting the second result.
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVectorQ0_15::from(db);
        let w1 = AvxVectorQ0_15::from(dr);
        let w2 = AvxVectorQ0_15::from(dg);
        let w3 = AvxVectorQ0_15::from(dg) * AvxVectorQ0_15::from(db);
        let w4 = AvxVectorQ0_15::from(dr) * AvxVectorQ0_15::from(dg);

        let c0 = AvxVectorQ0_15::from_sse(c0_0, c0_1);

        // db/dr comparison selects which prism the sample falls in; the
        // corner pattern matches the single-table PrismaticAvxQ0_15.
        if db > dr {
            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x, y_n, z_n);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x, y_n, z_n);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);
            let x4 = AvxVectorQ0_15::from_sse(x4_0, x4_1);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            // c0 + c1*db + c2*dr + c3*dg + c4*dg*db + c5*dr*dg
            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        } else {
            let x0_0 = r0.fetch(x_n, y, z);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x_n, y_n, z);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x_n, y, z);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x_n, y_n, z);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);
            let x4 = AvxVectorQ0_15::from_sse(x4_0, x4_1);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        }
    }
}
718
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidAvxFmaQ0_15Double<GRID_SIZE> {
    /// Pyramidal interpolation evaluated against two LUTs at once: samples
    /// from `r0`/`r1` are packed into the low/high 128-bit lanes of one AVX
    /// register so both interpolations share the Q0.15 arithmetic, then the
    /// lanes are split back into one SSE result per table.
    ///
    /// # Safety
    /// Requires AVX2. `in_r`/`in_g`/`in_b` must index into `lut`, and the
    /// grid coordinates in `lut` must be valid for both fetchers' tables.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r0: impl Fetcher<AvxVectorQ0_15Sse>,
        r1: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        // SAFETY: indices assumed in range — upheld by the caller.
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower grid corner.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        // Upper grid corner.
        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Q0.15 fractional weights along each axis.
        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0_0 = r0.fetch(x, y, z);
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVectorQ0_15::from(db);
        let w1 = AvxVectorQ0_15::from(dr);
        let w2 = AvxVectorQ0_15::from(dg);

        let c0 = AvxVectorQ0_15::from_sse(c0_0, c0_1);

        // Pick the pyramid by which axis has the smallest weight; the corner
        // pattern matches the single-table PyramidalAvxQ0_15.
        if dr > db && dg > db {
            let w3 = AvxVectorQ0_15::from(dr) * AvxVectorQ0_15::from(dg);

            let x0_0 = r0.fetch(x_n, y_n, z_n);
            let x1_0 = r0.fetch(x_n, y_n, z);
            let x2_0 = r0.fetch(x_n, y, z);
            let x3_0 = r0.fetch(x, y_n, z);

            let x0_1 = r1.fetch(x_n, y_n, z_n);
            let x1_1 = r1.fetch(x_n, y_n, z);
            let x2_1 = r1.fetch(x_n, y, z);
            let x3_1 = r1.fetch(x, y_n, z);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            // c0 + c1*db + c2*dr + c3*dg + c4*dr*dg
            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        } else if db > dr && dg > dr {
            let w3 = AvxVectorQ0_15::from(dg) * AvxVectorQ0_15::from(db);

            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y_n, z_n);
            let x2_0 = r0.fetch(x, y_n, z_n);
            let x3_0 = r0.fetch(x, y_n, z);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y_n, z_n);
            let x2_1 = r1.fetch(x, y_n, z_n);
            let x3_1 = r1.fetch(x, y_n, z);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        } else {
            let w3 = AvxVectorQ0_15::from(db) * AvxVectorQ0_15::from(dr);

            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z);
            let x2_0 = r0.fetch(x_n, y, z_n);
            let x3_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z);
            let x2_1 = r1.fetch(x_n, y, z_n);
            let x3_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        }
    }
}
840
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxQ0_15Double<GRID_SIZE> {
    /// Tetrahedral interpolation evaluated against two LUTs at once: `rv`
    /// returns both tables' samples packed low/high in one AVX register, so
    /// the corner-delta arithmetic runs on both lanes and the result is
    /// split back into one SSE vector per table.
    ///
    /// # Safety
    /// Requires AVX2. `in_r`/`in_g`/`in_b` must index into `lut`, and the
    /// grid coordinates in `lut` must be valid for the fetcher's tables.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        rv: impl Fetcher<AvxVectorQ0_15>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        // SAFETY: indices assumed in range — upheld by the caller.
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower grid corner.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        // Upper grid corner.
        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Q0.15 fractional weights along each axis.
        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let c0 = rv.fetch(x, y, z);

        let w0 = AvxVectorQ0_15::from(rx);
        let w1 = AvxVectorQ0_15::from(ry);
        let w2 = AvxVectorQ0_15::from(rz);

        let c2;
        let c1;
        let c3;
        // Weight ordering selects the tetrahedron; same branch structure as
        // the single-table TetrahedralAvxQ0_15.
        if rx >= ry {
            if ry >= rz {
                c1 = rv.fetch(x_n, y, z) - c0;
                c2 = rv.fetch(x_n, y_n, z) - rv.fetch(x_n, y, z);
                c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
            } else if rx >= rz {
                c1 = rv.fetch(x_n, y, z) - c0;
                c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
                c3 = rv.fetch(x_n, y, z_n) - rv.fetch(x_n, y, z);
            } else {
                c1 = rv.fetch(x_n, y, z_n) - rv.fetch(x, y, z_n);
                c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
                c3 = rv.fetch(x, y, z_n) - c0;
            }
        } else if rx >= rz {
            c1 = rv.fetch(x_n, y_n, z) - rv.fetch(x, y_n, z);
            c2 = rv.fetch(x, y_n, z) - c0;
            c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
        } else if ry >= rz {
            c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
            c2 = rv.fetch(x, y_n, z) - c0;
            c3 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y_n, z);
        } else {
            c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
            c2 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y, z_n);
            c3 = rv.fetch(x, y, z_n) - c0;
        }
        // result = c0 + c1*rx + c2*ry + c3*rz (Q0.15 rounded multiplies).
        let s0 = c0.mla(c1, w0);
        let s1 = s0.mla(c2, w1);
        s1.mla(c3, w2).split()
    }
}
915
impl<const GRID_SIZE: usize> TrilinearAvxQ0_15Double<GRID_SIZE> {
    /// Trilinear interpolation evaluated against two LUTs at once: all
    /// eight cell corners are fetched (both tables packed per register) and
    /// reduced along x, then y, then z with Q0.15 lerps, where
    /// `lerp(a, b, w) ≈ a*(Q_MAX - w) + b*w` and Q_MAX = 0x7FFF ≈ 1.0.
    ///
    /// # Safety
    /// Requires AVX2. `in_r`/`in_g`/`in_b` must index into `lut`, and the
    /// grid coordinates in `lut` must be valid for the fetcher's tables.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        rv: impl Fetcher<AvxVectorQ0_15>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        // SAFETY: indices assumed in range — upheld by the caller.
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower grid corner.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        // Upper grid corner.
        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Q0.15 fractional weights along each axis.
        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        // Largest Q0.15 value, standing in for 1.0.
        const Q_MAX: i16 = ((1i32 << 15i32) - 1) as i16;

        let q_max = AvxVectorQ0_15::from(Q_MAX);
        let w0 = AvxVectorQ0_15::from(rx);
        let w1 = AvxVectorQ0_15::from(ry);
        let w2 = AvxVectorQ0_15::from(rz);
        // Complementary weights (Q_MAX - w) for the lerp's "1 - w" side.
        let dx = q_max - w0;
        let dy = q_max - w1;
        let dz = q_max - w2;

        // Corners named c<xyz> by which axes sit at the upper bound.
        let c000 = rv.fetch(x, y, z);
        let c100 = rv.fetch(x_n, y, z);
        let c010 = rv.fetch(x, y_n, z);
        let c110 = rv.fetch(x_n, y_n, z);
        let c001 = rv.fetch(x, y, z_n);
        let c101 = rv.fetch(x_n, y, z_n);
        let c011 = rv.fetch(x, y_n, z_n);
        let c111 = rv.fetch(x_n, y_n, z_n);

        // Reduce along x…
        let c00 = (c000 * dx).mla(c100, w0);
        let c10 = (c010 * dx).mla(c110, w0);
        let c01 = (c001 * dx).mla(c101, w0);
        let c11 = (c011 * dx).mla(c111, w0);

        // …then y…
        let c0 = (c00 * dy).mla(c10, w1);
        let c1 = (c01 * dy).mla(c11, w1);

        // …then z, and unpack the per-table results.
        (c0 * dz).mla(c1, w2).split()
    }
}
972
impl<const GRID_SIZE: usize> TrilinearAvxQ0_15<GRID_SIZE> {
    /// Trilinear interpolation of one sample from a single LUT. To halve
    /// the arithmetic, z-adjacent corner pairs are packed into the two
    /// 128-bit lanes of an AVX register, reduced along x and y, then the
    /// lanes are split and reduced along z.
    ///
    /// # Safety
    /// Requires AVX2. `in_r`/`in_g`/`in_b` must index into `lut`, and the
    /// grid coordinates in `lut` must be valid for the fetcher's table.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        // SAFETY: indices assumed in range — upheld by the caller.
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower grid corner.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        // Upper grid corner.
        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Q0.15 fractional weights along each axis.
        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        // Largest Q0.15 value, standing in for 1.0.
        const Q_MAX: i16 = ((1i32 << 15i32) - 1) as i16;

        let q_max = AvxVectorQ0_15Sse::from(Q_MAX);
        let q_max_avx = AvxVectorQ0_15::from(Q_MAX);
        let w0 = AvxVectorQ0_15::from(dr);
        let w1 = AvxVectorQ0_15::from(dg);
        let w2 = AvxVectorQ0_15Sse::from(db);
        // Complementary weights (Q_MAX - w) for the lerp's "1 - w" side.
        let dx = q_max_avx - w0;
        let dy = q_max_avx - w1;
        let dz = q_max - w2;

        let c000 = r.fetch(x, y, z);
        let c100 = r.fetch(x_n, y, z);
        let c010 = r.fetch(x, y_n, z);
        let c110 = r.fetch(x_n, y_n, z);
        let c001 = r.fetch(x, y, z_n);
        let c101 = r.fetch(x_n, y, z_n);
        let c011 = r.fetch(x, y_n, z_n);
        let c111 = r.fetch(x_n, y_n, z_n);

        // Pack z=low-lane / z_n=high-lane pairs.
        // NOTE(review): despite their names, x011 holds (c100, c101) and
        // x111 holds (c110, c111); the arithmetic below is nevertheless a
        // correct trilinear reduction (x, then y, then z).
        let x000 = AvxVectorQ0_15::from_sse(c000, c001);
        let x010 = AvxVectorQ0_15::from_sse(c010, c011);
        let x011 = AvxVectorQ0_15::from_sse(c100, c101);
        let x111 = AvxVectorQ0_15::from_sse(c110, c111);

        // Reduce along x: lanes now hold (c00, c01) and (c10, c11).
        let c00 = (x000 * dx).mla(x011, w0);
        let c10 = (x010 * dx).mla(x111, w0);

        // Reduce along y: lanes hold (c0, c1).
        let c0 = (c00 * dy).mla(c10, w1);

        let (c0, c1) = c0.split();

        // Final reduction along z.
        (c0 * dz).mla(c1, w2)
    }
}