Skip to main content

fearless_simd/generated/
avx2.rs

1// Copyright 2025 the Fearless_SIMD Authors
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4// This file is autogenerated by fearless_simd_gen
5
6use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
7use crate::{
8    f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
9    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
10    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
11    u32x4, u32x8, u32x16,
12};
13#[cfg(target_arch = "x86")]
14use core::arch::x86::*;
15#[cfg(target_arch = "x86_64")]
16use core::arch::x86_64::*;
17#[doc = "The SIMD token for the x86-64-v3 level."]
18#[derive(Clone, Copy, Debug)]
19pub struct Avx2 {
20    pub avx2: crate::core_arch::x86::Avx2,
21}
22impl Avx2 {
23    #[doc = r" Create a SIMD token."]
24    #[doc = r""]
25    #[doc = r" # Safety"]
26    #[doc = r""]
27    #[doc = r" The `avx2`, `bmi1`, `bmi2`, `cmpxchg16b`, `f16c`, `fma`,"]
28    #[doc = r" `lzcnt`, `movbe`, `popcnt`, and `xsave` CPU features must"]
29    #[doc = r" be available."]
30    #[inline]
31    pub const unsafe fn new_unchecked() -> Self {
32        Self {
33            avx2: unsafe { crate::core_arch::x86::Avx2::new_unchecked() },
34        }
35    }
36}
// Marker impl: `Avx2` opts into the crate's `Seal` trait (no methods to provide).
impl Seal for Avx2 {}
38impl ArchTypes for Avx2 {
39    type f32x4 = crate::support::Aligned128<__m128>;
40    type i8x16 = crate::support::Aligned128<__m128i>;
41    type u8x16 = crate::support::Aligned128<__m128i>;
42    type mask8x16 = crate::support::Aligned128<__m128i>;
43    type i16x8 = crate::support::Aligned128<__m128i>;
44    type u16x8 = crate::support::Aligned128<__m128i>;
45    type mask16x8 = crate::support::Aligned128<__m128i>;
46    type i32x4 = crate::support::Aligned128<__m128i>;
47    type u32x4 = crate::support::Aligned128<__m128i>;
48    type mask32x4 = crate::support::Aligned128<__m128i>;
49    type f64x2 = crate::support::Aligned128<__m128d>;
50    type mask64x2 = crate::support::Aligned128<__m128i>;
51    type f32x8 = crate::support::Aligned256<__m256>;
52    type i8x32 = crate::support::Aligned256<__m256i>;
53    type u8x32 = crate::support::Aligned256<__m256i>;
54    type mask8x32 = crate::support::Aligned256<__m256i>;
55    type i16x16 = crate::support::Aligned256<__m256i>;
56    type u16x16 = crate::support::Aligned256<__m256i>;
57    type mask16x16 = crate::support::Aligned256<__m256i>;
58    type i32x8 = crate::support::Aligned256<__m256i>;
59    type u32x8 = crate::support::Aligned256<__m256i>;
60    type mask32x8 = crate::support::Aligned256<__m256i>;
61    type f64x4 = crate::support::Aligned256<__m256d>;
62    type mask64x4 = crate::support::Aligned256<__m256i>;
63    type f32x16 = crate::support::Aligned512<[__m256; 2usize]>;
64    type i8x64 = crate::support::Aligned512<[__m256i; 2usize]>;
65    type u8x64 = crate::support::Aligned512<[__m256i; 2usize]>;
66    type mask8x64 = crate::support::Aligned512<[__m256i; 2usize]>;
67    type i16x32 = crate::support::Aligned512<[__m256i; 2usize]>;
68    type u16x32 = crate::support::Aligned512<[__m256i; 2usize]>;
69    type mask16x32 = crate::support::Aligned512<[__m256i; 2usize]>;
70    type i32x16 = crate::support::Aligned512<[__m256i; 2usize]>;
71    type u32x16 = crate::support::Aligned512<[__m256i; 2usize]>;
72    type mask32x16 = crate::support::Aligned512<[__m256i; 2usize]>;
73    type f64x8 = crate::support::Aligned512<[__m256d; 2usize]>;
74    type mask64x8 = crate::support::Aligned512<[__m256i; 2usize]>;
75}
76impl Simd for Avx2 {
77    type f32s = f32x8<Self>;
78    type f64s = f64x4<Self>;
79    type u8s = u8x32<Self>;
80    type i8s = i8x32<Self>;
81    type u16s = u16x16<Self>;
82    type i16s = i16x16<Self>;
83    type u32s = u32x8<Self>;
84    type i32s = i32x8<Self>;
85    type mask8s = mask8x32<Self>;
86    type mask16s = mask16x16<Self>;
87    type mask32s = mask32x8<Self>;
88    type mask64s = mask64x4<Self>;
89    #[inline(always)]
90    fn level(self) -> Level {
91        Level::Avx2(self)
92    }
93    #[inline]
94    fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R {
95        #[target_feature(enable = "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave")]
96        unsafe fn vectorize_avx2<F: FnOnce() -> R, R>(f: F) -> R {
97            f()
98        }
99        unsafe { vectorize_avx2(f) }
100    }
101    #[inline(always)]
102    fn splat_f32x4(self, val: f32) -> f32x4<Self> {
103        unsafe { _mm_set1_ps(val).simd_into(self) }
104    }
105    #[inline(always)]
106    fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
107        f32x4 {
108            val: unsafe { core::mem::transmute_copy(&val) },
109            simd: self,
110        }
111    }
112    #[inline(always)]
113    fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
114        f32x4 {
115            val: unsafe { core::mem::transmute_copy(val) },
116            simd: self,
117        }
118    }
119    #[inline(always)]
120    fn as_array_f32x4(self, a: f32x4<Self>) -> [f32; 4usize] {
121        unsafe { core::mem::transmute::<__m128, [f32; 4usize]>(a.val.0) }
122    }
123    #[inline(always)]
124    fn as_array_ref_f32x4(self, a: &f32x4<Self>) -> &[f32; 4usize] {
125        unsafe { core::mem::transmute::<&__m128, &[f32; 4usize]>(&a.val.0) }
126    }
127    #[inline(always)]
128    fn as_array_mut_f32x4(self, a: &mut f32x4<Self>) -> &mut [f32; 4usize] {
129        unsafe { core::mem::transmute::<&mut __m128, &mut [f32; 4usize]>(&mut a.val.0) }
130    }
131    #[inline(always)]
132    fn store_array_f32x4(self, a: f32x4<Self>, dest: &mut [f32; 4usize]) -> () {
133        unsafe {
134            core::ptr::copy_nonoverlapping(
135                (&raw const a.val.0) as *const f32,
136                dest.as_mut_ptr(),
137                4usize,
138            );
139        }
140    }
141    #[inline(always)]
142    fn cvt_from_bytes_f32x4(self, a: u8x16<Self>) -> f32x4<Self> {
143        unsafe {
144            f32x4 {
145                val: core::mem::transmute(a.val),
146                simd: self,
147            }
148        }
149    }
150    #[inline(always)]
151    fn cvt_to_bytes_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
152        unsafe {
153            u8x16 {
154                val: core::mem::transmute(a.val),
155                simd: self,
156            }
157        }
158    }
159    #[inline(always)]
160    fn slide_f32x4<const SHIFT: usize>(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
161        unsafe {
162            if SHIFT >= 4usize {
163                return b;
164            }
165            let result = dyn_alignr_128(
166                self.cvt_to_bytes_f32x4(b).val.0,
167                self.cvt_to_bytes_f32x4(a).val.0,
168                SHIFT * 4usize,
169            );
170            self.cvt_from_bytes_f32x4(u8x16 {
171                val: crate::support::Aligned128(result),
172                simd: self,
173            })
174        }
175    }
176    #[inline(always)]
177    fn slide_within_blocks_f32x4<const SHIFT: usize>(
178        self,
179        a: f32x4<Self>,
180        b: f32x4<Self>,
181    ) -> f32x4<Self> {
182        self.slide_f32x4::<SHIFT>(a, b)
183    }
184    #[inline(always)]
185    fn abs_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
186        unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) }
187    }
188    #[inline(always)]
189    fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
190        unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) }
191    }
192    #[inline(always)]
193    fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
194        unsafe { _mm_sqrt_ps(a.into()).simd_into(self) }
195    }
196    #[inline(always)]
197    fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
198        unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) }
199    }
200    #[inline(always)]
201    fn sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
202        unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) }
203    }
204    #[inline(always)]
205    fn mul_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
206        unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) }
207    }
208    #[inline(always)]
209    fn div_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
210        unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) }
211    }
212    #[inline(always)]
213    fn copysign_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
214        unsafe {
215            let mask = _mm_set1_ps(-0.0);
216            _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self)
217        }
218    }
219    #[inline(always)]
220    fn simd_eq_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
221        unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) }
222    }
223    #[inline(always)]
224    fn simd_lt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
225        unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) }
226    }
227    #[inline(always)]
228    fn simd_le_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
229        unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) }
230    }
231    #[inline(always)]
232    fn simd_ge_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
233        unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) }
234    }
235    #[inline(always)]
236    fn simd_gt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
237        unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) }
238    }
239    #[inline(always)]
240    fn zip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
241        unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) }
242    }
243    #[inline(always)]
244    fn zip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
245        unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) }
246    }
247    #[inline(always)]
248    fn unzip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
249        unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) }
250    }
251    #[inline(always)]
252    fn unzip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
253        unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) }
254    }
255    #[inline(always)]
256    fn interleave_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> (f32x4<Self>, f32x4<Self>) {
257        (self.zip_low_f32x4(a, b), self.zip_high_f32x4(a, b))
258    }
259    #[inline(always)]
260    fn deinterleave_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> (f32x4<Self>, f32x4<Self>) {
261        (self.unzip_low_f32x4(a, b), self.unzip_high_f32x4(a, b))
262    }
263    #[inline(always)]
264    fn max_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
265        unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
266    }
267    #[inline(always)]
268    fn min_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
269        unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
270    }
271    #[inline(always)]
272    fn max_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
273        unsafe {
274            let intermediate = _mm_max_ps(a.into(), b.into());
275            let b_is_nan = _mm_cmpunord_ps(b.into(), b.into());
276            _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self)
277        }
278    }
279    #[inline(always)]
280    fn min_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
281        unsafe {
282            let intermediate = _mm_min_ps(a.into(), b.into());
283            let b_is_nan = _mm_cmpunord_ps(b.into(), b.into());
284            _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self)
285        }
286    }
287    #[inline(always)]
288    fn mul_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
289        unsafe { _mm_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) }
290    }
291    #[inline(always)]
292    fn mul_sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
293        unsafe { _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) }
294    }
295    #[inline(always)]
296    fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
297        unsafe {
298            _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
299        }
300    }
301    #[inline(always)]
302    fn ceil_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
303        unsafe {
304            _mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
305        }
306    }
307    #[inline(always)]
308    fn round_ties_even_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
309        unsafe {
310            _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
311                .simd_into(self)
312        }
313    }
314    #[inline(always)]
315    fn fract_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
316        a - self.trunc_f32x4(a)
317    }
318    #[inline(always)]
319    fn trunc_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
320        unsafe {
321            _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
322        }
323    }
324    #[inline(always)]
325    fn select_f32x4(self, a: mask32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
326        unsafe { _mm_blendv_ps(c.into(), b.into(), _mm_castsi128_ps(a.into())).simd_into(self) }
327    }
328    #[inline(always)]
329    fn combine_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x8<Self> {
330        unsafe { _mm256_setr_m128(a.into(), b.into()).simd_into(self) }
331    }
332    #[inline(always)]
333    fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> {
334        unsafe { _mm_castps_pd(a.into()).simd_into(self) }
335    }
336    #[inline(always)]
337    fn reinterpret_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
338        unsafe { _mm_castps_si128(a.into()).simd_into(self) }
339    }
340    #[inline(always)]
341    fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
342        unsafe { _mm_castps_si128(a.into()).simd_into(self) }
343    }
344    #[inline(always)]
345    fn reinterpret_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
346        unsafe { _mm_castps_si128(a.into()).simd_into(self) }
347    }
348    #[inline(always)]
349    fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
350        unsafe {
351            let mut converted = _mm_cvttps_epi32(a.into());
352            let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0));
353            let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
354            if !all_in_range {
355                let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0));
356                let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess));
357                converted = _mm_add_epi32(converted, excess_converted);
358            }
359            converted.simd_into(self)
360        }
361    }
362    #[inline(always)]
363    fn cvt_u32_precise_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
364        unsafe {
365            let a = _mm_max_ps(a.into(), _mm_setzero_ps());
366            let mut converted = _mm_cvttps_epi32(a);
367            let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0));
368            let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
369            if !all_in_range {
370                let exceeds_unsigned_range =
371                    _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a));
372                let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0));
373                let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess));
374                converted = _mm_add_epi32(converted, excess_converted);
375                converted = _mm_blendv_epi8(
376                    converted,
377                    _mm_set1_epi32(u32::MAX.cast_signed()),
378                    exceeds_unsigned_range,
379                );
380            }
381            converted.simd_into(self)
382        }
383    }
384    #[inline(always)]
385    fn cvt_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
386        unsafe { _mm_cvttps_epi32(a.into()).simd_into(self) }
387    }
388    #[inline(always)]
389    fn cvt_i32_precise_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
390        unsafe {
391            let a = a.into();
392            let mut converted = _mm_cvttps_epi32(a);
393            let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0));
394            let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
395            if !all_in_range {
396                converted = _mm_blendv_epi8(
397                    _mm_set1_epi32(i32::MAX),
398                    converted,
399                    _mm_castps_si128(in_range),
400                );
401                let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a));
402                converted = _mm_and_si128(converted, is_not_nan);
403            }
404            converted.simd_into(self)
405        }
406    }
407    #[inline(always)]
408    fn splat_i8x16(self, val: i8) -> i8x16<Self> {
409        unsafe { _mm_set1_epi8(val).simd_into(self) }
410    }
411    #[inline(always)]
412    fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
413        i8x16 {
414            val: unsafe { core::mem::transmute_copy(&val) },
415            simd: self,
416        }
417    }
418    #[inline(always)]
419    fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
420        i8x16 {
421            val: unsafe { core::mem::transmute_copy(val) },
422            simd: self,
423        }
424    }
425    #[inline(always)]
426    fn as_array_i8x16(self, a: i8x16<Self>) -> [i8; 16usize] {
427        unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) }
428    }
429    #[inline(always)]
430    fn as_array_ref_i8x16(self, a: &i8x16<Self>) -> &[i8; 16usize] {
431        unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) }
432    }
433    #[inline(always)]
434    fn as_array_mut_i8x16(self, a: &mut i8x16<Self>) -> &mut [i8; 16usize] {
435        unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) }
436    }
437    #[inline(always)]
438    fn store_array_i8x16(self, a: i8x16<Self>, dest: &mut [i8; 16usize]) -> () {
439        unsafe {
440            core::ptr::copy_nonoverlapping(
441                (&raw const a.val.0) as *const i8,
442                dest.as_mut_ptr(),
443                16usize,
444            );
445        }
446    }
447    #[inline(always)]
448    fn cvt_from_bytes_i8x16(self, a: u8x16<Self>) -> i8x16<Self> {
449        unsafe {
450            i8x16 {
451                val: core::mem::transmute(a.val),
452                simd: self,
453            }
454        }
455    }
456    #[inline(always)]
457    fn cvt_to_bytes_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
458        unsafe {
459            u8x16 {
460                val: core::mem::transmute(a.val),
461                simd: self,
462            }
463        }
464    }
465    #[inline(always)]
466    fn slide_i8x16<const SHIFT: usize>(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
467        unsafe {
468            if SHIFT >= 16usize {
469                return b;
470            }
471            let result = dyn_alignr_128(
472                self.cvt_to_bytes_i8x16(b).val.0,
473                self.cvt_to_bytes_i8x16(a).val.0,
474                SHIFT,
475            );
476            self.cvt_from_bytes_i8x16(u8x16 {
477                val: crate::support::Aligned128(result),
478                simd: self,
479            })
480        }
481    }
482    #[inline(always)]
483    fn slide_within_blocks_i8x16<const SHIFT: usize>(
484        self,
485        a: i8x16<Self>,
486        b: i8x16<Self>,
487    ) -> i8x16<Self> {
488        self.slide_i8x16::<SHIFT>(a, b)
489    }
490    #[inline(always)]
491    fn add_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
492        unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
493    }
494    #[inline(always)]
495    fn sub_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
496        unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
497    }
498    #[inline(always)]
499    fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
500        unsafe {
501            let dst_even = _mm_mullo_epi16(a.into(), b.into());
502            let dst_odd =
503                _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
504            _mm_or_si128(
505                _mm_slli_epi16(dst_odd, 8),
506                _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
507            )
508            .simd_into(self)
509        }
510    }
511    #[inline(always)]
512    fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
513        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
514    }
515    #[inline(always)]
516    fn or_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
517        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
518    }
519    #[inline(always)]
520    fn xor_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
521        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
522    }
523    #[inline(always)]
524    fn not_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
525        a ^ !0
526    }
527    #[inline(always)]
528    fn shl_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
529        unsafe {
530            let val = a.into();
531            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
532            let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
533            let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
534            let lo_shifted = _mm_sll_epi16(lo_16, shift_count);
535            let hi_shifted = _mm_sll_epi16(hi_16, shift_count);
536            _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
537        }
538    }
539    #[inline(always)]
540    fn shlv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
541        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
542    }
543    #[inline(always)]
544    fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
545        unsafe {
546            let val = a.into();
547            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
548            let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
549            let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
550            let lo_shifted = _mm_sra_epi16(lo_16, shift_count);
551            let hi_shifted = _mm_sra_epi16(hi_16, shift_count);
552            _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
553        }
554    }
555    #[inline(always)]
556    fn shrv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
557        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
558    }
559    #[inline(always)]
560    fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
561        unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
562    }
563    #[inline(always)]
564    fn simd_lt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
565        unsafe { _mm_cmpgt_epi8(b.into(), a.into()).simd_into(self) }
566    }
567    #[inline(always)]
568    fn simd_le_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
569        unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) }
570    }
571    #[inline(always)]
572    fn simd_ge_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
573        unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) }
574    }
575    #[inline(always)]
576    fn simd_gt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
577        unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) }
578    }
579    #[inline(always)]
580    fn zip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
581        unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
582    }
583    #[inline(always)]
584    fn zip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
585        unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
586    }
587    #[inline(always)]
588    fn unzip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
589        unsafe {
590            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
591            let t1 = _mm_shuffle_epi8(a.into(), mask);
592            let t2 = _mm_shuffle_epi8(b.into(), mask);
593            _mm_unpacklo_epi64(t1, t2).simd_into(self)
594        }
595    }
596    #[inline(always)]
597    fn unzip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
598        unsafe {
599            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
600            let t1 = _mm_shuffle_epi8(a.into(), mask);
601            let t2 = _mm_shuffle_epi8(b.into(), mask);
602            _mm_unpackhi_epi64(t1, t2).simd_into(self)
603        }
604    }
605    #[inline(always)]
606    fn interleave_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> (i8x16<Self>, i8x16<Self>) {
607        (self.zip_low_i8x16(a, b), self.zip_high_i8x16(a, b))
608    }
609    #[inline(always)]
610    fn deinterleave_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> (i8x16<Self>, i8x16<Self>) {
611        (self.unzip_low_i8x16(a, b), self.unzip_high_i8x16(a, b))
612    }
613    #[inline(always)]
614    fn select_i8x16(self, a: mask8x16<Self>, b: i8x16<Self>, c: i8x16<Self>) -> i8x16<Self> {
615        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
616    }
617    #[inline(always)]
618    fn min_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
619        unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) }
620    }
621    #[inline(always)]
622    fn max_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
623        unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) }
624    }
625    #[inline(always)]
626    fn combine_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x32<Self> {
627        unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
628    }
629    #[inline(always)]
630    fn neg_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
631        unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) }
632    }
633    #[inline(always)]
634    fn reinterpret_u8_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
635        __m128i::from(a).simd_into(self)
636    }
637    #[inline(always)]
638    fn reinterpret_u32_i8x16(self, a: i8x16<Self>) -> u32x4<Self> {
639        __m128i::from(a).simd_into(self)
640    }
641    #[inline(always)]
642    fn splat_u8x16(self, val: u8) -> u8x16<Self> {
643        unsafe { _mm_set1_epi8(val.cast_signed()).simd_into(self) }
644    }
645    #[inline(always)]
646    fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
647        u8x16 {
648            val: unsafe { core::mem::transmute_copy(&val) },
649            simd: self,
650        }
651    }
652    #[inline(always)]
653    fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
654        u8x16 {
655            val: unsafe { core::mem::transmute_copy(val) },
656            simd: self,
657        }
658    }
659    #[inline(always)]
660    fn as_array_u8x16(self, a: u8x16<Self>) -> [u8; 16usize] {
661        unsafe { core::mem::transmute::<__m128i, [u8; 16usize]>(a.val.0) }
662    }
663    #[inline(always)]
664    fn as_array_ref_u8x16(self, a: &u8x16<Self>) -> &[u8; 16usize] {
665        unsafe { core::mem::transmute::<&__m128i, &[u8; 16usize]>(&a.val.0) }
666    }
667    #[inline(always)]
668    fn as_array_mut_u8x16(self, a: &mut u8x16<Self>) -> &mut [u8; 16usize] {
669        unsafe { core::mem::transmute::<&mut __m128i, &mut [u8; 16usize]>(&mut a.val.0) }
670    }
671    #[inline(always)]
672    fn store_array_u8x16(self, a: u8x16<Self>, dest: &mut [u8; 16usize]) -> () {
673        unsafe {
674            core::ptr::copy_nonoverlapping(
675                (&raw const a.val.0) as *const u8,
676                dest.as_mut_ptr(),
677                16usize,
678            );
679        }
680    }
681    #[inline(always)]
682    fn cvt_from_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
683        unsafe {
684            u8x16 {
685                val: core::mem::transmute(a.val),
686                simd: self,
687            }
688        }
689    }
690    #[inline(always)]
691    fn cvt_to_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
692        unsafe {
693            u8x16 {
694                val: core::mem::transmute(a.val),
695                simd: self,
696            }
697        }
698    }
699    #[inline(always)]
700    fn slide_u8x16<const SHIFT: usize>(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
701        unsafe {
702            if SHIFT >= 16usize {
703                return b;
704            }
705            let result = dyn_alignr_128(
706                self.cvt_to_bytes_u8x16(b).val.0,
707                self.cvt_to_bytes_u8x16(a).val.0,
708                SHIFT,
709            );
710            self.cvt_from_bytes_u8x16(u8x16 {
711                val: crate::support::Aligned128(result),
712                simd: self,
713            })
714        }
715    }
716    #[inline(always)]
717    fn slide_within_blocks_u8x16<const SHIFT: usize>(
718        self,
719        a: u8x16<Self>,
720        b: u8x16<Self>,
721    ) -> u8x16<Self> {
722        self.slide_u8x16::<SHIFT>(a, b)
723    }
724    #[inline(always)]
725    fn add_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
726        unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
727    }
728    #[inline(always)]
729    fn sub_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
730        unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
731    }
732    #[inline(always)]
733    fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
734        unsafe {
735            let dst_even = _mm_mullo_epi16(a.into(), b.into());
736            let dst_odd =
737                _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
738            _mm_or_si128(
739                _mm_slli_epi16(dst_odd, 8),
740                _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
741            )
742            .simd_into(self)
743        }
744    }
745    #[inline(always)]
746    fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
747        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
748    }
749    #[inline(always)]
750    fn or_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
751        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
752    }
753    #[inline(always)]
754    fn xor_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
755        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
756    }
757    #[inline(always)]
758    fn not_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
759        a ^ !0
760    }
761    #[inline(always)]
762    fn shl_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
763        unsafe {
764            let val = a.into();
765            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
766            let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
767            let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
768            let lo_shifted = _mm_sll_epi16(lo_16, shift_count);
769            let hi_shifted = _mm_sll_epi16(hi_16, shift_count);
770            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
771        }
772    }
773    #[inline(always)]
774    fn shlv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
775        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
776    }
777    #[inline(always)]
778    fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
779        unsafe {
780            let val = a.into();
781            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
782            let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
783            let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
784            let lo_shifted = _mm_srl_epi16(lo_16, shift_count);
785            let hi_shifted = _mm_srl_epi16(hi_16, shift_count);
786            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
787        }
788    }
789    #[inline(always)]
790    fn shrv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
791        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
792    }
793    #[inline(always)]
794    fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
795        unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
796    }
797    #[inline(always)]
798    fn simd_lt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
799        unsafe {
800            let sign_bit = _mm_set1_epi8(0x80u8.cast_signed());
801            let a_signed = _mm_xor_si128(a.into(), sign_bit);
802            let b_signed = _mm_xor_si128(b.into(), sign_bit);
803            _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self)
804        }
805    }
806    #[inline(always)]
807    fn simd_le_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
808        unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) }
809    }
810    #[inline(always)]
811    fn simd_ge_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
812        unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) }
813    }
814    #[inline(always)]
815    fn simd_gt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
816        unsafe {
817            let sign_bit = _mm_set1_epi8(0x80u8.cast_signed());
818            let a_signed = _mm_xor_si128(a.into(), sign_bit);
819            let b_signed = _mm_xor_si128(b.into(), sign_bit);
820            _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self)
821        }
822    }
823    #[inline(always)]
824    fn zip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
825        unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
826    }
827    #[inline(always)]
828    fn zip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
829        unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
830    }
831    #[inline(always)]
832    fn unzip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
833        unsafe {
834            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
835            let t1 = _mm_shuffle_epi8(a.into(), mask);
836            let t2 = _mm_shuffle_epi8(b.into(), mask);
837            _mm_unpacklo_epi64(t1, t2).simd_into(self)
838        }
839    }
840    #[inline(always)]
841    fn unzip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
842        unsafe {
843            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
844            let t1 = _mm_shuffle_epi8(a.into(), mask);
845            let t2 = _mm_shuffle_epi8(b.into(), mask);
846            _mm_unpackhi_epi64(t1, t2).simd_into(self)
847        }
848    }
849    #[inline(always)]
850    fn interleave_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> (u8x16<Self>, u8x16<Self>) {
851        (self.zip_low_u8x16(a, b), self.zip_high_u8x16(a, b))
852    }
853    #[inline(always)]
854    fn deinterleave_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> (u8x16<Self>, u8x16<Self>) {
855        (self.unzip_low_u8x16(a, b), self.unzip_high_u8x16(a, b))
856    }
857    #[inline(always)]
858    fn select_u8x16(self, a: mask8x16<Self>, b: u8x16<Self>, c: u8x16<Self>) -> u8x16<Self> {
859        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
860    }
861    #[inline(always)]
862    fn min_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
863        unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) }
864    }
865    #[inline(always)]
866    fn max_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
867        unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) }
868    }
869    #[inline(always)]
870    fn combine_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x32<Self> {
871        unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
872    }
873    #[inline(always)]
874    fn widen_u8x16(self, a: u8x16<Self>) -> u16x16<Self> {
875        unsafe { _mm256_cvtepu8_epi16(a.into()).simd_into(self) }
876    }
877    #[inline(always)]
878    fn reinterpret_u32_u8x16(self, a: u8x16<Self>) -> u32x4<Self> {
879        __m128i::from(a).simd_into(self)
880    }
881    #[inline(always)]
882    fn splat_mask8x16(self, val: i8) -> mask8x16<Self> {
883        unsafe { _mm_set1_epi8(val).simd_into(self) }
884    }
885    #[inline(always)]
886    fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
887        mask8x16 {
888            val: unsafe { core::mem::transmute_copy(&val) },
889            simd: self,
890        }
891    }
892    #[inline(always)]
893    fn load_array_ref_mask8x16(self, val: &[i8; 16usize]) -> mask8x16<Self> {
894        mask8x16 {
895            val: unsafe { core::mem::transmute_copy(val) },
896            simd: self,
897        }
898    }
899    #[inline(always)]
900    fn as_array_mask8x16(self, a: mask8x16<Self>) -> [i8; 16usize] {
901        unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) }
902    }
903    #[inline(always)]
904    fn as_array_ref_mask8x16(self, a: &mask8x16<Self>) -> &[i8; 16usize] {
905        unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) }
906    }
907    #[inline(always)]
908    fn as_array_mut_mask8x16(self, a: &mut mask8x16<Self>) -> &mut [i8; 16usize] {
909        unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) }
910    }
911    #[inline(always)]
912    fn store_array_mask8x16(self, a: mask8x16<Self>, dest: &mut [i8; 16usize]) -> () {
913        unsafe {
914            core::ptr::copy_nonoverlapping(
915                (&raw const a.val.0) as *const i8,
916                dest.as_mut_ptr(),
917                16usize,
918            );
919        }
920    }
921    #[inline(always)]
922    fn cvt_from_bytes_mask8x16(self, a: u8x16<Self>) -> mask8x16<Self> {
923        unsafe {
924            mask8x16 {
925                val: core::mem::transmute(a.val),
926                simd: self,
927            }
928        }
929    }
930    #[inline(always)]
931    fn cvt_to_bytes_mask8x16(self, a: mask8x16<Self>) -> u8x16<Self> {
932        unsafe {
933            u8x16 {
934                val: core::mem::transmute(a.val),
935                simd: self,
936            }
937        }
938    }
939    #[inline(always)]
940    fn slide_mask8x16<const SHIFT: usize>(
941        self,
942        a: mask8x16<Self>,
943        b: mask8x16<Self>,
944    ) -> mask8x16<Self> {
945        unsafe {
946            if SHIFT >= 16usize {
947                return b;
948            }
949            let result = dyn_alignr_128(
950                self.cvt_to_bytes_mask8x16(b).val.0,
951                self.cvt_to_bytes_mask8x16(a).val.0,
952                SHIFT,
953            );
954            self.cvt_from_bytes_mask8x16(u8x16 {
955                val: crate::support::Aligned128(result),
956                simd: self,
957            })
958        }
959    }
960    #[inline(always)]
961    fn slide_within_blocks_mask8x16<const SHIFT: usize>(
962        self,
963        a: mask8x16<Self>,
964        b: mask8x16<Self>,
965    ) -> mask8x16<Self> {
966        self.slide_mask8x16::<SHIFT>(a, b)
967    }
968    #[inline(always)]
969    fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
970        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
971    }
972    #[inline(always)]
973    fn or_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
974        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
975    }
976    #[inline(always)]
977    fn xor_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
978        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
979    }
980    #[inline(always)]
981    fn not_mask8x16(self, a: mask8x16<Self>) -> mask8x16<Self> {
982        a ^ !0
983    }
984    #[inline(always)]
985    fn select_mask8x16(
986        self,
987        a: mask8x16<Self>,
988        b: mask8x16<Self>,
989        c: mask8x16<Self>,
990    ) -> mask8x16<Self> {
991        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
992    }
993    #[inline(always)]
994    fn simd_eq_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
995        unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
996    }
997    #[inline(always)]
998    fn any_true_mask8x16(self, a: mask8x16<Self>) -> bool {
999        unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 }
1000    }
1001    #[inline(always)]
1002    fn all_true_mask8x16(self, a: mask8x16<Self>) -> bool {
1003        unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff }
1004    }
1005    #[inline(always)]
1006    fn any_false_mask8x16(self, a: mask8x16<Self>) -> bool {
1007        unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff }
1008    }
1009    #[inline(always)]
1010    fn all_false_mask8x16(self, a: mask8x16<Self>) -> bool {
1011        unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 }
1012    }
1013    #[inline(always)]
1014    fn combine_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x32<Self> {
1015        unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
1016    }
1017    #[inline(always)]
1018    fn splat_i16x8(self, val: i16) -> i16x8<Self> {
1019        unsafe { _mm_set1_epi16(val).simd_into(self) }
1020    }
1021    #[inline(always)]
1022    fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
1023        i16x8 {
1024            val: unsafe { core::mem::transmute_copy(&val) },
1025            simd: self,
1026        }
1027    }
1028    #[inline(always)]
1029    fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
1030        i16x8 {
1031            val: unsafe { core::mem::transmute_copy(val) },
1032            simd: self,
1033        }
1034    }
1035    #[inline(always)]
1036    fn as_array_i16x8(self, a: i16x8<Self>) -> [i16; 8usize] {
1037        unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) }
1038    }
1039    #[inline(always)]
1040    fn as_array_ref_i16x8(self, a: &i16x8<Self>) -> &[i16; 8usize] {
1041        unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) }
1042    }
1043    #[inline(always)]
1044    fn as_array_mut_i16x8(self, a: &mut i16x8<Self>) -> &mut [i16; 8usize] {
1045        unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) }
1046    }
1047    #[inline(always)]
1048    fn store_array_i16x8(self, a: i16x8<Self>, dest: &mut [i16; 8usize]) -> () {
1049        unsafe {
1050            core::ptr::copy_nonoverlapping(
1051                (&raw const a.val.0) as *const i16,
1052                dest.as_mut_ptr(),
1053                8usize,
1054            );
1055        }
1056    }
1057    #[inline(always)]
1058    fn cvt_from_bytes_i16x8(self, a: u8x16<Self>) -> i16x8<Self> {
1059        unsafe {
1060            i16x8 {
1061                val: core::mem::transmute(a.val),
1062                simd: self,
1063            }
1064        }
1065    }
1066    #[inline(always)]
1067    fn cvt_to_bytes_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
1068        unsafe {
1069            u8x16 {
1070                val: core::mem::transmute(a.val),
1071                simd: self,
1072            }
1073        }
1074    }
1075    #[inline(always)]
1076    fn slide_i16x8<const SHIFT: usize>(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1077        unsafe {
1078            if SHIFT >= 8usize {
1079                return b;
1080            }
1081            let result = dyn_alignr_128(
1082                self.cvt_to_bytes_i16x8(b).val.0,
1083                self.cvt_to_bytes_i16x8(a).val.0,
1084                SHIFT * 2usize,
1085            );
1086            self.cvt_from_bytes_i16x8(u8x16 {
1087                val: crate::support::Aligned128(result),
1088                simd: self,
1089            })
1090        }
1091    }
1092    #[inline(always)]
1093    fn slide_within_blocks_i16x8<const SHIFT: usize>(
1094        self,
1095        a: i16x8<Self>,
1096        b: i16x8<Self>,
1097    ) -> i16x8<Self> {
1098        self.slide_i16x8::<SHIFT>(a, b)
1099    }
1100    #[inline(always)]
1101    fn add_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1102        unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
1103    }
1104    #[inline(always)]
1105    fn sub_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1106        unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
1107    }
1108    #[inline(always)]
1109    fn mul_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1110        unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
1111    }
1112    #[inline(always)]
1113    fn and_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1114        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1115    }
1116    #[inline(always)]
1117    fn or_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1118        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1119    }
1120    #[inline(always)]
1121    fn xor_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1122        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1123    }
1124    #[inline(always)]
1125    fn not_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
1126        a ^ !0
1127    }
1128    #[inline(always)]
1129    fn shl_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
1130        unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1131    }
1132    #[inline(always)]
1133    fn shlv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1134        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
1135    }
1136    #[inline(always)]
1137    fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
1138        unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1139    }
1140    #[inline(always)]
1141    fn shrv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1142        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
1143    }
1144    #[inline(always)]
1145    fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1146        unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
1147    }
1148    #[inline(always)]
1149    fn simd_lt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1150        unsafe { _mm_cmpgt_epi16(b.into(), a.into()).simd_into(self) }
1151    }
1152    #[inline(always)]
1153    fn simd_le_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1154        unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) }
1155    }
1156    #[inline(always)]
1157    fn simd_ge_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1158        unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) }
1159    }
1160    #[inline(always)]
1161    fn simd_gt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1162        unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) }
1163    }
1164    #[inline(always)]
1165    fn zip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1166        unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
1167    }
1168    #[inline(always)]
1169    fn zip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1170        unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
1171    }
1172    #[inline(always)]
1173    fn unzip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1174        unsafe {
1175            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1176            let t1 = _mm_shuffle_epi8(a.into(), mask);
1177            let t2 = _mm_shuffle_epi8(b.into(), mask);
1178            _mm_unpacklo_epi64(t1, t2).simd_into(self)
1179        }
1180    }
1181    #[inline(always)]
1182    fn unzip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1183        unsafe {
1184            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1185            let t1 = _mm_shuffle_epi8(a.into(), mask);
1186            let t2 = _mm_shuffle_epi8(b.into(), mask);
1187            _mm_unpackhi_epi64(t1, t2).simd_into(self)
1188        }
1189    }
1190    #[inline(always)]
1191    fn interleave_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> (i16x8<Self>, i16x8<Self>) {
1192        (self.zip_low_i16x8(a, b), self.zip_high_i16x8(a, b))
1193    }
1194    #[inline(always)]
1195    fn deinterleave_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> (i16x8<Self>, i16x8<Self>) {
1196        (self.unzip_low_i16x8(a, b), self.unzip_high_i16x8(a, b))
1197    }
1198    #[inline(always)]
1199    fn select_i16x8(self, a: mask16x8<Self>, b: i16x8<Self>, c: i16x8<Self>) -> i16x8<Self> {
1200        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1201    }
1202    #[inline(always)]
1203    fn min_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1204        unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) }
1205    }
1206    #[inline(always)]
1207    fn max_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1208        unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) }
1209    }
1210    #[inline(always)]
1211    fn combine_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x16<Self> {
1212        unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
1213    }
1214    #[inline(always)]
1215    fn neg_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
1216        unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) }
1217    }
1218    #[inline(always)]
1219    fn reinterpret_u8_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
1220        __m128i::from(a).simd_into(self)
1221    }
1222    #[inline(always)]
1223    fn reinterpret_u32_i16x8(self, a: i16x8<Self>) -> u32x4<Self> {
1224        __m128i::from(a).simd_into(self)
1225    }
1226    #[inline(always)]
1227    fn splat_u16x8(self, val: u16) -> u16x8<Self> {
1228        unsafe { _mm_set1_epi16(val.cast_signed()).simd_into(self) }
1229    }
1230    #[inline(always)]
1231    fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
1232        u16x8 {
1233            val: unsafe { core::mem::transmute_copy(&val) },
1234            simd: self,
1235        }
1236    }
1237    #[inline(always)]
1238    fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
1239        u16x8 {
1240            val: unsafe { core::mem::transmute_copy(val) },
1241            simd: self,
1242        }
1243    }
1244    #[inline(always)]
1245    fn as_array_u16x8(self, a: u16x8<Self>) -> [u16; 8usize] {
1246        unsafe { core::mem::transmute::<__m128i, [u16; 8usize]>(a.val.0) }
1247    }
1248    #[inline(always)]
1249    fn as_array_ref_u16x8(self, a: &u16x8<Self>) -> &[u16; 8usize] {
1250        unsafe { core::mem::transmute::<&__m128i, &[u16; 8usize]>(&a.val.0) }
1251    }
1252    #[inline(always)]
1253    fn as_array_mut_u16x8(self, a: &mut u16x8<Self>) -> &mut [u16; 8usize] {
1254        unsafe { core::mem::transmute::<&mut __m128i, &mut [u16; 8usize]>(&mut a.val.0) }
1255    }
1256    #[inline(always)]
1257    fn store_array_u16x8(self, a: u16x8<Self>, dest: &mut [u16; 8usize]) -> () {
1258        unsafe {
1259            core::ptr::copy_nonoverlapping(
1260                (&raw const a.val.0) as *const u16,
1261                dest.as_mut_ptr(),
1262                8usize,
1263            );
1264        }
1265    }
1266    #[inline(always)]
1267    fn cvt_from_bytes_u16x8(self, a: u8x16<Self>) -> u16x8<Self> {
1268        unsafe {
1269            u16x8 {
1270                val: core::mem::transmute(a.val),
1271                simd: self,
1272            }
1273        }
1274    }
1275    #[inline(always)]
1276    fn cvt_to_bytes_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
1277        unsafe {
1278            u8x16 {
1279                val: core::mem::transmute(a.val),
1280                simd: self,
1281            }
1282        }
1283    }
1284    #[inline(always)]
1285    fn slide_u16x8<const SHIFT: usize>(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1286        unsafe {
1287            if SHIFT >= 8usize {
1288                return b;
1289            }
1290            let result = dyn_alignr_128(
1291                self.cvt_to_bytes_u16x8(b).val.0,
1292                self.cvt_to_bytes_u16x8(a).val.0,
1293                SHIFT * 2usize,
1294            );
1295            self.cvt_from_bytes_u16x8(u8x16 {
1296                val: crate::support::Aligned128(result),
1297                simd: self,
1298            })
1299        }
1300    }
1301    #[inline(always)]
1302    fn slide_within_blocks_u16x8<const SHIFT: usize>(
1303        self,
1304        a: u16x8<Self>,
1305        b: u16x8<Self>,
1306    ) -> u16x8<Self> {
1307        self.slide_u16x8::<SHIFT>(a, b)
1308    }
1309    #[inline(always)]
1310    fn add_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1311        unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
1312    }
1313    #[inline(always)]
1314    fn sub_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1315        unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
1316    }
1317    #[inline(always)]
1318    fn mul_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1319        unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
1320    }
1321    #[inline(always)]
1322    fn and_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1323        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1324    }
1325    #[inline(always)]
1326    fn or_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1327        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1328    }
1329    #[inline(always)]
1330    fn xor_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1331        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1332    }
1333    #[inline(always)]
1334    fn not_u16x8(self, a: u16x8<Self>) -> u16x8<Self> {
1335        a ^ !0
1336    }
1337    #[inline(always)]
1338    fn shl_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
1339        unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1340    }
1341    #[inline(always)]
1342    fn shlv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1343        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
1344    }
1345    #[inline(always)]
1346    fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
1347        unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1348    }
1349    #[inline(always)]
1350    fn shrv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1351        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
1352    }
1353    #[inline(always)]
1354    fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1355        unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
1356    }
1357    #[inline(always)]
1358    fn simd_lt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1359        unsafe {
1360            let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed());
1361            let a_signed = _mm_xor_si128(a.into(), sign_bit);
1362            let b_signed = _mm_xor_si128(b.into(), sign_bit);
1363            _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self)
1364        }
1365    }
1366    #[inline(always)]
1367    fn simd_le_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1368        unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) }
1369    }
1370    #[inline(always)]
1371    fn simd_ge_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1372        unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) }
1373    }
1374    #[inline(always)]
1375    fn simd_gt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1376        unsafe {
1377            let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed());
1378            let a_signed = _mm_xor_si128(a.into(), sign_bit);
1379            let b_signed = _mm_xor_si128(b.into(), sign_bit);
1380            _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self)
1381        }
1382    }
1383    #[inline(always)]
1384    fn zip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1385        unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
1386    }
1387    #[inline(always)]
1388    fn zip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1389        unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
1390    }
1391    #[inline(always)]
1392    fn unzip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1393        unsafe {
1394            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1395            let t1 = _mm_shuffle_epi8(a.into(), mask);
1396            let t2 = _mm_shuffle_epi8(b.into(), mask);
1397            _mm_unpacklo_epi64(t1, t2).simd_into(self)
1398        }
1399    }
1400    #[inline(always)]
1401    fn unzip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1402        unsafe {
1403            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1404            let t1 = _mm_shuffle_epi8(a.into(), mask);
1405            let t2 = _mm_shuffle_epi8(b.into(), mask);
1406            _mm_unpackhi_epi64(t1, t2).simd_into(self)
1407        }
1408    }
1409    #[inline(always)]
1410    fn interleave_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> (u16x8<Self>, u16x8<Self>) {
1411        (self.zip_low_u16x8(a, b), self.zip_high_u16x8(a, b))
1412    }
1413    #[inline(always)]
1414    fn deinterleave_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> (u16x8<Self>, u16x8<Self>) {
1415        (self.unzip_low_u16x8(a, b), self.unzip_high_u16x8(a, b))
1416    }
1417    #[inline(always)]
1418    fn select_u16x8(self, a: mask16x8<Self>, b: u16x8<Self>, c: u16x8<Self>) -> u16x8<Self> {
1419        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1420    }
1421    #[inline(always)]
1422    fn min_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1423        unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) }
1424    }
1425    #[inline(always)]
1426    fn max_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1427        unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) }
1428    }
1429    #[inline(always)]
1430    fn combine_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x16<Self> {
1431        unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
1432    }
1433    #[inline(always)]
1434    fn reinterpret_u8_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
1435        __m128i::from(a).simd_into(self)
1436    }
1437    #[inline(always)]
1438    fn reinterpret_u32_u16x8(self, a: u16x8<Self>) -> u32x4<Self> {
1439        __m128i::from(a).simd_into(self)
1440    }
1441    #[inline(always)]
1442    fn splat_mask16x8(self, val: i16) -> mask16x8<Self> {
1443        unsafe { _mm_set1_epi16(val).simd_into(self) }
1444    }
1445    #[inline(always)]
1446    fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
1447        mask16x8 {
1448            val: unsafe { core::mem::transmute_copy(&val) },
1449            simd: self,
1450        }
1451    }
1452    #[inline(always)]
1453    fn load_array_ref_mask16x8(self, val: &[i16; 8usize]) -> mask16x8<Self> {
1454        mask16x8 {
1455            val: unsafe { core::mem::transmute_copy(val) },
1456            simd: self,
1457        }
1458    }
1459    #[inline(always)]
1460    fn as_array_mask16x8(self, a: mask16x8<Self>) -> [i16; 8usize] {
1461        unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) }
1462    }
1463    #[inline(always)]
1464    fn as_array_ref_mask16x8(self, a: &mask16x8<Self>) -> &[i16; 8usize] {
1465        unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) }
1466    }
1467    #[inline(always)]
1468    fn as_array_mut_mask16x8(self, a: &mut mask16x8<Self>) -> &mut [i16; 8usize] {
1469        unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) }
1470    }
1471    #[inline(always)]
1472    fn store_array_mask16x8(self, a: mask16x8<Self>, dest: &mut [i16; 8usize]) -> () {
1473        unsafe {
1474            core::ptr::copy_nonoverlapping(
1475                (&raw const a.val.0) as *const i16,
1476                dest.as_mut_ptr(),
1477                8usize,
1478            );
1479        }
1480    }
1481    #[inline(always)]
1482    fn cvt_from_bytes_mask16x8(self, a: u8x16<Self>) -> mask16x8<Self> {
1483        unsafe {
1484            mask16x8 {
1485                val: core::mem::transmute(a.val),
1486                simd: self,
1487            }
1488        }
1489    }
1490    #[inline(always)]
1491    fn cvt_to_bytes_mask16x8(self, a: mask16x8<Self>) -> u8x16<Self> {
1492        unsafe {
1493            u8x16 {
1494                val: core::mem::transmute(a.val),
1495                simd: self,
1496            }
1497        }
1498    }
1499    #[inline(always)]
1500    fn slide_mask16x8<const SHIFT: usize>(
1501        self,
1502        a: mask16x8<Self>,
1503        b: mask16x8<Self>,
1504    ) -> mask16x8<Self> {
1505        unsafe {
1506            if SHIFT >= 8usize {
1507                return b;
1508            }
1509            let result = dyn_alignr_128(
1510                self.cvt_to_bytes_mask16x8(b).val.0,
1511                self.cvt_to_bytes_mask16x8(a).val.0,
1512                SHIFT * 2usize,
1513            );
1514            self.cvt_from_bytes_mask16x8(u8x16 {
1515                val: crate::support::Aligned128(result),
1516                simd: self,
1517            })
1518        }
1519    }
1520    #[inline(always)]
1521    fn slide_within_blocks_mask16x8<const SHIFT: usize>(
1522        self,
1523        a: mask16x8<Self>,
1524        b: mask16x8<Self>,
1525    ) -> mask16x8<Self> {
1526        self.slide_mask16x8::<SHIFT>(a, b)
1527    }
1528    #[inline(always)]
1529    fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1530        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1531    }
1532    #[inline(always)]
1533    fn or_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1534        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1535    }
1536    #[inline(always)]
1537    fn xor_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1538        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1539    }
1540    #[inline(always)]
1541    fn not_mask16x8(self, a: mask16x8<Self>) -> mask16x8<Self> {
1542        a ^ !0
1543    }
1544    #[inline(always)]
1545    fn select_mask16x8(
1546        self,
1547        a: mask16x8<Self>,
1548        b: mask16x8<Self>,
1549        c: mask16x8<Self>,
1550    ) -> mask16x8<Self> {
1551        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1552    }
1553    #[inline(always)]
1554    fn simd_eq_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1555        unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
1556    }
1557    #[inline(always)]
1558    fn any_true_mask16x8(self, a: mask16x8<Self>) -> bool {
1559        unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 }
1560    }
1561    #[inline(always)]
1562    fn all_true_mask16x8(self, a: mask16x8<Self>) -> bool {
1563        unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff }
1564    }
1565    #[inline(always)]
1566    fn any_false_mask16x8(self, a: mask16x8<Self>) -> bool {
1567        unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff }
1568    }
1569    #[inline(always)]
1570    fn all_false_mask16x8(self, a: mask16x8<Self>) -> bool {
1571        unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 }
1572    }
1573    #[inline(always)]
1574    fn combine_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x16<Self> {
1575        unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
1576    }
1577    #[inline(always)]
1578    fn splat_i32x4(self, val: i32) -> i32x4<Self> {
1579        unsafe { _mm_set1_epi32(val).simd_into(self) }
1580    }
1581    #[inline(always)]
1582    fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
1583        i32x4 {
1584            val: unsafe { core::mem::transmute_copy(&val) },
1585            simd: self,
1586        }
1587    }
1588    #[inline(always)]
1589    fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
1590        i32x4 {
1591            val: unsafe { core::mem::transmute_copy(val) },
1592            simd: self,
1593        }
1594    }
1595    #[inline(always)]
1596    fn as_array_i32x4(self, a: i32x4<Self>) -> [i32; 4usize] {
1597        unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) }
1598    }
1599    #[inline(always)]
1600    fn as_array_ref_i32x4(self, a: &i32x4<Self>) -> &[i32; 4usize] {
1601        unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) }
1602    }
1603    #[inline(always)]
1604    fn as_array_mut_i32x4(self, a: &mut i32x4<Self>) -> &mut [i32; 4usize] {
1605        unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) }
1606    }
1607    #[inline(always)]
1608    fn store_array_i32x4(self, a: i32x4<Self>, dest: &mut [i32; 4usize]) -> () {
1609        unsafe {
1610            core::ptr::copy_nonoverlapping(
1611                (&raw const a.val.0) as *const i32,
1612                dest.as_mut_ptr(),
1613                4usize,
1614            );
1615        }
1616    }
1617    #[inline(always)]
1618    fn cvt_from_bytes_i32x4(self, a: u8x16<Self>) -> i32x4<Self> {
1619        unsafe {
1620            i32x4 {
1621                val: core::mem::transmute(a.val),
1622                simd: self,
1623            }
1624        }
1625    }
1626    #[inline(always)]
1627    fn cvt_to_bytes_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
1628        unsafe {
1629            u8x16 {
1630                val: core::mem::transmute(a.val),
1631                simd: self,
1632            }
1633        }
1634    }
1635    #[inline(always)]
1636    fn slide_i32x4<const SHIFT: usize>(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1637        unsafe {
1638            if SHIFT >= 4usize {
1639                return b;
1640            }
1641            let result = dyn_alignr_128(
1642                self.cvt_to_bytes_i32x4(b).val.0,
1643                self.cvt_to_bytes_i32x4(a).val.0,
1644                SHIFT * 4usize,
1645            );
1646            self.cvt_from_bytes_i32x4(u8x16 {
1647                val: crate::support::Aligned128(result),
1648                simd: self,
1649            })
1650        }
1651    }
1652    #[inline(always)]
1653    fn slide_within_blocks_i32x4<const SHIFT: usize>(
1654        self,
1655        a: i32x4<Self>,
1656        b: i32x4<Self>,
1657    ) -> i32x4<Self> {
1658        self.slide_i32x4::<SHIFT>(a, b)
1659    }
1660    #[inline(always)]
1661    fn add_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1662        unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
1663    }
1664    #[inline(always)]
1665    fn sub_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1666        unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
1667    }
1668    #[inline(always)]
1669    fn mul_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1670        unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
1671    }
1672    #[inline(always)]
1673    fn and_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1674        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1675    }
1676    #[inline(always)]
1677    fn or_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1678        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1679    }
1680    #[inline(always)]
1681    fn xor_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1682        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1683    }
1684    #[inline(always)]
1685    fn not_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
1686        a ^ !0
1687    }
1688    #[inline(always)]
1689    fn shl_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
1690        unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1691    }
1692    #[inline(always)]
1693    fn shlv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1694        unsafe { _mm_sllv_epi32(a.into(), b.into()).simd_into(self) }
1695    }
1696    #[inline(always)]
1697    fn shr_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
1698        unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1699    }
1700    #[inline(always)]
1701    fn shrv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1702        unsafe { _mm_srav_epi32(a.into(), b.into()).simd_into(self) }
1703    }
1704    #[inline(always)]
1705    fn simd_eq_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1706        unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
1707    }
1708    #[inline(always)]
1709    fn simd_lt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1710        unsafe { _mm_cmpgt_epi32(b.into(), a.into()).simd_into(self) }
1711    }
1712    #[inline(always)]
1713    fn simd_le_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1714        unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) }
1715    }
1716    #[inline(always)]
1717    fn simd_ge_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1718        unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) }
1719    }
1720    #[inline(always)]
1721    fn simd_gt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1722        unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) }
1723    }
1724    #[inline(always)]
1725    fn zip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1726        unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
1727    }
1728    #[inline(always)]
1729    fn zip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1730        unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
1731    }
1732    #[inline(always)]
1733    fn unzip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1734        unsafe {
1735            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1736            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1737            _mm_unpacklo_epi64(t1, t2).simd_into(self)
1738        }
1739    }
1740    #[inline(always)]
1741    fn unzip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1742        unsafe {
1743            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1744            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1745            _mm_unpackhi_epi64(t1, t2).simd_into(self)
1746        }
1747    }
1748    #[inline(always)]
1749    fn interleave_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> (i32x4<Self>, i32x4<Self>) {
1750        (self.zip_low_i32x4(a, b), self.zip_high_i32x4(a, b))
1751    }
1752    #[inline(always)]
1753    fn deinterleave_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> (i32x4<Self>, i32x4<Self>) {
1754        (self.unzip_low_i32x4(a, b), self.unzip_high_i32x4(a, b))
1755    }
1756    #[inline(always)]
1757    fn select_i32x4(self, a: mask32x4<Self>, b: i32x4<Self>, c: i32x4<Self>) -> i32x4<Self> {
1758        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1759    }
1760    #[inline(always)]
1761    fn min_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1762        unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) }
1763    }
1764    #[inline(always)]
1765    fn max_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1766        unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) }
1767    }
1768    #[inline(always)]
1769    fn combine_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x8<Self> {
1770        unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
1771    }
1772    #[inline(always)]
1773    fn neg_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
1774        unsafe { _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(self) }
1775    }
1776    #[inline(always)]
1777    fn reinterpret_u8_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
1778        __m128i::from(a).simd_into(self)
1779    }
1780    #[inline(always)]
1781    fn reinterpret_u32_i32x4(self, a: i32x4<Self>) -> u32x4<Self> {
1782        __m128i::from(a).simd_into(self)
1783    }
1784    #[inline(always)]
1785    fn cvt_f32_i32x4(self, a: i32x4<Self>) -> f32x4<Self> {
1786        unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) }
1787    }
1788    #[inline(always)]
1789    fn splat_u32x4(self, val: u32) -> u32x4<Self> {
1790        unsafe { _mm_set1_epi32(val.cast_signed()).simd_into(self) }
1791    }
1792    #[inline(always)]
1793    fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
1794        u32x4 {
1795            val: unsafe { core::mem::transmute_copy(&val) },
1796            simd: self,
1797        }
1798    }
1799    #[inline(always)]
1800    fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
1801        u32x4 {
1802            val: unsafe { core::mem::transmute_copy(val) },
1803            simd: self,
1804        }
1805    }
1806    #[inline(always)]
1807    fn as_array_u32x4(self, a: u32x4<Self>) -> [u32; 4usize] {
1808        unsafe { core::mem::transmute::<__m128i, [u32; 4usize]>(a.val.0) }
1809    }
1810    #[inline(always)]
1811    fn as_array_ref_u32x4(self, a: &u32x4<Self>) -> &[u32; 4usize] {
1812        unsafe { core::mem::transmute::<&__m128i, &[u32; 4usize]>(&a.val.0) }
1813    }
1814    #[inline(always)]
1815    fn as_array_mut_u32x4(self, a: &mut u32x4<Self>) -> &mut [u32; 4usize] {
1816        unsafe { core::mem::transmute::<&mut __m128i, &mut [u32; 4usize]>(&mut a.val.0) }
1817    }
1818    #[inline(always)]
1819    fn store_array_u32x4(self, a: u32x4<Self>, dest: &mut [u32; 4usize]) -> () {
1820        unsafe {
1821            core::ptr::copy_nonoverlapping(
1822                (&raw const a.val.0) as *const u32,
1823                dest.as_mut_ptr(),
1824                4usize,
1825            );
1826        }
1827    }
1828    #[inline(always)]
1829    fn cvt_from_bytes_u32x4(self, a: u8x16<Self>) -> u32x4<Self> {
1830        unsafe {
1831            u32x4 {
1832                val: core::mem::transmute(a.val),
1833                simd: self,
1834            }
1835        }
1836    }
1837    #[inline(always)]
1838    fn cvt_to_bytes_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
1839        unsafe {
1840            u8x16 {
1841                val: core::mem::transmute(a.val),
1842                simd: self,
1843            }
1844        }
1845    }
1846    #[inline(always)]
1847    fn slide_u32x4<const SHIFT: usize>(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1848        unsafe {
1849            if SHIFT >= 4usize {
1850                return b;
1851            }
1852            let result = dyn_alignr_128(
1853                self.cvt_to_bytes_u32x4(b).val.0,
1854                self.cvt_to_bytes_u32x4(a).val.0,
1855                SHIFT * 4usize,
1856            );
1857            self.cvt_from_bytes_u32x4(u8x16 {
1858                val: crate::support::Aligned128(result),
1859                simd: self,
1860            })
1861        }
1862    }
1863    #[inline(always)]
1864    fn slide_within_blocks_u32x4<const SHIFT: usize>(
1865        self,
1866        a: u32x4<Self>,
1867        b: u32x4<Self>,
1868    ) -> u32x4<Self> {
1869        self.slide_u32x4::<SHIFT>(a, b)
1870    }
1871    #[inline(always)]
1872    fn add_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1873        unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
1874    }
1875    #[inline(always)]
1876    fn sub_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1877        unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
1878    }
1879    #[inline(always)]
1880    fn mul_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1881        unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
1882    }
1883    #[inline(always)]
1884    fn and_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1885        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1886    }
1887    #[inline(always)]
1888    fn or_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1889        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1890    }
1891    #[inline(always)]
1892    fn xor_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1893        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1894    }
1895    #[inline(always)]
1896    fn not_u32x4(self, a: u32x4<Self>) -> u32x4<Self> {
1897        a ^ !0
1898    }
1899    #[inline(always)]
1900    fn shl_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
1901        unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1902    }
1903    #[inline(always)]
1904    fn shlv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1905        unsafe { _mm_sllv_epi32(a.into(), b.into()).simd_into(self) }
1906    }
1907    #[inline(always)]
1908    fn shr_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
1909        unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1910    }
1911    #[inline(always)]
1912    fn shrv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1913        unsafe { _mm_srlv_epi32(a.into(), b.into()).simd_into(self) }
1914    }
1915    #[inline(always)]
1916    fn simd_eq_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1917        unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
1918    }
1919    #[inline(always)]
1920    fn simd_lt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1921        unsafe {
1922            let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed());
1923            let a_signed = _mm_xor_si128(a.into(), sign_bit);
1924            let b_signed = _mm_xor_si128(b.into(), sign_bit);
1925            _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self)
1926        }
1927    }
1928    #[inline(always)]
1929    fn simd_le_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1930        unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1931    }
1932    #[inline(always)]
1933    fn simd_ge_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1934        unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1935    }
1936    #[inline(always)]
1937    fn simd_gt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1938        unsafe {
1939            let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed());
1940            let a_signed = _mm_xor_si128(a.into(), sign_bit);
1941            let b_signed = _mm_xor_si128(b.into(), sign_bit);
1942            _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self)
1943        }
1944    }
1945    #[inline(always)]
1946    fn zip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1947        unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
1948    }
1949    #[inline(always)]
1950    fn zip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1951        unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
1952    }
1953    #[inline(always)]
1954    fn unzip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1955        unsafe {
1956            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1957            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1958            _mm_unpacklo_epi64(t1, t2).simd_into(self)
1959        }
1960    }
1961    #[inline(always)]
1962    fn unzip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1963        unsafe {
1964            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1965            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1966            _mm_unpackhi_epi64(t1, t2).simd_into(self)
1967        }
1968    }
1969    #[inline(always)]
1970    fn interleave_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> (u32x4<Self>, u32x4<Self>) {
1971        (self.zip_low_u32x4(a, b), self.zip_high_u32x4(a, b))
1972    }
1973    #[inline(always)]
1974    fn deinterleave_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> (u32x4<Self>, u32x4<Self>) {
1975        (self.unzip_low_u32x4(a, b), self.unzip_high_u32x4(a, b))
1976    }
1977    #[inline(always)]
1978    fn select_u32x4(self, a: mask32x4<Self>, b: u32x4<Self>, c: u32x4<Self>) -> u32x4<Self> {
1979        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1980    }
1981    #[inline(always)]
1982    fn min_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1983        unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) }
1984    }
1985    #[inline(always)]
1986    fn max_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1987        unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) }
1988    }
1989    #[inline(always)]
1990    fn combine_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x8<Self> {
1991        unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
1992    }
1993    #[inline(always)]
1994    fn reinterpret_u8_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
1995        __m128i::from(a).simd_into(self)
1996    }
1997    #[inline(always)]
1998    fn cvt_f32_u32x4(self, a: u32x4<Self>) -> f32x4<Self> {
1999        unsafe {
2000            let a = a.into();
2001            let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000));
2002            let hi = _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000));
2003            let fhi = _mm_sub_ps(
2004                _mm_castsi128_ps(hi),
2005                _mm_set1_ps(f32::from_bits(0x53000080)),
2006            );
2007            let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi);
2008            result.simd_into(self)
2009        }
2010    }
2011    #[inline(always)]
2012    fn splat_mask32x4(self, val: i32) -> mask32x4<Self> {
2013        unsafe { _mm_set1_epi32(val).simd_into(self) }
2014    }
2015    #[inline(always)]
2016    fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
2017        mask32x4 {
2018            val: unsafe { core::mem::transmute_copy(&val) },
2019            simd: self,
2020        }
2021    }
2022    #[inline(always)]
2023    fn load_array_ref_mask32x4(self, val: &[i32; 4usize]) -> mask32x4<Self> {
2024        mask32x4 {
2025            val: unsafe { core::mem::transmute_copy(val) },
2026            simd: self,
2027        }
2028    }
2029    #[inline(always)]
2030    fn as_array_mask32x4(self, a: mask32x4<Self>) -> [i32; 4usize] {
2031        unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) }
2032    }
2033    #[inline(always)]
2034    fn as_array_ref_mask32x4(self, a: &mask32x4<Self>) -> &[i32; 4usize] {
2035        unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) }
2036    }
2037    #[inline(always)]
2038    fn as_array_mut_mask32x4(self, a: &mut mask32x4<Self>) -> &mut [i32; 4usize] {
2039        unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) }
2040    }
2041    #[inline(always)]
2042    fn store_array_mask32x4(self, a: mask32x4<Self>, dest: &mut [i32; 4usize]) -> () {
2043        unsafe {
2044            core::ptr::copy_nonoverlapping(
2045                (&raw const a.val.0) as *const i32,
2046                dest.as_mut_ptr(),
2047                4usize,
2048            );
2049        }
2050    }
2051    #[inline(always)]
2052    fn cvt_from_bytes_mask32x4(self, a: u8x16<Self>) -> mask32x4<Self> {
2053        unsafe {
2054            mask32x4 {
2055                val: core::mem::transmute(a.val),
2056                simd: self,
2057            }
2058        }
2059    }
2060    #[inline(always)]
2061    fn cvt_to_bytes_mask32x4(self, a: mask32x4<Self>) -> u8x16<Self> {
2062        unsafe {
2063            u8x16 {
2064                val: core::mem::transmute(a.val),
2065                simd: self,
2066            }
2067        }
2068    }
2069    #[inline(always)]
2070    fn slide_mask32x4<const SHIFT: usize>(
2071        self,
2072        a: mask32x4<Self>,
2073        b: mask32x4<Self>,
2074    ) -> mask32x4<Self> {
2075        unsafe {
2076            if SHIFT >= 4usize {
2077                return b;
2078            }
2079            let result = dyn_alignr_128(
2080                self.cvt_to_bytes_mask32x4(b).val.0,
2081                self.cvt_to_bytes_mask32x4(a).val.0,
2082                SHIFT * 4usize,
2083            );
2084            self.cvt_from_bytes_mask32x4(u8x16 {
2085                val: crate::support::Aligned128(result),
2086                simd: self,
2087            })
2088        }
2089    }
2090    #[inline(always)]
2091    fn slide_within_blocks_mask32x4<const SHIFT: usize>(
2092        self,
2093        a: mask32x4<Self>,
2094        b: mask32x4<Self>,
2095    ) -> mask32x4<Self> {
2096        self.slide_mask32x4::<SHIFT>(a, b)
2097    }
2098    #[inline(always)]
2099    fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2100        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
2101    }
2102    #[inline(always)]
2103    fn or_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2104        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
2105    }
2106    #[inline(always)]
2107    fn xor_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2108        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
2109    }
2110    #[inline(always)]
2111    fn not_mask32x4(self, a: mask32x4<Self>) -> mask32x4<Self> {
2112        a ^ !0
2113    }
2114    #[inline(always)]
2115    fn select_mask32x4(
2116        self,
2117        a: mask32x4<Self>,
2118        b: mask32x4<Self>,
2119        c: mask32x4<Self>,
2120    ) -> mask32x4<Self> {
2121        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
2122    }
2123    #[inline(always)]
2124    fn simd_eq_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2125        unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
2126    }
2127    #[inline(always)]
2128    fn any_true_mask32x4(self, a: mask32x4<Self>) -> bool {
2129        unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0 }
2130    }
2131    #[inline(always)]
2132    fn all_true_mask32x4(self, a: mask32x4<Self>) -> bool {
2133        unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0b1111 }
2134    }
2135    #[inline(always)]
2136    fn any_false_mask32x4(self, a: mask32x4<Self>) -> bool {
2137        unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0b1111 }
2138    }
2139    #[inline(always)]
2140    fn all_false_mask32x4(self, a: mask32x4<Self>) -> bool {
2141        unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0 }
2142    }
2143    #[inline(always)]
2144    fn combine_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x8<Self> {
2145        unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
2146    }
2147    #[inline(always)]
2148    fn splat_f64x2(self, val: f64) -> f64x2<Self> {
2149        unsafe { _mm_set1_pd(val).simd_into(self) }
2150    }
2151    #[inline(always)]
2152    fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
2153        f64x2 {
2154            val: unsafe { core::mem::transmute_copy(&val) },
2155            simd: self,
2156        }
2157    }
2158    #[inline(always)]
2159    fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
2160        f64x2 {
2161            val: unsafe { core::mem::transmute_copy(val) },
2162            simd: self,
2163        }
2164    }
2165    #[inline(always)]
2166    fn as_array_f64x2(self, a: f64x2<Self>) -> [f64; 2usize] {
2167        unsafe { core::mem::transmute::<__m128d, [f64; 2usize]>(a.val.0) }
2168    }
2169    #[inline(always)]
2170    fn as_array_ref_f64x2(self, a: &f64x2<Self>) -> &[f64; 2usize] {
2171        unsafe { core::mem::transmute::<&__m128d, &[f64; 2usize]>(&a.val.0) }
2172    }
2173    #[inline(always)]
2174    fn as_array_mut_f64x2(self, a: &mut f64x2<Self>) -> &mut [f64; 2usize] {
2175        unsafe { core::mem::transmute::<&mut __m128d, &mut [f64; 2usize]>(&mut a.val.0) }
2176    }
2177    #[inline(always)]
2178    fn store_array_f64x2(self, a: f64x2<Self>, dest: &mut [f64; 2usize]) -> () {
2179        unsafe {
2180            core::ptr::copy_nonoverlapping(
2181                (&raw const a.val.0) as *const f64,
2182                dest.as_mut_ptr(),
2183                2usize,
2184            );
2185        }
2186    }
2187    #[inline(always)]
2188    fn cvt_from_bytes_f64x2(self, a: u8x16<Self>) -> f64x2<Self> {
2189        unsafe {
2190            f64x2 {
2191                val: core::mem::transmute(a.val),
2192                simd: self,
2193            }
2194        }
2195    }
2196    #[inline(always)]
2197    fn cvt_to_bytes_f64x2(self, a: f64x2<Self>) -> u8x16<Self> {
2198        unsafe {
2199            u8x16 {
2200                val: core::mem::transmute(a.val),
2201                simd: self,
2202            }
2203        }
2204    }
2205    #[inline(always)]
2206    fn slide_f64x2<const SHIFT: usize>(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2207        unsafe {
2208            if SHIFT >= 2usize {
2209                return b;
2210            }
2211            let result = dyn_alignr_128(
2212                self.cvt_to_bytes_f64x2(b).val.0,
2213                self.cvt_to_bytes_f64x2(a).val.0,
2214                SHIFT * 8usize,
2215            );
2216            self.cvt_from_bytes_f64x2(u8x16 {
2217                val: crate::support::Aligned128(result),
2218                simd: self,
2219            })
2220        }
2221    }
2222    #[inline(always)]
2223    fn slide_within_blocks_f64x2<const SHIFT: usize>(
2224        self,
2225        a: f64x2<Self>,
2226        b: f64x2<Self>,
2227    ) -> f64x2<Self> {
2228        self.slide_f64x2::<SHIFT>(a, b)
2229    }
2230    #[inline(always)]
2231    fn abs_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2232        unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) }
2233    }
2234    #[inline(always)]
2235    fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2236        unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) }
2237    }
2238    #[inline(always)]
2239    fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2240        unsafe { _mm_sqrt_pd(a.into()).simd_into(self) }
2241    }
2242    #[inline(always)]
2243    fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2244        unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) }
2245    }
2246    #[inline(always)]
2247    fn sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2248        unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) }
2249    }
2250    #[inline(always)]
2251    fn mul_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2252        unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) }
2253    }
2254    #[inline(always)]
2255    fn div_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2256        unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) }
2257    }
2258    #[inline(always)]
2259    fn copysign_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2260        unsafe {
2261            let mask = _mm_set1_pd(-0.0);
2262            _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self)
2263        }
2264    }
2265    #[inline(always)]
2266    fn simd_eq_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2267        unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) }
2268    }
2269    #[inline(always)]
2270    fn simd_lt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2271        unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) }
2272    }
2273    #[inline(always)]
2274    fn simd_le_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2275        unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) }
2276    }
2277    #[inline(always)]
2278    fn simd_ge_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2279        unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) }
2280    }
2281    #[inline(always)]
2282    fn simd_gt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2283        unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) }
2284    }
2285    #[inline(always)]
2286    fn zip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2287        unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) }
2288    }
2289    #[inline(always)]
2290    fn zip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2291        unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) }
2292    }
2293    #[inline(always)]
2294    fn unzip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2295        unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) }
2296    }
2297    #[inline(always)]
2298    fn unzip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2299        unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) }
2300    }
2301    #[inline(always)]
2302    fn interleave_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> (f64x2<Self>, f64x2<Self>) {
2303        (self.zip_low_f64x2(a, b), self.zip_high_f64x2(a, b))
2304    }
2305    #[inline(always)]
2306    fn deinterleave_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> (f64x2<Self>, f64x2<Self>) {
2307        (self.unzip_low_f64x2(a, b), self.unzip_high_f64x2(a, b))
2308    }
2309    #[inline(always)]
2310    fn max_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2311        unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) }
2312    }
2313    #[inline(always)]
2314    fn min_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2315        unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) }
2316    }
2317    #[inline(always)]
2318    fn max_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2319        unsafe {
2320            let intermediate = _mm_max_pd(a.into(), b.into());
2321            let b_is_nan = _mm_cmpunord_pd(b.into(), b.into());
2322            _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self)
2323        }
2324    }
2325    #[inline(always)]
2326    fn min_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2327        unsafe {
2328            let intermediate = _mm_min_pd(a.into(), b.into());
2329            let b_is_nan = _mm_cmpunord_pd(b.into(), b.into());
2330            _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self)
2331        }
2332    }
2333    #[inline(always)]
2334    fn mul_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
2335        unsafe { _mm_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) }
2336    }
2337    #[inline(always)]
2338    fn mul_sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
2339        unsafe { _mm_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) }
2340    }
2341    #[inline(always)]
2342    fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2343        unsafe {
2344            _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
2345        }
2346    }
2347    #[inline(always)]
2348    fn ceil_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2349        unsafe {
2350            _mm_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
2351        }
2352    }
2353    #[inline(always)]
2354    fn round_ties_even_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2355        unsafe {
2356            _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
2357                .simd_into(self)
2358        }
2359    }
2360    #[inline(always)]
2361    fn fract_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2362        a - self.trunc_f64x2(a)
2363    }
2364    #[inline(always)]
2365    fn trunc_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2366        unsafe {
2367            _mm_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
2368        }
2369    }
2370    #[inline(always)]
2371    fn select_f64x2(self, a: mask64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
2372        unsafe { _mm_blendv_pd(c.into(), b.into(), _mm_castsi128_pd(a.into())).simd_into(self) }
2373    }
2374    #[inline(always)]
2375    fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self> {
2376        unsafe { _mm256_setr_m128d(a.into(), b.into()).simd_into(self) }
2377    }
2378    #[inline(always)]
2379    fn reinterpret_f32_f64x2(self, a: f64x2<Self>) -> f32x4<Self> {
2380        unsafe { _mm_castpd_ps(a.into()).simd_into(self) }
2381    }
2382    #[inline(always)]
2383    fn splat_mask64x2(self, val: i64) -> mask64x2<Self> {
2384        unsafe { _mm_set1_epi64x(val).simd_into(self) }
2385    }
2386    #[inline(always)]
2387    fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
2388        mask64x2 {
2389            val: unsafe { core::mem::transmute_copy(&val) },
2390            simd: self,
2391        }
2392    }
2393    #[inline(always)]
2394    fn load_array_ref_mask64x2(self, val: &[i64; 2usize]) -> mask64x2<Self> {
2395        mask64x2 {
2396            val: unsafe { core::mem::transmute_copy(val) },
2397            simd: self,
2398        }
2399    }
2400    #[inline(always)]
2401    fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
2402        unsafe { core::mem::transmute::<__m128i, [i64; 2usize]>(a.val.0) }
2403    }
2404    #[inline(always)]
2405    fn as_array_ref_mask64x2(self, a: &mask64x2<Self>) -> &[i64; 2usize] {
2406        unsafe { core::mem::transmute::<&__m128i, &[i64; 2usize]>(&a.val.0) }
2407    }
2408    #[inline(always)]
2409    fn as_array_mut_mask64x2(self, a: &mut mask64x2<Self>) -> &mut [i64; 2usize] {
2410        unsafe { core::mem::transmute::<&mut __m128i, &mut [i64; 2usize]>(&mut a.val.0) }
2411    }
2412    #[inline(always)]
2413    fn store_array_mask64x2(self, a: mask64x2<Self>, dest: &mut [i64; 2usize]) -> () {
2414        unsafe {
2415            core::ptr::copy_nonoverlapping(
2416                (&raw const a.val.0) as *const i64,
2417                dest.as_mut_ptr(),
2418                2usize,
2419            );
2420        }
2421    }
2422    #[inline(always)]
2423    fn cvt_from_bytes_mask64x2(self, a: u8x16<Self>) -> mask64x2<Self> {
2424        unsafe {
2425            mask64x2 {
2426                val: core::mem::transmute(a.val),
2427                simd: self,
2428            }
2429        }
2430    }
2431    #[inline(always)]
2432    fn cvt_to_bytes_mask64x2(self, a: mask64x2<Self>) -> u8x16<Self> {
2433        unsafe {
2434            u8x16 {
2435                val: core::mem::transmute(a.val),
2436                simd: self,
2437            }
2438        }
2439    }
2440    #[inline(always)]
2441    fn slide_mask64x2<const SHIFT: usize>(
2442        self,
2443        a: mask64x2<Self>,
2444        b: mask64x2<Self>,
2445    ) -> mask64x2<Self> {
2446        unsafe {
2447            if SHIFT >= 2usize {
2448                return b;
2449            }
2450            let result = dyn_alignr_128(
2451                self.cvt_to_bytes_mask64x2(b).val.0,
2452                self.cvt_to_bytes_mask64x2(a).val.0,
2453                SHIFT * 8usize,
2454            );
2455            self.cvt_from_bytes_mask64x2(u8x16 {
2456                val: crate::support::Aligned128(result),
2457                simd: self,
2458            })
2459        }
2460    }
2461    #[inline(always)]
2462    fn slide_within_blocks_mask64x2<const SHIFT: usize>(
2463        self,
2464        a: mask64x2<Self>,
2465        b: mask64x2<Self>,
2466    ) -> mask64x2<Self> {
2467        self.slide_mask64x2::<SHIFT>(a, b)
2468    }
2469    #[inline(always)]
2470    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2471        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
2472    }
2473    #[inline(always)]
2474    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2475        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
2476    }
2477    #[inline(always)]
2478    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2479        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
2480    }
2481    #[inline(always)]
2482    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
2483        a ^ !0
2484    }
2485    #[inline(always)]
2486    fn select_mask64x2(
2487        self,
2488        a: mask64x2<Self>,
2489        b: mask64x2<Self>,
2490        c: mask64x2<Self>,
2491    ) -> mask64x2<Self> {
2492        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
2493    }
2494    #[inline(always)]
2495    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2496        unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) }
2497    }
2498    #[inline(always)]
2499    fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
2500        unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 }
2501    }
2502    #[inline(always)]
2503    fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
2504        unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 }
2505    }
2506    #[inline(always)]
2507    fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
2508        unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 }
2509    }
2510    #[inline(always)]
2511    fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
2512        unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 }
2513    }
2514    #[inline(always)]
2515    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
2516        unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
2517    }
2518    #[inline(always)]
2519    fn splat_f32x8(self, val: f32) -> f32x8<Self> {
2520        unsafe { _mm256_set1_ps(val).simd_into(self) }
2521    }
2522    #[inline(always)]
2523    fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
2524        f32x8 {
2525            val: unsafe { core::mem::transmute_copy(&val) },
2526            simd: self,
2527        }
2528    }
2529    #[inline(always)]
2530    fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
2531        f32x8 {
2532            val: unsafe { core::mem::transmute_copy(val) },
2533            simd: self,
2534        }
2535    }
2536    #[inline(always)]
2537    fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
2538        unsafe { core::mem::transmute::<__m256, [f32; 8usize]>(a.val.0) }
2539    }
2540    #[inline(always)]
2541    fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
2542        unsafe { core::mem::transmute::<&__m256, &[f32; 8usize]>(&a.val.0) }
2543    }
2544    #[inline(always)]
2545    fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
2546        unsafe { core::mem::transmute::<&mut __m256, &mut [f32; 8usize]>(&mut a.val.0) }
2547    }
2548    #[inline(always)]
2549    fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
2550        unsafe {
2551            core::ptr::copy_nonoverlapping(
2552                (&raw const a.val.0) as *const f32,
2553                dest.as_mut_ptr(),
2554                8usize,
2555            );
2556        }
2557    }
2558    #[inline(always)]
2559    fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
2560        unsafe {
2561            f32x8 {
2562                val: core::mem::transmute(a.val),
2563                simd: self,
2564            }
2565        }
2566    }
2567    #[inline(always)]
2568    fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
2569        unsafe {
2570            u8x32 {
2571                val: core::mem::transmute(a.val),
2572                simd: self,
2573            }
2574        }
2575    }
2576    #[inline(always)]
2577    fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2578        unsafe {
2579            if SHIFT >= 8usize {
2580                return b;
2581            }
2582            let result = cross_block_alignr_256x1(
2583                self.cvt_to_bytes_f32x8(b).val.0,
2584                self.cvt_to_bytes_f32x8(a).val.0,
2585                SHIFT * 4usize,
2586            );
2587            self.cvt_from_bytes_f32x8(u8x32 {
2588                val: crate::support::Aligned256(result),
2589                simd: self,
2590            })
2591        }
2592    }
2593    #[inline(always)]
2594    fn slide_within_blocks_f32x8<const SHIFT: usize>(
2595        self,
2596        a: f32x8<Self>,
2597        b: f32x8<Self>,
2598    ) -> f32x8<Self> {
2599        unsafe {
2600            if SHIFT >= 4usize {
2601                return b;
2602            }
2603            let result = dyn_alignr_256(
2604                self.cvt_to_bytes_f32x8(b).val.0,
2605                self.cvt_to_bytes_f32x8(a).val.0,
2606                SHIFT * 4usize,
2607            );
2608            self.cvt_from_bytes_f32x8(u8x32 {
2609                val: crate::support::Aligned256(result),
2610                simd: self,
2611            })
2612        }
2613    }
2614    #[inline(always)]
2615    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2616        unsafe { _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(self) }
2617    }
2618    #[inline(always)]
2619    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2620        unsafe { _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(self) }
2621    }
2622    #[inline(always)]
2623    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2624        unsafe { _mm256_sqrt_ps(a.into()).simd_into(self) }
2625    }
2626    #[inline(always)]
2627    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2628        unsafe { _mm256_add_ps(a.into(), b.into()).simd_into(self) }
2629    }
2630    #[inline(always)]
2631    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2632        unsafe { _mm256_sub_ps(a.into(), b.into()).simd_into(self) }
2633    }
2634    #[inline(always)]
2635    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2636        unsafe { _mm256_mul_ps(a.into(), b.into()).simd_into(self) }
2637    }
2638    #[inline(always)]
2639    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2640        unsafe { _mm256_div_ps(a.into(), b.into()).simd_into(self) }
2641    }
2642    #[inline(always)]
2643    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2644        unsafe {
2645            let mask = _mm256_set1_ps(-0.0);
2646            _mm256_or_ps(
2647                _mm256_and_ps(mask, b.into()),
2648                _mm256_andnot_ps(mask, a.into()),
2649            )
2650            .simd_into(self)
2651        }
2652    }
2653    #[inline(always)]
2654    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2655        unsafe { _mm256_castps_si256(_mm256_cmp_ps::<0i32>(a.into(), b.into())).simd_into(self) }
2656    }
2657    #[inline(always)]
2658    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2659        unsafe { _mm256_castps_si256(_mm256_cmp_ps::<17i32>(a.into(), b.into())).simd_into(self) }
2660    }
2661    #[inline(always)]
2662    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2663        unsafe { _mm256_castps_si256(_mm256_cmp_ps::<18i32>(a.into(), b.into())).simd_into(self) }
2664    }
2665    #[inline(always)]
2666    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2667        unsafe { _mm256_castps_si256(_mm256_cmp_ps::<29i32>(a.into(), b.into())).simd_into(self) }
2668    }
2669    #[inline(always)]
2670    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2671        unsafe { _mm256_castps_si256(_mm256_cmp_ps::<30i32>(a.into(), b.into())).simd_into(self) }
2672    }
2673    #[inline(always)]
2674    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2675        unsafe {
2676            let lo = _mm256_unpacklo_ps(a.into(), b.into());
2677            let hi = _mm256_unpackhi_ps(a.into(), b.into());
2678            _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(self)
2679        }
2680    }
2681    #[inline(always)]
2682    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2683        unsafe {
2684            let lo = _mm256_unpacklo_ps(a.into(), b.into());
2685            let hi = _mm256_unpackhi_ps(a.into(), b.into());
2686            _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(self)
2687        }
2688    }
2689    #[inline(always)]
2690    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2691        unsafe {
2692            let t1 = _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
2693            let t2 = _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
2694            _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(self)
2695        }
2696    }
2697    #[inline(always)]
2698    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2699        unsafe {
2700            let t1 = _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
2701            let t2 = _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
2702            _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(self)
2703        }
2704    }
2705    #[inline(always)]
2706    fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
2707        unsafe {
2708            let lo = _mm256_unpacklo_ps(a.into(), b.into());
2709            let hi = _mm256_unpackhi_ps(a.into(), b.into());
2710            (
2711                _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(self),
2712                _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(self),
2713            )
2714        }
2715    }
2716    #[inline(always)]
2717    fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
2718        unsafe {
2719            let t1 = _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
2720            let t2 = _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
2721            (
2722                _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(self),
2723                _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(self),
2724            )
2725        }
2726    }
2727    #[inline(always)]
2728    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2729        unsafe { _mm256_max_ps(a.into(), b.into()).simd_into(self) }
2730    }
2731    #[inline(always)]
2732    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2733        unsafe { _mm256_min_ps(a.into(), b.into()).simd_into(self) }
2734    }
2735    #[inline(always)]
2736    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2737        unsafe {
2738            let intermediate = _mm256_max_ps(a.into(), b.into());
2739            let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into());
2740            _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self)
2741        }
2742    }
2743    #[inline(always)]
2744    fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2745        unsafe {
2746            let intermediate = _mm256_min_ps(a.into(), b.into());
2747            let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into());
2748            _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self)
2749        }
2750    }
2751    #[inline(always)]
2752    fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
2753        unsafe { _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) }
2754    }
2755    #[inline(always)]
2756    fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
2757        unsafe { _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) }
2758    }
2759    #[inline(always)]
2760    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2761        unsafe {
2762            _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
2763                .simd_into(self)
2764        }
2765    }
2766    #[inline(always)]
2767    fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2768        unsafe {
2769            _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
2770                .simd_into(self)
2771        }
2772    }
2773    #[inline(always)]
2774    fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2775        unsafe {
2776            _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
2777                .simd_into(self)
2778        }
2779    }
2780    #[inline(always)]
2781    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2782        a - self.trunc_f32x8(a)
2783    }
2784    #[inline(always)]
2785    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2786        unsafe {
2787            _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
2788        }
2789    }
2790    #[inline(always)]
2791    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
2792        unsafe {
2793            _mm256_blendv_ps(c.into(), b.into(), _mm256_castsi256_ps(a.into())).simd_into(self)
2794        }
2795    }
2796    #[inline(always)]
2797    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
2798        f32x16 {
2799            val: crate::support::Aligned512([a.val.0, b.val.0]),
2800            simd: self,
2801        }
2802    }
2803    #[inline(always)]
2804    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
2805        unsafe {
2806            (
2807                _mm256_extractf128_ps::<0>(a.into()).simd_into(self),
2808                _mm256_extractf128_ps::<1>(a.into()).simd_into(self),
2809            )
2810        }
2811    }
2812    #[inline(always)]
2813    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
2814        unsafe { _mm256_castps_pd(a.into()).simd_into(self) }
2815    }
2816    #[inline(always)]
2817    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
2818        unsafe { _mm256_castps_si256(a.into()).simd_into(self) }
2819    }
2820    #[inline(always)]
2821    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
2822        unsafe { _mm256_castps_si256(a.into()).simd_into(self) }
2823    }
2824    #[inline(always)]
2825    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
2826        unsafe { _mm256_castps_si256(a.into()).simd_into(self) }
2827    }
2828    #[inline(always)]
2829    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
2830        unsafe {
2831            let mut converted = _mm256_cvttps_epi32(a.into());
2832            let in_range = _mm256_cmp_ps::<17i32>(a.into(), _mm256_set1_ps(2147483648.0));
2833            let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
2834            if !all_in_range {
2835                let excess = _mm256_sub_ps(a.into(), _mm256_set1_ps(2147483648.0));
2836                let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess));
2837                converted = _mm256_add_epi32(converted, excess_converted);
2838            }
2839            converted.simd_into(self)
2840        }
2841    }
2842    #[inline(always)]
2843    fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
2844        unsafe {
2845            let a = _mm256_max_ps(a.into(), _mm256_setzero_ps());
2846            let mut converted = _mm256_cvttps_epi32(a);
2847            let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0));
2848            let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
2849            if !all_in_range {
2850                let exceeds_unsigned_range =
2851                    _mm256_castps_si256(_mm256_cmp_ps::<17i32>(_mm256_set1_ps(4294967040.0), a));
2852                let excess = _mm256_sub_ps(a, _mm256_set1_ps(2147483648.0));
2853                let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess));
2854                converted = _mm256_add_epi32(converted, excess_converted);
2855                converted = _mm256_blendv_epi8(
2856                    converted,
2857                    _mm256_set1_epi32(u32::MAX.cast_signed()),
2858                    exceeds_unsigned_range,
2859                );
2860            }
2861            converted.simd_into(self)
2862        }
2863    }
2864    #[inline(always)]
2865    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
2866        unsafe { _mm256_cvttps_epi32(a.into()).simd_into(self) }
2867    }
2868    #[inline(always)]
2869    fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
2870        unsafe {
2871            let a = a.into();
2872            let mut converted = _mm256_cvttps_epi32(a);
2873            let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0));
2874            let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
2875            if !all_in_range {
2876                converted = _mm256_blendv_epi8(
2877                    _mm256_set1_epi32(i32::MAX),
2878                    converted,
2879                    _mm256_castps_si256(in_range),
2880                );
2881                let is_not_nan = _mm256_castps_si256(_mm256_cmp_ps::<7i32>(a, a));
2882                converted = _mm256_and_si256(converted, is_not_nan);
2883            }
2884            converted.simd_into(self)
2885        }
2886    }
2887    #[inline(always)]
2888    fn splat_i8x32(self, val: i8) -> i8x32<Self> {
2889        unsafe { _mm256_set1_epi8(val).simd_into(self) }
2890    }
2891    #[inline(always)]
2892    fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
2893        i8x32 {
2894            val: unsafe { core::mem::transmute_copy(&val) },
2895            simd: self,
2896        }
2897    }
2898    #[inline(always)]
2899    fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
2900        i8x32 {
2901            val: unsafe { core::mem::transmute_copy(val) },
2902            simd: self,
2903        }
2904    }
2905    #[inline(always)]
2906    fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
2907        unsafe { core::mem::transmute::<__m256i, [i8; 32usize]>(a.val.0) }
2908    }
2909    #[inline(always)]
2910    fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
2911        unsafe { core::mem::transmute::<&__m256i, &[i8; 32usize]>(&a.val.0) }
2912    }
2913    #[inline(always)]
2914    fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
2915        unsafe { core::mem::transmute::<&mut __m256i, &mut [i8; 32usize]>(&mut a.val.0) }
2916    }
2917    #[inline(always)]
2918    fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
2919        unsafe {
2920            core::ptr::copy_nonoverlapping(
2921                (&raw const a.val.0) as *const i8,
2922                dest.as_mut_ptr(),
2923                32usize,
2924            );
2925        }
2926    }
2927    #[inline(always)]
2928    fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
2929        unsafe {
2930            i8x32 {
2931                val: core::mem::transmute(a.val),
2932                simd: self,
2933            }
2934        }
2935    }
2936    #[inline(always)]
2937    fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
2938        unsafe {
2939            u8x32 {
2940                val: core::mem::transmute(a.val),
2941                simd: self,
2942            }
2943        }
2944    }
2945    #[inline(always)]
2946    fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
2947        unsafe {
2948            if SHIFT >= 32usize {
2949                return b;
2950            }
2951            let result = cross_block_alignr_256x1(
2952                self.cvt_to_bytes_i8x32(b).val.0,
2953                self.cvt_to_bytes_i8x32(a).val.0,
2954                SHIFT,
2955            );
2956            self.cvt_from_bytes_i8x32(u8x32 {
2957                val: crate::support::Aligned256(result),
2958                simd: self,
2959            })
2960        }
2961    }
2962    #[inline(always)]
2963    fn slide_within_blocks_i8x32<const SHIFT: usize>(
2964        self,
2965        a: i8x32<Self>,
2966        b: i8x32<Self>,
2967    ) -> i8x32<Self> {
2968        unsafe {
2969            if SHIFT >= 16usize {
2970                return b;
2971            }
2972            let result = dyn_alignr_256(
2973                self.cvt_to_bytes_i8x32(b).val.0,
2974                self.cvt_to_bytes_i8x32(a).val.0,
2975                SHIFT,
2976            );
2977            self.cvt_from_bytes_i8x32(u8x32 {
2978                val: crate::support::Aligned256(result),
2979                simd: self,
2980            })
2981        }
2982    }
2983    #[inline(always)]
2984    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
2985        unsafe { _mm256_add_epi8(a.into(), b.into()).simd_into(self) }
2986    }
2987    #[inline(always)]
2988    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
2989        unsafe { _mm256_sub_epi8(a.into(), b.into()).simd_into(self) }
2990    }
2991    #[inline(always)]
2992    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
2993        unsafe {
2994            let dst_even = _mm256_mullo_epi16(a.into(), b.into());
2995            let dst_odd = _mm256_mullo_epi16(
2996                _mm256_srli_epi16::<8>(a.into()),
2997                _mm256_srli_epi16::<8>(b.into()),
2998            );
2999            _mm256_or_si256(
3000                _mm256_slli_epi16(dst_odd, 8),
3001                _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)),
3002            )
3003            .simd_into(self)
3004        }
3005    }
3006    #[inline(always)]
3007    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3008        unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
3009    }
3010    #[inline(always)]
3011    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3012        unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
3013    }
3014    #[inline(always)]
3015    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3016        unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
3017    }
3018    #[inline(always)]
3019    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
3020        a ^ !0
3021    }
3022    #[inline(always)]
3023    fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
3024        unsafe {
3025            let val = a.into();
3026            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
3027            let lo_16 = _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
3028            let hi_16 = _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
3029            let lo_shifted = _mm256_sll_epi16(lo_16, shift_count);
3030            let hi_shifted = _mm256_sll_epi16(hi_16, shift_count);
3031            _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
3032        }
3033    }
3034    #[inline(always)]
3035    fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3036        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
3037    }
3038    #[inline(always)]
3039    fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
3040        unsafe {
3041            let val = a.into();
3042            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
3043            let lo_16 = _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
3044            let hi_16 = _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
3045            let lo_shifted = _mm256_sra_epi16(lo_16, shift_count);
3046            let hi_shifted = _mm256_sra_epi16(hi_16, shift_count);
3047            _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
3048        }
3049    }
3050    #[inline(always)]
3051    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3052        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
3053    }
3054    #[inline(always)]
3055    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3056        unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
3057    }
3058    #[inline(always)]
3059    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3060        unsafe { _mm256_cmpgt_epi8(b.into(), a.into()).simd_into(self) }
3061    }
3062    #[inline(always)]
3063    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3064        unsafe { _mm256_cmpeq_epi8(_mm256_min_epi8(a.into(), b.into()), a.into()).simd_into(self) }
3065    }
3066    #[inline(always)]
3067    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3068        unsafe { _mm256_cmpeq_epi8(_mm256_max_epi8(a.into(), b.into()), a.into()).simd_into(self) }
3069    }
3070    #[inline(always)]
3071    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3072        unsafe { _mm256_cmpgt_epi8(a.into(), b.into()).simd_into(self) }
3073    }
3074    #[inline(always)]
3075    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3076        unsafe {
3077            let lo = _mm256_unpacklo_epi8(a.into(), b.into());
3078            let hi = _mm256_unpackhi_epi8(a.into(), b.into());
3079            _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self)
3080        }
3081    }
3082    #[inline(always)]
3083    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3084        unsafe {
3085            let lo = _mm256_unpacklo_epi8(a.into(), b.into());
3086            let hi = _mm256_unpackhi_epi8(a.into(), b.into());
3087            _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self)
3088        }
3089    }
3090    #[inline(always)]
3091    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3092        unsafe {
3093            let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3094                a.into(),
3095                _mm256_setr_epi8(
3096                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3097                    14, 1, 3, 5, 7, 9, 11, 13, 15,
3098                ),
3099            ));
3100            let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3101                b.into(),
3102                _mm256_setr_epi8(
3103                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3104                    14, 1, 3, 5, 7, 9, 11, 13, 15,
3105                ),
3106            ));
3107            _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self)
3108        }
3109    }
3110    #[inline(always)]
3111    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3112        unsafe {
3113            let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3114                a.into(),
3115                _mm256_setr_epi8(
3116                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3117                    14, 1, 3, 5, 7, 9, 11, 13, 15,
3118                ),
3119            ));
3120            let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3121                b.into(),
3122                _mm256_setr_epi8(
3123                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3124                    14, 1, 3, 5, 7, 9, 11, 13, 15,
3125                ),
3126            ));
3127            _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self)
3128        }
3129    }
3130    #[inline(always)]
3131    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
3132        unsafe {
3133            let lo = _mm256_unpacklo_epi8(a.into(), b.into());
3134            let hi = _mm256_unpackhi_epi8(a.into(), b.into());
3135            (
3136                _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self),
3137                _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self),
3138            )
3139        }
3140    }
3141    #[inline(always)]
3142    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
3143        unsafe {
3144            let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3145                a.into(),
3146                _mm256_setr_epi8(
3147                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3148                    14, 1, 3, 5, 7, 9, 11, 13, 15,
3149                ),
3150            ));
3151            let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3152                b.into(),
3153                _mm256_setr_epi8(
3154                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3155                    14, 1, 3, 5, 7, 9, 11, 13, 15,
3156                ),
3157            ));
3158            (
3159                _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self),
3160                _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self),
3161            )
3162        }
3163    }
3164    #[inline(always)]
3165    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
3166        unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
3167    }
3168    #[inline(always)]
3169    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3170        unsafe { _mm256_min_epi8(a.into(), b.into()).simd_into(self) }
3171    }
3172    #[inline(always)]
3173    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3174        unsafe { _mm256_max_epi8(a.into(), b.into()).simd_into(self) }
3175    }
3176    #[inline(always)]
3177    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
3178        i8x64 {
3179            val: crate::support::Aligned512([a.val.0, b.val.0]),
3180            simd: self,
3181        }
3182    }
3183    #[inline(always)]
3184    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
3185        unsafe {
3186            (
3187                _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
3188                _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
3189            )
3190        }
3191    }
3192    #[inline(always)]
3193    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
3194        unsafe { _mm256_sub_epi8(_mm256_setzero_si256(), a.into()).simd_into(self) }
3195    }
3196    #[inline(always)]
3197    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
3198        __m256i::from(a).simd_into(self)
3199    }
3200    #[inline(always)]
3201    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
3202        __m256i::from(a).simd_into(self)
3203    }
3204    #[inline(always)]
3205    fn splat_u8x32(self, val: u8) -> u8x32<Self> {
3206        unsafe { _mm256_set1_epi8(val.cast_signed()).simd_into(self) }
3207    }
3208    #[inline(always)]
3209    fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
3210        u8x32 {
3211            val: unsafe { core::mem::transmute_copy(&val) },
3212            simd: self,
3213        }
3214    }
3215    #[inline(always)]
3216    fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
3217        u8x32 {
3218            val: unsafe { core::mem::transmute_copy(val) },
3219            simd: self,
3220        }
3221    }
3222    #[inline(always)]
3223    fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
3224        unsafe { core::mem::transmute::<__m256i, [u8; 32usize]>(a.val.0) }
3225    }
3226    #[inline(always)]
3227    fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
3228        unsafe { core::mem::transmute::<&__m256i, &[u8; 32usize]>(&a.val.0) }
3229    }
3230    #[inline(always)]
3231    fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
3232        unsafe { core::mem::transmute::<&mut __m256i, &mut [u8; 32usize]>(&mut a.val.0) }
3233    }
3234    #[inline(always)]
3235    fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
3236        unsafe {
3237            core::ptr::copy_nonoverlapping(
3238                (&raw const a.val.0) as *const u8,
3239                dest.as_mut_ptr(),
3240                32usize,
3241            );
3242        }
3243    }
3244    #[inline(always)]
3245    fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
3246        unsafe {
3247            u8x32 {
3248                val: core::mem::transmute(a.val),
3249                simd: self,
3250            }
3251        }
3252    }
3253    #[inline(always)]
3254    fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
3255        unsafe {
3256            u8x32 {
3257                val: core::mem::transmute(a.val),
3258                simd: self,
3259            }
3260        }
3261    }
3262    #[inline(always)]
3263    fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3264        unsafe {
3265            if SHIFT >= 32usize {
3266                return b;
3267            }
3268            let result = cross_block_alignr_256x1(
3269                self.cvt_to_bytes_u8x32(b).val.0,
3270                self.cvt_to_bytes_u8x32(a).val.0,
3271                SHIFT,
3272            );
3273            self.cvt_from_bytes_u8x32(u8x32 {
3274                val: crate::support::Aligned256(result),
3275                simd: self,
3276            })
3277        }
3278    }
3279    #[inline(always)]
3280    fn slide_within_blocks_u8x32<const SHIFT: usize>(
3281        self,
3282        a: u8x32<Self>,
3283        b: u8x32<Self>,
3284    ) -> u8x32<Self> {
3285        unsafe {
3286            if SHIFT >= 16usize {
3287                return b;
3288            }
3289            let result = dyn_alignr_256(
3290                self.cvt_to_bytes_u8x32(b).val.0,
3291                self.cvt_to_bytes_u8x32(a).val.0,
3292                SHIFT,
3293            );
3294            self.cvt_from_bytes_u8x32(u8x32 {
3295                val: crate::support::Aligned256(result),
3296                simd: self,
3297            })
3298        }
3299    }
3300    #[inline(always)]
3301    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3302        unsafe { _mm256_add_epi8(a.into(), b.into()).simd_into(self) }
3303    }
3304    #[inline(always)]
3305    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3306        unsafe { _mm256_sub_epi8(a.into(), b.into()).simd_into(self) }
3307    }
3308    #[inline(always)]
3309    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3310        unsafe {
3311            let dst_even = _mm256_mullo_epi16(a.into(), b.into());
3312            let dst_odd = _mm256_mullo_epi16(
3313                _mm256_srli_epi16::<8>(a.into()),
3314                _mm256_srli_epi16::<8>(b.into()),
3315            );
3316            _mm256_or_si256(
3317                _mm256_slli_epi16(dst_odd, 8),
3318                _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)),
3319            )
3320            .simd_into(self)
3321        }
3322    }
3323    #[inline(always)]
3324    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3325        unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
3326    }
3327    #[inline(always)]
3328    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3329        unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
3330    }
3331    #[inline(always)]
3332    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3333        unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
3334    }
3335    #[inline(always)]
3336    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
3337        a ^ !0
3338    }
3339    #[inline(always)]
3340    fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
3341        unsafe {
3342            let val = a.into();
3343            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
3344            let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256());
3345            let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256());
3346            let lo_shifted = _mm256_sll_epi16(lo_16, shift_count);
3347            let hi_shifted = _mm256_sll_epi16(hi_16, shift_count);
3348            _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
3349        }
3350    }
3351    #[inline(always)]
3352    fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3353        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
3354    }
3355    #[inline(always)]
3356    fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
3357        unsafe {
3358            let val = a.into();
3359            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
3360            let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256());
3361            let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256());
3362            let lo_shifted = _mm256_srl_epi16(lo_16, shift_count);
3363            let hi_shifted = _mm256_srl_epi16(hi_16, shift_count);
3364            _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
3365        }
3366    }
3367    #[inline(always)]
3368    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3369        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
3370    }
3371    #[inline(always)]
3372    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3373        unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
3374    }
3375    #[inline(always)]
3376    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3377        unsafe {
3378            let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed());
3379            let a_signed = _mm256_xor_si256(a.into(), sign_bit);
3380            let b_signed = _mm256_xor_si256(b.into(), sign_bit);
3381            _mm256_cmpgt_epi8(b_signed, a_signed).simd_into(self)
3382        }
3383    }
3384    #[inline(always)]
3385    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3386        unsafe { _mm256_cmpeq_epi8(_mm256_min_epu8(a.into(), b.into()), a.into()).simd_into(self) }
3387    }
3388    #[inline(always)]
3389    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3390        unsafe { _mm256_cmpeq_epi8(_mm256_max_epu8(a.into(), b.into()), a.into()).simd_into(self) }
3391    }
3392    #[inline(always)]
3393    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3394        unsafe {
3395            let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed());
3396            let a_signed = _mm256_xor_si256(a.into(), sign_bit);
3397            let b_signed = _mm256_xor_si256(b.into(), sign_bit);
3398            _mm256_cmpgt_epi8(a_signed, b_signed).simd_into(self)
3399        }
3400    }
3401    #[inline(always)]
3402    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3403        unsafe {
3404            let lo = _mm256_unpacklo_epi8(a.into(), b.into());
3405            let hi = _mm256_unpackhi_epi8(a.into(), b.into());
3406            _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self)
3407        }
3408    }
3409    #[inline(always)]
3410    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3411        unsafe {
3412            let lo = _mm256_unpacklo_epi8(a.into(), b.into());
3413            let hi = _mm256_unpackhi_epi8(a.into(), b.into());
3414            _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self)
3415        }
3416    }
3417    #[inline(always)]
3418    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3419        unsafe {
3420            let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3421                a.into(),
3422                _mm256_setr_epi8(
3423                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3424                    14, 1, 3, 5, 7, 9, 11, 13, 15,
3425                ),
3426            ));
3427            let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3428                b.into(),
3429                _mm256_setr_epi8(
3430                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3431                    14, 1, 3, 5, 7, 9, 11, 13, 15,
3432                ),
3433            ));
3434            _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self)
3435        }
3436    }
3437    #[inline(always)]
3438    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3439        unsafe {
3440            let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3441                a.into(),
3442                _mm256_setr_epi8(
3443                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3444                    14, 1, 3, 5, 7, 9, 11, 13, 15,
3445                ),
3446            ));
3447            let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3448                b.into(),
3449                _mm256_setr_epi8(
3450                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3451                    14, 1, 3, 5, 7, 9, 11, 13, 15,
3452                ),
3453            ));
3454            _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self)
3455        }
3456    }
3457    #[inline(always)]
3458    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
3459        unsafe {
3460            let lo = _mm256_unpacklo_epi8(a.into(), b.into());
3461            let hi = _mm256_unpackhi_epi8(a.into(), b.into());
3462            (
3463                _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self),
3464                _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self),
3465            )
3466        }
3467    }
3468    #[inline(always)]
3469    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
3470        unsafe {
3471            let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3472                a.into(),
3473                _mm256_setr_epi8(
3474                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3475                    14, 1, 3, 5, 7, 9, 11, 13, 15,
3476                ),
3477            ));
3478            let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3479                b.into(),
3480                _mm256_setr_epi8(
3481                    0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3482                    14, 1, 3, 5, 7, 9, 11, 13, 15,
3483                ),
3484            ));
3485            (
3486                _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self),
3487                _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self),
3488            )
3489        }
3490    }
3491    #[inline(always)]
3492    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
3493        unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
3494    }
3495    #[inline(always)]
3496    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3497        unsafe { _mm256_min_epu8(a.into(), b.into()).simd_into(self) }
3498    }
3499    #[inline(always)]
3500    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3501        unsafe { _mm256_max_epu8(a.into(), b.into()).simd_into(self) }
3502    }
3503    #[inline(always)]
3504    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
3505        u8x64 {
3506            val: crate::support::Aligned512([a.val.0, b.val.0]),
3507            simd: self,
3508        }
3509    }
3510    #[inline(always)]
3511    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
3512        unsafe {
3513            (
3514                _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
3515                _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
3516            )
3517        }
3518    }
3519    #[inline(always)]
3520    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
3521        unsafe {
3522            let (a0, a1) = self.split_u8x32(a);
3523            let high = _mm256_cvtepu8_epi16(a0.into()).simd_into(self);
3524            let low = _mm256_cvtepu8_epi16(a1.into()).simd_into(self);
3525            self.combine_u16x16(high, low)
3526        }
3527    }
3528    #[inline(always)]
3529    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
3530        __m256i::from(a).simd_into(self)
3531    }
3532    #[inline(always)]
3533    fn splat_mask8x32(self, val: i8) -> mask8x32<Self> {
3534        unsafe { _mm256_set1_epi8(val).simd_into(self) }
3535    }
3536    #[inline(always)]
3537    fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
3538        mask8x32 {
3539            val: unsafe { core::mem::transmute_copy(&val) },
3540            simd: self,
3541        }
3542    }
3543    #[inline(always)]
3544    fn load_array_ref_mask8x32(self, val: &[i8; 32usize]) -> mask8x32<Self> {
3545        mask8x32 {
3546            val: unsafe { core::mem::transmute_copy(val) },
3547            simd: self,
3548        }
3549    }
3550    #[inline(always)]
3551    fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
3552        unsafe { core::mem::transmute::<__m256i, [i8; 32usize]>(a.val.0) }
3553    }
3554    #[inline(always)]
3555    fn as_array_ref_mask8x32(self, a: &mask8x32<Self>) -> &[i8; 32usize] {
3556        unsafe { core::mem::transmute::<&__m256i, &[i8; 32usize]>(&a.val.0) }
3557    }
3558    #[inline(always)]
3559    fn as_array_mut_mask8x32(self, a: &mut mask8x32<Self>) -> &mut [i8; 32usize] {
3560        unsafe { core::mem::transmute::<&mut __m256i, &mut [i8; 32usize]>(&mut a.val.0) }
3561    }
3562    #[inline(always)]
3563    fn store_array_mask8x32(self, a: mask8x32<Self>, dest: &mut [i8; 32usize]) -> () {
3564        unsafe {
3565            core::ptr::copy_nonoverlapping(
3566                (&raw const a.val.0) as *const i8,
3567                dest.as_mut_ptr(),
3568                32usize,
3569            );
3570        }
3571    }
3572    #[inline(always)]
3573    fn cvt_from_bytes_mask8x32(self, a: u8x32<Self>) -> mask8x32<Self> {
3574        unsafe {
3575            mask8x32 {
3576                val: core::mem::transmute(a.val),
3577                simd: self,
3578            }
3579        }
3580    }
3581    #[inline(always)]
3582    fn cvt_to_bytes_mask8x32(self, a: mask8x32<Self>) -> u8x32<Self> {
3583        unsafe {
3584            u8x32 {
3585                val: core::mem::transmute(a.val),
3586                simd: self,
3587            }
3588        }
3589    }
3590    #[inline(always)]
3591    fn slide_mask8x32<const SHIFT: usize>(
3592        self,
3593        a: mask8x32<Self>,
3594        b: mask8x32<Self>,
3595    ) -> mask8x32<Self> {
3596        unsafe {
3597            if SHIFT >= 32usize {
3598                return b;
3599            }
3600            let result = cross_block_alignr_256x1(
3601                self.cvt_to_bytes_mask8x32(b).val.0,
3602                self.cvt_to_bytes_mask8x32(a).val.0,
3603                SHIFT,
3604            );
3605            self.cvt_from_bytes_mask8x32(u8x32 {
3606                val: crate::support::Aligned256(result),
3607                simd: self,
3608            })
3609        }
3610    }
3611    #[inline(always)]
3612    fn slide_within_blocks_mask8x32<const SHIFT: usize>(
3613        self,
3614        a: mask8x32<Self>,
3615        b: mask8x32<Self>,
3616    ) -> mask8x32<Self> {
3617        unsafe {
3618            if SHIFT >= 16usize {
3619                return b;
3620            }
3621            let result = dyn_alignr_256(
3622                self.cvt_to_bytes_mask8x32(b).val.0,
3623                self.cvt_to_bytes_mask8x32(a).val.0,
3624                SHIFT,
3625            );
3626            self.cvt_from_bytes_mask8x32(u8x32 {
3627                val: crate::support::Aligned256(result),
3628                simd: self,
3629            })
3630        }
3631    }
3632    #[inline(always)]
3633    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3634        unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
3635    }
3636    #[inline(always)]
3637    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3638        unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
3639    }
3640    #[inline(always)]
3641    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3642        unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
3643    }
3644    #[inline(always)]
3645    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
3646        a ^ !0
3647    }
3648    #[inline(always)]
3649    fn select_mask8x32(
3650        self,
3651        a: mask8x32<Self>,
3652        b: mask8x32<Self>,
3653        c: mask8x32<Self>,
3654    ) -> mask8x32<Self> {
3655        unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
3656    }
3657    #[inline(always)]
3658    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3659        unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
3660    }
3661    #[inline(always)]
3662    fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
3663        unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0 }
3664    }
3665    #[inline(always)]
3666    fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
3667        unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff }
3668    }
3669    #[inline(always)]
3670    fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
3671        unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff }
3672    }
3673    #[inline(always)]
3674    fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
3675        unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0 }
3676    }
3677    #[inline(always)]
3678    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
3679        mask8x64 {
3680            val: crate::support::Aligned512([a.val.0, b.val.0]),
3681            simd: self,
3682        }
3683    }
3684    #[inline(always)]
3685    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
3686        unsafe {
3687            (
3688                _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
3689                _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
3690            )
3691        }
3692    }
3693    #[inline(always)]
3694    fn splat_i16x16(self, val: i16) -> i16x16<Self> {
3695        unsafe { _mm256_set1_epi16(val).simd_into(self) }
3696    }
3697    #[inline(always)]
3698    fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
3699        i16x16 {
3700            val: unsafe { core::mem::transmute_copy(&val) },
3701            simd: self,
3702        }
3703    }
3704    #[inline(always)]
3705    fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
3706        i16x16 {
3707            val: unsafe { core::mem::transmute_copy(val) },
3708            simd: self,
3709        }
3710    }
3711    #[inline(always)]
3712    fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
3713        unsafe { core::mem::transmute::<__m256i, [i16; 16usize]>(a.val.0) }
3714    }
3715    #[inline(always)]
3716    fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
3717        unsafe { core::mem::transmute::<&__m256i, &[i16; 16usize]>(&a.val.0) }
3718    }
3719    #[inline(always)]
3720    fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
3721        unsafe { core::mem::transmute::<&mut __m256i, &mut [i16; 16usize]>(&mut a.val.0) }
3722    }
3723    #[inline(always)]
3724    fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
3725        unsafe {
3726            core::ptr::copy_nonoverlapping(
3727                (&raw const a.val.0) as *const i16,
3728                dest.as_mut_ptr(),
3729                16usize,
3730            );
3731        }
3732    }
3733    #[inline(always)]
3734    fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
3735        unsafe {
3736            i16x16 {
3737                val: core::mem::transmute(a.val),
3738                simd: self,
3739            }
3740        }
3741    }
3742    #[inline(always)]
3743    fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
3744        unsafe {
3745            u8x32 {
3746                val: core::mem::transmute(a.val),
3747                simd: self,
3748            }
3749        }
3750    }
3751    #[inline(always)]
3752    fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3753        unsafe {
3754            if SHIFT >= 16usize {
3755                return b;
3756            }
3757            let result = cross_block_alignr_256x1(
3758                self.cvt_to_bytes_i16x16(b).val.0,
3759                self.cvt_to_bytes_i16x16(a).val.0,
3760                SHIFT * 2usize,
3761            );
3762            self.cvt_from_bytes_i16x16(u8x32 {
3763                val: crate::support::Aligned256(result),
3764                simd: self,
3765            })
3766        }
3767    }
3768    #[inline(always)]
3769    fn slide_within_blocks_i16x16<const SHIFT: usize>(
3770        self,
3771        a: i16x16<Self>,
3772        b: i16x16<Self>,
3773    ) -> i16x16<Self> {
3774        unsafe {
3775            if SHIFT >= 8usize {
3776                return b;
3777            }
3778            let result = dyn_alignr_256(
3779                self.cvt_to_bytes_i16x16(b).val.0,
3780                self.cvt_to_bytes_i16x16(a).val.0,
3781                SHIFT * 2usize,
3782            );
3783            self.cvt_from_bytes_i16x16(u8x32 {
3784                val: crate::support::Aligned256(result),
3785                simd: self,
3786            })
3787        }
3788    }
3789    #[inline(always)]
3790    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3791        unsafe { _mm256_add_epi16(a.into(), b.into()).simd_into(self) }
3792    }
3793    #[inline(always)]
3794    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3795        unsafe { _mm256_sub_epi16(a.into(), b.into()).simd_into(self) }
3796    }
3797    #[inline(always)]
3798    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3799        unsafe { _mm256_mullo_epi16(a.into(), b.into()).simd_into(self) }
3800    }
3801    #[inline(always)]
3802    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3803        unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
3804    }
3805    #[inline(always)]
3806    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3807        unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
3808    }
3809    #[inline(always)]
3810    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3811        unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
3812    }
3813    #[inline(always)]
3814    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
3815        a ^ !0
3816    }
3817    #[inline(always)]
3818    fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
3819        unsafe {
3820            _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
3821        }
3822    }
3823    #[inline(always)]
3824    fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3825        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
3826    }
3827    #[inline(always)]
3828    fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
3829        unsafe {
3830            _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
3831        }
3832    }
3833    #[inline(always)]
3834    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3835        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
3836    }
3837    #[inline(always)]
3838    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3839        unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
3840    }
3841    #[inline(always)]
3842    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3843        unsafe { _mm256_cmpgt_epi16(b.into(), a.into()).simd_into(self) }
3844    }
3845    #[inline(always)]
3846    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3847        unsafe {
3848            _mm256_cmpeq_epi16(_mm256_min_epi16(a.into(), b.into()), a.into()).simd_into(self)
3849        }
3850    }
3851    #[inline(always)]
3852    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3853        unsafe {
3854            _mm256_cmpeq_epi16(_mm256_max_epi16(a.into(), b.into()), a.into()).simd_into(self)
3855        }
3856    }
3857    #[inline(always)]
3858    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3859        unsafe { _mm256_cmpgt_epi16(a.into(), b.into()).simd_into(self) }
3860    }
3861    #[inline(always)]
3862    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3863        unsafe {
3864            let lo = _mm256_unpacklo_epi16(a.into(), b.into());
3865            let hi = _mm256_unpackhi_epi16(a.into(), b.into());
3866            _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self)
3867        }
3868    }
3869    #[inline(always)]
3870    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3871        unsafe {
3872            let lo = _mm256_unpacklo_epi16(a.into(), b.into());
3873            let hi = _mm256_unpackhi_epi16(a.into(), b.into());
3874            _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self)
3875        }
3876    }
3877    #[inline(always)]
3878    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3879        unsafe {
3880            let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3881                a.into(),
3882                _mm256_setr_epi8(
3883                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
3884                    2, 3, 6, 7, 10, 11, 14, 15,
3885                ),
3886            ));
3887            let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3888                b.into(),
3889                _mm256_setr_epi8(
3890                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
3891                    2, 3, 6, 7, 10, 11, 14, 15,
3892                ),
3893            ));
3894            _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self)
3895        }
3896    }
3897    #[inline(always)]
3898    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3899        unsafe {
3900            let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3901                a.into(),
3902                _mm256_setr_epi8(
3903                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
3904                    2, 3, 6, 7, 10, 11, 14, 15,
3905                ),
3906            ));
3907            let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3908                b.into(),
3909                _mm256_setr_epi8(
3910                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
3911                    2, 3, 6, 7, 10, 11, 14, 15,
3912                ),
3913            ));
3914            _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self)
3915        }
3916    }
3917    #[inline(always)]
3918    fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
3919        unsafe {
3920            let lo = _mm256_unpacklo_epi16(a.into(), b.into());
3921            let hi = _mm256_unpackhi_epi16(a.into(), b.into());
3922            (
3923                _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self),
3924                _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self),
3925            )
3926        }
3927    }
3928    #[inline(always)]
3929    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
3930        unsafe {
3931            let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3932                a.into(),
3933                _mm256_setr_epi8(
3934                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
3935                    2, 3, 6, 7, 10, 11, 14, 15,
3936                ),
3937            ));
3938            let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3939                b.into(),
3940                _mm256_setr_epi8(
3941                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
3942                    2, 3, 6, 7, 10, 11, 14, 15,
3943                ),
3944            ));
3945            (
3946                _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self),
3947                _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self),
3948            )
3949        }
3950    }
3951    #[inline(always)]
3952    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
3953        unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
3954    }
3955    #[inline(always)]
3956    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3957        unsafe { _mm256_min_epi16(a.into(), b.into()).simd_into(self) }
3958    }
3959    #[inline(always)]
3960    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3961        unsafe { _mm256_max_epi16(a.into(), b.into()).simd_into(self) }
3962    }
3963    #[inline(always)]
3964    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
3965        i16x32 {
3966            val: crate::support::Aligned512([a.val.0, b.val.0]),
3967            simd: self,
3968        }
3969    }
3970    #[inline(always)]
3971    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
3972        unsafe {
3973            (
3974                _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
3975                _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
3976            )
3977        }
3978    }
3979    #[inline(always)]
3980    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
3981        unsafe { _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(self) }
3982    }
3983    #[inline(always)]
3984    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
3985        __m256i::from(a).simd_into(self)
3986    }
3987    #[inline(always)]
3988    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
3989        __m256i::from(a).simd_into(self)
3990    }
3991    #[inline(always)]
3992    fn splat_u16x16(self, val: u16) -> u16x16<Self> {
3993        unsafe { _mm256_set1_epi16(val.cast_signed()).simd_into(self) }
3994    }
3995    #[inline(always)]
3996    fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
3997        u16x16 {
3998            val: unsafe { core::mem::transmute_copy(&val) },
3999            simd: self,
4000        }
4001    }
4002    #[inline(always)]
4003    fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
4004        u16x16 {
4005            val: unsafe { core::mem::transmute_copy(val) },
4006            simd: self,
4007        }
4008    }
4009    #[inline(always)]
4010    fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
4011        unsafe { core::mem::transmute::<__m256i, [u16; 16usize]>(a.val.0) }
4012    }
4013    #[inline(always)]
4014    fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
4015        unsafe { core::mem::transmute::<&__m256i, &[u16; 16usize]>(&a.val.0) }
4016    }
4017    #[inline(always)]
4018    fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
4019        unsafe { core::mem::transmute::<&mut __m256i, &mut [u16; 16usize]>(&mut a.val.0) }
4020    }
4021    #[inline(always)]
4022    fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
4023        unsafe {
4024            core::ptr::copy_nonoverlapping(
4025                (&raw const a.val.0) as *const u16,
4026                dest.as_mut_ptr(),
4027                16usize,
4028            );
4029        }
4030    }
4031    #[inline(always)]
4032    fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
4033        unsafe {
4034            u16x16 {
4035                val: core::mem::transmute(a.val),
4036                simd: self,
4037            }
4038        }
4039    }
4040    #[inline(always)]
4041    fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
4042        unsafe {
4043            u8x32 {
4044                val: core::mem::transmute(a.val),
4045                simd: self,
4046            }
4047        }
4048    }
4049    #[inline(always)]
4050    fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4051        unsafe {
4052            if SHIFT >= 16usize {
4053                return b;
4054            }
4055            let result = cross_block_alignr_256x1(
4056                self.cvt_to_bytes_u16x16(b).val.0,
4057                self.cvt_to_bytes_u16x16(a).val.0,
4058                SHIFT * 2usize,
4059            );
4060            self.cvt_from_bytes_u16x16(u8x32 {
4061                val: crate::support::Aligned256(result),
4062                simd: self,
4063            })
4064        }
4065    }
4066    #[inline(always)]
4067    fn slide_within_blocks_u16x16<const SHIFT: usize>(
4068        self,
4069        a: u16x16<Self>,
4070        b: u16x16<Self>,
4071    ) -> u16x16<Self> {
4072        unsafe {
4073            if SHIFT >= 8usize {
4074                return b;
4075            }
4076            let result = dyn_alignr_256(
4077                self.cvt_to_bytes_u16x16(b).val.0,
4078                self.cvt_to_bytes_u16x16(a).val.0,
4079                SHIFT * 2usize,
4080            );
4081            self.cvt_from_bytes_u16x16(u8x32 {
4082                val: crate::support::Aligned256(result),
4083                simd: self,
4084            })
4085        }
4086    }
4087    #[inline(always)]
4088    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4089        unsafe { _mm256_add_epi16(a.into(), b.into()).simd_into(self) }
4090    }
4091    #[inline(always)]
4092    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4093        unsafe { _mm256_sub_epi16(a.into(), b.into()).simd_into(self) }
4094    }
4095    #[inline(always)]
4096    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4097        unsafe { _mm256_mullo_epi16(a.into(), b.into()).simd_into(self) }
4098    }
4099    #[inline(always)]
4100    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4101        unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
4102    }
4103    #[inline(always)]
4104    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4105        unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
4106    }
4107    #[inline(always)]
4108    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4109        unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
4110    }
4111    #[inline(always)]
4112    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
4113        a ^ !0
4114    }
4115    #[inline(always)]
4116    fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
4117        unsafe {
4118            _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
4119        }
4120    }
4121    #[inline(always)]
4122    fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4123        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
4124    }
4125    #[inline(always)]
4126    fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
4127        unsafe {
4128            _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
4129        }
4130    }
4131    #[inline(always)]
4132    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4133        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
4134    }
4135    #[inline(always)]
4136    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4137        unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
4138    }
4139    #[inline(always)]
4140    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4141        unsafe {
4142            let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed());
4143            let a_signed = _mm256_xor_si256(a.into(), sign_bit);
4144            let b_signed = _mm256_xor_si256(b.into(), sign_bit);
4145            _mm256_cmpgt_epi16(b_signed, a_signed).simd_into(self)
4146        }
4147    }
4148    #[inline(always)]
4149    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4150        unsafe {
4151            _mm256_cmpeq_epi16(_mm256_min_epu16(a.into(), b.into()), a.into()).simd_into(self)
4152        }
4153    }
4154    #[inline(always)]
4155    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4156        unsafe {
4157            _mm256_cmpeq_epi16(_mm256_max_epu16(a.into(), b.into()), a.into()).simd_into(self)
4158        }
4159    }
4160    #[inline(always)]
4161    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4162        unsafe {
4163            let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed());
4164            let a_signed = _mm256_xor_si256(a.into(), sign_bit);
4165            let b_signed = _mm256_xor_si256(b.into(), sign_bit);
4166            _mm256_cmpgt_epi16(a_signed, b_signed).simd_into(self)
4167        }
4168    }
4169    #[inline(always)]
4170    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4171        unsafe {
4172            let lo = _mm256_unpacklo_epi16(a.into(), b.into());
4173            let hi = _mm256_unpackhi_epi16(a.into(), b.into());
4174            _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self)
4175        }
4176    }
4177    #[inline(always)]
4178    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4179        unsafe {
4180            let lo = _mm256_unpacklo_epi16(a.into(), b.into());
4181            let hi = _mm256_unpackhi_epi16(a.into(), b.into());
4182            _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self)
4183        }
4184    }
4185    #[inline(always)]
4186    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4187        unsafe {
4188            let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
4189                a.into(),
4190                _mm256_setr_epi8(
4191                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
4192                    2, 3, 6, 7, 10, 11, 14, 15,
4193                ),
4194            ));
4195            let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
4196                b.into(),
4197                _mm256_setr_epi8(
4198                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
4199                    2, 3, 6, 7, 10, 11, 14, 15,
4200                ),
4201            ));
4202            _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self)
4203        }
4204    }
4205    #[inline(always)]
4206    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4207        unsafe {
4208            let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
4209                a.into(),
4210                _mm256_setr_epi8(
4211                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
4212                    2, 3, 6, 7, 10, 11, 14, 15,
4213                ),
4214            ));
4215            let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
4216                b.into(),
4217                _mm256_setr_epi8(
4218                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
4219                    2, 3, 6, 7, 10, 11, 14, 15,
4220                ),
4221            ));
4222            _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self)
4223        }
4224    }
4225    #[inline(always)]
4226    fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
4227        unsafe {
4228            let lo = _mm256_unpacklo_epi16(a.into(), b.into());
4229            let hi = _mm256_unpackhi_epi16(a.into(), b.into());
4230            (
4231                _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self),
4232                _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self),
4233            )
4234        }
4235    }
4236    #[inline(always)]
4237    fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
4238        unsafe {
4239            let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
4240                a.into(),
4241                _mm256_setr_epi8(
4242                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
4243                    2, 3, 6, 7, 10, 11, 14, 15,
4244                ),
4245            ));
4246            let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
4247                b.into(),
4248                _mm256_setr_epi8(
4249                    0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
4250                    2, 3, 6, 7, 10, 11, 14, 15,
4251                ),
4252            ));
4253            (
4254                _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self),
4255                _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self),
4256            )
4257        }
4258    }
4259    #[inline(always)]
4260    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
4261        unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
4262    }
4263    #[inline(always)]
4264    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4265        unsafe { _mm256_min_epu16(a.into(), b.into()).simd_into(self) }
4266    }
4267    #[inline(always)]
4268    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4269        unsafe { _mm256_max_epu16(a.into(), b.into()).simd_into(self) }
4270    }
4271    #[inline(always)]
4272    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
4273        u16x32 {
4274            val: crate::support::Aligned512([a.val.0, b.val.0]),
4275            simd: self,
4276        }
4277    }
4278    #[inline(always)]
4279    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
4280        unsafe {
4281            (
4282                _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
4283                _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
4284            )
4285        }
4286    }
4287    #[inline(always)]
4288    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
4289        unsafe {
4290            let mask = _mm256_setr_epi8(
4291                0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12,
4292                14, -1, -1, -1, -1, -1, -1, -1, -1,
4293            );
4294            let shuffled = _mm256_shuffle_epi8(a.into(), mask);
4295            let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(shuffled);
4296            _mm256_castsi256_si128(packed).simd_into(self)
4297        }
4298    }
4299    #[inline(always)]
4300    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
4301        __m256i::from(a).simd_into(self)
4302    }
4303    #[inline(always)]
4304    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
4305        __m256i::from(a).simd_into(self)
4306    }
4307    #[inline(always)]
4308    fn splat_mask16x16(self, val: i16) -> mask16x16<Self> {
4309        unsafe { _mm256_set1_epi16(val).simd_into(self) }
4310    }
4311    #[inline(always)]
4312    fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
4313        mask16x16 {
4314            val: unsafe { core::mem::transmute_copy(&val) },
4315            simd: self,
4316        }
4317    }
4318    #[inline(always)]
4319    fn load_array_ref_mask16x16(self, val: &[i16; 16usize]) -> mask16x16<Self> {
4320        mask16x16 {
4321            val: unsafe { core::mem::transmute_copy(val) },
4322            simd: self,
4323        }
4324    }
4325    #[inline(always)]
4326    fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
4327        unsafe { core::mem::transmute::<__m256i, [i16; 16usize]>(a.val.0) }
4328    }
4329    #[inline(always)]
4330    fn as_array_ref_mask16x16(self, a: &mask16x16<Self>) -> &[i16; 16usize] {
4331        unsafe { core::mem::transmute::<&__m256i, &[i16; 16usize]>(&a.val.0) }
4332    }
4333    #[inline(always)]
4334    fn as_array_mut_mask16x16(self, a: &mut mask16x16<Self>) -> &mut [i16; 16usize] {
4335        unsafe { core::mem::transmute::<&mut __m256i, &mut [i16; 16usize]>(&mut a.val.0) }
4336    }
4337    #[inline(always)]
4338    fn store_array_mask16x16(self, a: mask16x16<Self>, dest: &mut [i16; 16usize]) -> () {
4339        unsafe {
4340            core::ptr::copy_nonoverlapping(
4341                (&raw const a.val.0) as *const i16,
4342                dest.as_mut_ptr(),
4343                16usize,
4344            );
4345        }
4346    }
4347    #[inline(always)]
4348    fn cvt_from_bytes_mask16x16(self, a: u8x32<Self>) -> mask16x16<Self> {
4349        unsafe {
4350            mask16x16 {
4351                val: core::mem::transmute(a.val),
4352                simd: self,
4353            }
4354        }
4355    }
4356    #[inline(always)]
4357    fn cvt_to_bytes_mask16x16(self, a: mask16x16<Self>) -> u8x32<Self> {
4358        unsafe {
4359            u8x32 {
4360                val: core::mem::transmute(a.val),
4361                simd: self,
4362            }
4363        }
4364    }
4365    #[inline(always)]
4366    fn slide_mask16x16<const SHIFT: usize>(
4367        self,
4368        a: mask16x16<Self>,
4369        b: mask16x16<Self>,
4370    ) -> mask16x16<Self> {
4371        unsafe {
4372            if SHIFT >= 16usize {
4373                return b;
4374            }
4375            let result = cross_block_alignr_256x1(
4376                self.cvt_to_bytes_mask16x16(b).val.0,
4377                self.cvt_to_bytes_mask16x16(a).val.0,
4378                SHIFT * 2usize,
4379            );
4380            self.cvt_from_bytes_mask16x16(u8x32 {
4381                val: crate::support::Aligned256(result),
4382                simd: self,
4383            })
4384        }
4385    }
4386    #[inline(always)]
4387    fn slide_within_blocks_mask16x16<const SHIFT: usize>(
4388        self,
4389        a: mask16x16<Self>,
4390        b: mask16x16<Self>,
4391    ) -> mask16x16<Self> {
4392        unsafe {
4393            if SHIFT >= 8usize {
4394                return b;
4395            }
4396            let result = dyn_alignr_256(
4397                self.cvt_to_bytes_mask16x16(b).val.0,
4398                self.cvt_to_bytes_mask16x16(a).val.0,
4399                SHIFT * 2usize,
4400            );
4401            self.cvt_from_bytes_mask16x16(u8x32 {
4402                val: crate::support::Aligned256(result),
4403                simd: self,
4404            })
4405        }
4406    }
4407    #[inline(always)]
4408    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4409        unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
4410    }
4411    #[inline(always)]
4412    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4413        unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
4414    }
4415    #[inline(always)]
4416    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4417        unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
4418    }
4419    #[inline(always)]
4420    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
4421        a ^ !0
4422    }
4423    #[inline(always)]
4424    fn select_mask16x16(
4425        self,
4426        a: mask16x16<Self>,
4427        b: mask16x16<Self>,
4428        c: mask16x16<Self>,
4429    ) -> mask16x16<Self> {
4430        unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
4431    }
4432    #[inline(always)]
4433    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4434        unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
4435    }
4436    #[inline(always)]
4437    fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
4438        unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0 }
4439    }
4440    #[inline(always)]
4441    fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
4442        unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff }
4443    }
4444    #[inline(always)]
4445    fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
4446        unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff }
4447    }
4448    #[inline(always)]
4449    fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
4450        unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0 }
4451    }
4452    #[inline(always)]
4453    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
4454        mask16x32 {
4455            val: crate::support::Aligned512([a.val.0, b.val.0]),
4456            simd: self,
4457        }
4458    }
4459    #[inline(always)]
4460    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
4461        unsafe {
4462            (
4463                _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
4464                _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
4465            )
4466        }
4467    }
4468    #[inline(always)]
4469    fn splat_i32x8(self, val: i32) -> i32x8<Self> {
4470        unsafe { _mm256_set1_epi32(val).simd_into(self) }
4471    }
4472    #[inline(always)]
4473    fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
4474        i32x8 {
4475            val: unsafe { core::mem::transmute_copy(&val) },
4476            simd: self,
4477        }
4478    }
4479    #[inline(always)]
4480    fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
4481        i32x8 {
4482            val: unsafe { core::mem::transmute_copy(val) },
4483            simd: self,
4484        }
4485    }
4486    #[inline(always)]
4487    fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
4488        unsafe { core::mem::transmute::<__m256i, [i32; 8usize]>(a.val.0) }
4489    }
4490    #[inline(always)]
4491    fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
4492        unsafe { core::mem::transmute::<&__m256i, &[i32; 8usize]>(&a.val.0) }
4493    }
4494    #[inline(always)]
4495    fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
4496        unsafe { core::mem::transmute::<&mut __m256i, &mut [i32; 8usize]>(&mut a.val.0) }
4497    }
4498    #[inline(always)]
4499    fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
4500        unsafe {
4501            core::ptr::copy_nonoverlapping(
4502                (&raw const a.val.0) as *const i32,
4503                dest.as_mut_ptr(),
4504                8usize,
4505            );
4506        }
4507    }
4508    #[inline(always)]
4509    fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
4510        unsafe {
4511            i32x8 {
4512                val: core::mem::transmute(a.val),
4513                simd: self,
4514            }
4515        }
4516    }
4517    #[inline(always)]
4518    fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
4519        unsafe {
4520            u8x32 {
4521                val: core::mem::transmute(a.val),
4522                simd: self,
4523            }
4524        }
4525    }
4526    #[inline(always)]
4527    fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4528        unsafe {
4529            if SHIFT >= 8usize {
4530                return b;
4531            }
4532            let result = cross_block_alignr_256x1(
4533                self.cvt_to_bytes_i32x8(b).val.0,
4534                self.cvt_to_bytes_i32x8(a).val.0,
4535                SHIFT * 4usize,
4536            );
4537            self.cvt_from_bytes_i32x8(u8x32 {
4538                val: crate::support::Aligned256(result),
4539                simd: self,
4540            })
4541        }
4542    }
4543    #[inline(always)]
4544    fn slide_within_blocks_i32x8<const SHIFT: usize>(
4545        self,
4546        a: i32x8<Self>,
4547        b: i32x8<Self>,
4548    ) -> i32x8<Self> {
4549        unsafe {
4550            if SHIFT >= 4usize {
4551                return b;
4552            }
4553            let result = dyn_alignr_256(
4554                self.cvt_to_bytes_i32x8(b).val.0,
4555                self.cvt_to_bytes_i32x8(a).val.0,
4556                SHIFT * 4usize,
4557            );
4558            self.cvt_from_bytes_i32x8(u8x32 {
4559                val: crate::support::Aligned256(result),
4560                simd: self,
4561            })
4562        }
4563    }
4564    #[inline(always)]
4565    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4566        unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) }
4567    }
4568    #[inline(always)]
4569    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4570        unsafe { _mm256_sub_epi32(a.into(), b.into()).simd_into(self) }
4571    }
4572    #[inline(always)]
4573    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4574        unsafe { _mm256_mullo_epi32(a.into(), b.into()).simd_into(self) }
4575    }
4576    #[inline(always)]
4577    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4578        unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
4579    }
4580    #[inline(always)]
4581    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4582        unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
4583    }
4584    #[inline(always)]
4585    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4586        unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
4587    }
4588    #[inline(always)]
4589    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
4590        a ^ !0
4591    }
4592    #[inline(always)]
4593    fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
4594        unsafe {
4595            _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
4596        }
4597    }
4598    #[inline(always)]
4599    fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4600        unsafe { _mm256_sllv_epi32(a.into(), b.into()).simd_into(self) }
4601    }
4602    #[inline(always)]
4603    fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
4604        unsafe {
4605            _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
4606        }
4607    }
4608    #[inline(always)]
4609    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4610        unsafe { _mm256_srav_epi32(a.into(), b.into()).simd_into(self) }
4611    }
4612    #[inline(always)]
4613    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4614        unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
4615    }
4616    #[inline(always)]
4617    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4618        unsafe { _mm256_cmpgt_epi32(b.into(), a.into()).simd_into(self) }
4619    }
4620    #[inline(always)]
4621    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4622        unsafe {
4623            _mm256_cmpeq_epi32(_mm256_min_epi32(a.into(), b.into()), a.into()).simd_into(self)
4624        }
4625    }
4626    #[inline(always)]
4627    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4628        unsafe {
4629            _mm256_cmpeq_epi32(_mm256_max_epi32(a.into(), b.into()), a.into()).simd_into(self)
4630        }
4631    }
4632    #[inline(always)]
4633    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4634        unsafe { _mm256_cmpgt_epi32(a.into(), b.into()).simd_into(self) }
4635    }
4636    #[inline(always)]
4637    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4638        unsafe {
4639            let lo = _mm256_unpacklo_epi32(a.into(), b.into());
4640            let hi = _mm256_unpackhi_epi32(a.into(), b.into());
4641            _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self)
4642        }
4643    }
4644    #[inline(always)]
4645    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4646        unsafe {
4647            let lo = _mm256_unpacklo_epi32(a.into(), b.into());
4648            let hi = _mm256_unpackhi_epi32(a.into(), b.into());
4649            _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self)
4650        }
4651    }
4652    #[inline(always)]
4653    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4654        unsafe {
4655            let t1 =
4656                _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4657            let t2 =
4658                _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4659            _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self)
4660        }
4661    }
4662    #[inline(always)]
4663    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4664        unsafe {
4665            let t1 =
4666                _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4667            let t2 =
4668                _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4669            _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self)
4670        }
4671    }
4672    #[inline(always)]
4673    fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
4674        unsafe {
4675            let lo = _mm256_unpacklo_epi32(a.into(), b.into());
4676            let hi = _mm256_unpackhi_epi32(a.into(), b.into());
4677            (
4678                _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self),
4679                _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self),
4680            )
4681        }
4682    }
4683    #[inline(always)]
4684    fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
4685        unsafe {
4686            let t1 =
4687                _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4688            let t2 =
4689                _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4690            (
4691                _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self),
4692                _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self),
4693            )
4694        }
4695    }
4696    #[inline(always)]
4697    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
4698        unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
4699    }
4700    #[inline(always)]
4701    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4702        unsafe { _mm256_min_epi32(a.into(), b.into()).simd_into(self) }
4703    }
4704    #[inline(always)]
4705    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4706        unsafe { _mm256_max_epi32(a.into(), b.into()).simd_into(self) }
4707    }
4708    #[inline(always)]
4709    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
4710        i32x16 {
4711            val: crate::support::Aligned512([a.val.0, b.val.0]),
4712            simd: self,
4713        }
4714    }
4715    #[inline(always)]
4716    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
4717        unsafe {
4718            (
4719                _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
4720                _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
4721            )
4722        }
4723    }
4724    #[inline(always)]
4725    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
4726        unsafe { _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(self) }
4727    }
4728    #[inline(always)]
4729    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
4730        __m256i::from(a).simd_into(self)
4731    }
4732    #[inline(always)]
4733    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
4734        __m256i::from(a).simd_into(self)
4735    }
4736    #[inline(always)]
4737    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
4738        unsafe { _mm256_cvtepi32_ps(a.into()).simd_into(self) }
4739    }
4740    #[inline(always)]
4741    fn splat_u32x8(self, val: u32) -> u32x8<Self> {
4742        unsafe { _mm256_set1_epi32(val.cast_signed()).simd_into(self) }
4743    }
4744    #[inline(always)]
4745    fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
4746        u32x8 {
4747            val: unsafe { core::mem::transmute_copy(&val) },
4748            simd: self,
4749        }
4750    }
4751    #[inline(always)]
4752    fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
4753        u32x8 {
4754            val: unsafe { core::mem::transmute_copy(val) },
4755            simd: self,
4756        }
4757    }
4758    #[inline(always)]
4759    fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
4760        unsafe { core::mem::transmute::<__m256i, [u32; 8usize]>(a.val.0) }
4761    }
4762    #[inline(always)]
4763    fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
4764        unsafe { core::mem::transmute::<&__m256i, &[u32; 8usize]>(&a.val.0) }
4765    }
4766    #[inline(always)]
4767    fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
4768        unsafe { core::mem::transmute::<&mut __m256i, &mut [u32; 8usize]>(&mut a.val.0) }
4769    }
4770    #[inline(always)]
4771    fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
4772        unsafe {
4773            core::ptr::copy_nonoverlapping(
4774                (&raw const a.val.0) as *const u32,
4775                dest.as_mut_ptr(),
4776                8usize,
4777            );
4778        }
4779    }
4780    #[inline(always)]
4781    fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
4782        unsafe {
4783            u32x8 {
4784                val: core::mem::transmute(a.val),
4785                simd: self,
4786            }
4787        }
4788    }
4789    #[inline(always)]
4790    fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
4791        unsafe {
4792            u8x32 {
4793                val: core::mem::transmute(a.val),
4794                simd: self,
4795            }
4796        }
4797    }
4798    #[inline(always)]
4799    fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4800        unsafe {
4801            if SHIFT >= 8usize {
4802                return b;
4803            }
4804            let result = cross_block_alignr_256x1(
4805                self.cvt_to_bytes_u32x8(b).val.0,
4806                self.cvt_to_bytes_u32x8(a).val.0,
4807                SHIFT * 4usize,
4808            );
4809            self.cvt_from_bytes_u32x8(u8x32 {
4810                val: crate::support::Aligned256(result),
4811                simd: self,
4812            })
4813        }
4814    }
4815    #[inline(always)]
4816    fn slide_within_blocks_u32x8<const SHIFT: usize>(
4817        self,
4818        a: u32x8<Self>,
4819        b: u32x8<Self>,
4820    ) -> u32x8<Self> {
4821        unsafe {
4822            if SHIFT >= 4usize {
4823                return b;
4824            }
4825            let result = dyn_alignr_256(
4826                self.cvt_to_bytes_u32x8(b).val.0,
4827                self.cvt_to_bytes_u32x8(a).val.0,
4828                SHIFT * 4usize,
4829            );
4830            self.cvt_from_bytes_u32x8(u8x32 {
4831                val: crate::support::Aligned256(result),
4832                simd: self,
4833            })
4834        }
4835    }
4836    #[inline(always)]
4837    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4838        unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) }
4839    }
4840    #[inline(always)]
4841    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4842        unsafe { _mm256_sub_epi32(a.into(), b.into()).simd_into(self) }
4843    }
4844    #[inline(always)]
4845    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4846        unsafe { _mm256_mullo_epi32(a.into(), b.into()).simd_into(self) }
4847    }
4848    #[inline(always)]
4849    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4850        unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
4851    }
4852    #[inline(always)]
4853    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4854        unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
4855    }
4856    #[inline(always)]
4857    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4858        unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
4859    }
4860    #[inline(always)]
4861    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
4862        a ^ !0
4863    }
4864    #[inline(always)]
4865    fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
4866        unsafe {
4867            _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
4868        }
4869    }
4870    #[inline(always)]
4871    fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4872        unsafe { _mm256_sllv_epi32(a.into(), b.into()).simd_into(self) }
4873    }
4874    #[inline(always)]
4875    fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
4876        unsafe {
4877            _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
4878        }
4879    }
4880    #[inline(always)]
4881    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4882        unsafe { _mm256_srlv_epi32(a.into(), b.into()).simd_into(self) }
4883    }
4884    #[inline(always)]
4885    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4886        unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
4887    }
4888    #[inline(always)]
4889    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4890        unsafe {
4891            let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed());
4892            let a_signed = _mm256_xor_si256(a.into(), sign_bit);
4893            let b_signed = _mm256_xor_si256(b.into(), sign_bit);
4894            _mm256_cmpgt_epi32(b_signed, a_signed).simd_into(self)
4895        }
4896    }
4897    #[inline(always)]
4898    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4899        unsafe {
4900            _mm256_cmpeq_epi32(_mm256_min_epu32(a.into(), b.into()), a.into()).simd_into(self)
4901        }
4902    }
4903    #[inline(always)]
4904    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4905        unsafe {
4906            _mm256_cmpeq_epi32(_mm256_max_epu32(a.into(), b.into()), a.into()).simd_into(self)
4907        }
4908    }
4909    #[inline(always)]
4910    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4911        unsafe {
4912            let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed());
4913            let a_signed = _mm256_xor_si256(a.into(), sign_bit);
4914            let b_signed = _mm256_xor_si256(b.into(), sign_bit);
4915            _mm256_cmpgt_epi32(a_signed, b_signed).simd_into(self)
4916        }
4917    }
4918    #[inline(always)]
4919    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4920        unsafe {
4921            let lo = _mm256_unpacklo_epi32(a.into(), b.into());
4922            let hi = _mm256_unpackhi_epi32(a.into(), b.into());
4923            _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self)
4924        }
4925    }
4926    #[inline(always)]
4927    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4928        unsafe {
4929            let lo = _mm256_unpacklo_epi32(a.into(), b.into());
4930            let hi = _mm256_unpackhi_epi32(a.into(), b.into());
4931            _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self)
4932        }
4933    }
4934    #[inline(always)]
4935    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4936        unsafe {
4937            let t1 =
4938                _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4939            let t2 =
4940                _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4941            _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self)
4942        }
4943    }
4944    #[inline(always)]
4945    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4946        unsafe {
4947            let t1 =
4948                _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4949            let t2 =
4950                _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4951            _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self)
4952        }
4953    }
4954    #[inline(always)]
4955    fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
4956        unsafe {
4957            let lo = _mm256_unpacklo_epi32(a.into(), b.into());
4958            let hi = _mm256_unpackhi_epi32(a.into(), b.into());
4959            (
4960                _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self),
4961                _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self),
4962            )
4963        }
4964    }
4965    #[inline(always)]
4966    fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
4967        unsafe {
4968            let t1 =
4969                _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4970            let t2 =
4971                _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4972            (
4973                _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self),
4974                _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self),
4975            )
4976        }
4977    }
4978    #[inline(always)]
4979    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
4980        unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
4981    }
4982    #[inline(always)]
4983    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4984        unsafe { _mm256_min_epu32(a.into(), b.into()).simd_into(self) }
4985    }
4986    #[inline(always)]
4987    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4988        unsafe { _mm256_max_epu32(a.into(), b.into()).simd_into(self) }
4989    }
4990    #[inline(always)]
4991    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
4992        u32x16 {
4993            val: crate::support::Aligned512([a.val.0, b.val.0]),
4994            simd: self,
4995        }
4996    }
4997    #[inline(always)]
4998    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
4999        unsafe {
5000            (
5001                _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
5002                _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
5003            )
5004        }
5005    }
5006    #[inline(always)]
5007    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
5008        __m256i::from(a).simd_into(self)
5009    }
5010    #[inline(always)]
5011    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
5012        unsafe {
5013            let a = a.into();
5014            let lo = _mm256_blend_epi16::<0xAA>(a, _mm256_set1_epi32(0x4B000000));
5015            let hi = _mm256_blend_epi16::<0xAA>(
5016                _mm256_srli_epi32::<16>(a),
5017                _mm256_set1_epi32(0x53000000),
5018            );
5019            let fhi = _mm256_sub_ps(
5020                _mm256_castsi256_ps(hi),
5021                _mm256_set1_ps(f32::from_bits(0x53000080)),
5022            );
5023            let result = _mm256_add_ps(_mm256_castsi256_ps(lo), fhi);
5024            result.simd_into(self)
5025        }
5026    }
5027    #[inline(always)]
5028    fn splat_mask32x8(self, val: i32) -> mask32x8<Self> {
5029        unsafe { _mm256_set1_epi32(val).simd_into(self) }
5030    }
5031    #[inline(always)]
5032    fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
5033        mask32x8 {
5034            val: unsafe { core::mem::transmute_copy(&val) },
5035            simd: self,
5036        }
5037    }
5038    #[inline(always)]
5039    fn load_array_ref_mask32x8(self, val: &[i32; 8usize]) -> mask32x8<Self> {
5040        mask32x8 {
5041            val: unsafe { core::mem::transmute_copy(val) },
5042            simd: self,
5043        }
5044    }
5045    #[inline(always)]
5046    fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
5047        unsafe { core::mem::transmute::<__m256i, [i32; 8usize]>(a.val.0) }
5048    }
5049    #[inline(always)]
5050    fn as_array_ref_mask32x8(self, a: &mask32x8<Self>) -> &[i32; 8usize] {
5051        unsafe { core::mem::transmute::<&__m256i, &[i32; 8usize]>(&a.val.0) }
5052    }
5053    #[inline(always)]
5054    fn as_array_mut_mask32x8(self, a: &mut mask32x8<Self>) -> &mut [i32; 8usize] {
5055        unsafe { core::mem::transmute::<&mut __m256i, &mut [i32; 8usize]>(&mut a.val.0) }
5056    }
5057    #[inline(always)]
5058    fn store_array_mask32x8(self, a: mask32x8<Self>, dest: &mut [i32; 8usize]) -> () {
5059        unsafe {
5060            core::ptr::copy_nonoverlapping(
5061                (&raw const a.val.0) as *const i32,
5062                dest.as_mut_ptr(),
5063                8usize,
5064            );
5065        }
5066    }
5067    #[inline(always)]
5068    fn cvt_from_bytes_mask32x8(self, a: u8x32<Self>) -> mask32x8<Self> {
5069        unsafe {
5070            mask32x8 {
5071                val: core::mem::transmute(a.val),
5072                simd: self,
5073            }
5074        }
5075    }
5076    #[inline(always)]
5077    fn cvt_to_bytes_mask32x8(self, a: mask32x8<Self>) -> u8x32<Self> {
5078        unsafe {
5079            u8x32 {
5080                val: core::mem::transmute(a.val),
5081                simd: self,
5082            }
5083        }
5084    }
5085    #[inline(always)]
5086    fn slide_mask32x8<const SHIFT: usize>(
5087        self,
5088        a: mask32x8<Self>,
5089        b: mask32x8<Self>,
5090    ) -> mask32x8<Self> {
5091        unsafe {
5092            if SHIFT >= 8usize {
5093                return b;
5094            }
5095            let result = cross_block_alignr_256x1(
5096                self.cvt_to_bytes_mask32x8(b).val.0,
5097                self.cvt_to_bytes_mask32x8(a).val.0,
5098                SHIFT * 4usize,
5099            );
5100            self.cvt_from_bytes_mask32x8(u8x32 {
5101                val: crate::support::Aligned256(result),
5102                simd: self,
5103            })
5104        }
5105    }
5106    #[inline(always)]
5107    fn slide_within_blocks_mask32x8<const SHIFT: usize>(
5108        self,
5109        a: mask32x8<Self>,
5110        b: mask32x8<Self>,
5111    ) -> mask32x8<Self> {
5112        unsafe {
5113            if SHIFT >= 4usize {
5114                return b;
5115            }
5116            let result = dyn_alignr_256(
5117                self.cvt_to_bytes_mask32x8(b).val.0,
5118                self.cvt_to_bytes_mask32x8(a).val.0,
5119                SHIFT * 4usize,
5120            );
5121            self.cvt_from_bytes_mask32x8(u8x32 {
5122                val: crate::support::Aligned256(result),
5123                simd: self,
5124            })
5125        }
5126    }
5127    #[inline(always)]
5128    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5129        unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
5130    }
5131    #[inline(always)]
5132    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5133        unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
5134    }
5135    #[inline(always)]
5136    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5137        unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
5138    }
5139    #[inline(always)]
5140    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
5141        a ^ !0
5142    }
5143    #[inline(always)]
5144    fn select_mask32x8(
5145        self,
5146        a: mask32x8<Self>,
5147        b: mask32x8<Self>,
5148        c: mask32x8<Self>,
5149    ) -> mask32x8<Self> {
5150        unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
5151    }
5152    #[inline(always)]
5153    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5154        unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
5155    }
5156    #[inline(always)]
5157    fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
5158        unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0 }
5159    }
5160    #[inline(always)]
5161    fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
5162        unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0b11111111 }
5163    }
5164    #[inline(always)]
5165    fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
5166        unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0b11111111 }
5167    }
5168    #[inline(always)]
5169    fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
5170        unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0 }
5171    }
5172    #[inline(always)]
5173    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
5174        mask32x16 {
5175            val: crate::support::Aligned512([a.val.0, b.val.0]),
5176            simd: self,
5177        }
5178    }
5179    #[inline(always)]
5180    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
5181        unsafe {
5182            (
5183                _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
5184                _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
5185            )
5186        }
5187    }
5188    #[inline(always)]
5189    fn splat_f64x4(self, val: f64) -> f64x4<Self> {
5190        unsafe { _mm256_set1_pd(val).simd_into(self) }
5191    }
5192    #[inline(always)]
5193    fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
5194        f64x4 {
5195            val: unsafe { core::mem::transmute_copy(&val) },
5196            simd: self,
5197        }
5198    }
5199    #[inline(always)]
5200    fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
5201        f64x4 {
5202            val: unsafe { core::mem::transmute_copy(val) },
5203            simd: self,
5204        }
5205    }
5206    #[inline(always)]
5207    fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
5208        unsafe { core::mem::transmute::<__m256d, [f64; 4usize]>(a.val.0) }
5209    }
5210    #[inline(always)]
5211    fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
5212        unsafe { core::mem::transmute::<&__m256d, &[f64; 4usize]>(&a.val.0) }
5213    }
5214    #[inline(always)]
5215    fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
5216        unsafe { core::mem::transmute::<&mut __m256d, &mut [f64; 4usize]>(&mut a.val.0) }
5217    }
5218    #[inline(always)]
5219    fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
5220        unsafe {
5221            core::ptr::copy_nonoverlapping(
5222                (&raw const a.val.0) as *const f64,
5223                dest.as_mut_ptr(),
5224                4usize,
5225            );
5226        }
5227    }
5228    #[inline(always)]
5229    fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
5230        unsafe {
5231            f64x4 {
5232                val: core::mem::transmute(a.val),
5233                simd: self,
5234            }
5235        }
5236    }
5237    #[inline(always)]
5238    fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
5239        unsafe {
5240            u8x32 {
5241                val: core::mem::transmute(a.val),
5242                simd: self,
5243            }
5244        }
5245    }
5246    #[inline(always)]
5247    fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5248        unsafe {
5249            if SHIFT >= 4usize {
5250                return b;
5251            }
5252            let result = cross_block_alignr_256x1(
5253                self.cvt_to_bytes_f64x4(b).val.0,
5254                self.cvt_to_bytes_f64x4(a).val.0,
5255                SHIFT * 8usize,
5256            );
5257            self.cvt_from_bytes_f64x4(u8x32 {
5258                val: crate::support::Aligned256(result),
5259                simd: self,
5260            })
5261        }
5262    }
5263    #[inline(always)]
5264    fn slide_within_blocks_f64x4<const SHIFT: usize>(
5265        self,
5266        a: f64x4<Self>,
5267        b: f64x4<Self>,
5268    ) -> f64x4<Self> {
5269        unsafe {
5270            if SHIFT >= 2usize {
5271                return b;
5272            }
5273            let result = dyn_alignr_256(
5274                self.cvt_to_bytes_f64x4(b).val.0,
5275                self.cvt_to_bytes_f64x4(a).val.0,
5276                SHIFT * 8usize,
5277            );
5278            self.cvt_from_bytes_f64x4(u8x32 {
5279                val: crate::support::Aligned256(result),
5280                simd: self,
5281            })
5282        }
5283    }
5284    #[inline(always)]
5285    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5286        unsafe { _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(self) }
5287    }
5288    #[inline(always)]
5289    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5290        unsafe { _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(self) }
5291    }
5292    #[inline(always)]
5293    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5294        unsafe { _mm256_sqrt_pd(a.into()).simd_into(self) }
5295    }
5296    #[inline(always)]
5297    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5298        unsafe { _mm256_add_pd(a.into(), b.into()).simd_into(self) }
5299    }
5300    #[inline(always)]
5301    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5302        unsafe { _mm256_sub_pd(a.into(), b.into()).simd_into(self) }
5303    }
5304    #[inline(always)]
5305    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5306        unsafe { _mm256_mul_pd(a.into(), b.into()).simd_into(self) }
5307    }
5308    #[inline(always)]
5309    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5310        unsafe { _mm256_div_pd(a.into(), b.into()).simd_into(self) }
5311    }
5312    #[inline(always)]
5313    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5314        unsafe {
5315            let mask = _mm256_set1_pd(-0.0);
5316            _mm256_or_pd(
5317                _mm256_and_pd(mask, b.into()),
5318                _mm256_andnot_pd(mask, a.into()),
5319            )
5320            .simd_into(self)
5321        }
5322    }
5323    #[inline(always)]
5324    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5325        unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<0i32>(a.into(), b.into())).simd_into(self) }
5326    }
5327    #[inline(always)]
5328    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5329        unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<17i32>(a.into(), b.into())).simd_into(self) }
5330    }
5331    #[inline(always)]
5332    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5333        unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<18i32>(a.into(), b.into())).simd_into(self) }
5334    }
5335    #[inline(always)]
5336    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5337        unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<29i32>(a.into(), b.into())).simd_into(self) }
5338    }
5339    #[inline(always)]
5340    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5341        unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<30i32>(a.into(), b.into())).simd_into(self) }
5342    }
5343    #[inline(always)]
5344    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5345        unsafe {
5346            let lo = _mm256_unpacklo_pd(a.into(), b.into());
5347            let hi = _mm256_unpackhi_pd(a.into(), b.into());
5348            _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(self)
5349        }
5350    }
5351    #[inline(always)]
5352    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5353        unsafe {
5354            let lo = _mm256_unpacklo_pd(a.into(), b.into());
5355            let hi = _mm256_unpackhi_pd(a.into(), b.into());
5356            _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(self)
5357        }
5358    }
5359    #[inline(always)]
5360    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5361        unsafe {
5362            let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into());
5363            let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into());
5364            _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(self)
5365        }
5366    }
5367    #[inline(always)]
5368    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5369        unsafe {
5370            let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into());
5371            let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into());
5372            _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(self)
5373        }
5374    }
5375    #[inline(always)]
5376    fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
5377        unsafe {
5378            let lo = _mm256_unpacklo_pd(a.into(), b.into());
5379            let hi = _mm256_unpackhi_pd(a.into(), b.into());
5380            (
5381                _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(self),
5382                _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(self),
5383            )
5384        }
5385    }
5386    #[inline(always)]
5387    fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
5388        unsafe {
5389            let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into());
5390            let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into());
5391            (
5392                _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(self),
5393                _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(self),
5394            )
5395        }
5396    }
5397    #[inline(always)]
5398    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5399        unsafe { _mm256_max_pd(a.into(), b.into()).simd_into(self) }
5400    }
5401    #[inline(always)]
5402    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5403        unsafe { _mm256_min_pd(a.into(), b.into()).simd_into(self) }
5404    }
5405    #[inline(always)]
5406    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5407        unsafe {
5408            let intermediate = _mm256_max_pd(a.into(), b.into());
5409            let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into());
5410            _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self)
5411        }
5412    }
5413    #[inline(always)]
5414    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5415        unsafe {
5416            let intermediate = _mm256_min_pd(a.into(), b.into());
5417            let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into());
5418            _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self)
5419        }
5420    }
5421    #[inline(always)]
5422    fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
5423        unsafe { _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) }
5424    }
5425    #[inline(always)]
5426    fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
5427        unsafe { _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) }
5428    }
5429    #[inline(always)]
5430    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5431        unsafe {
5432            _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
5433                .simd_into(self)
5434        }
5435    }
5436    #[inline(always)]
5437    fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5438        unsafe {
5439            _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
5440                .simd_into(self)
5441        }
5442    }
5443    #[inline(always)]
5444    fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5445        unsafe {
5446            _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
5447                .simd_into(self)
5448        }
5449    }
5450    #[inline(always)]
5451    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5452        a - self.trunc_f64x4(a)
5453    }
5454    #[inline(always)]
5455    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5456        unsafe {
5457            _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
5458        }
5459    }
5460    #[inline(always)]
5461    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
5462        unsafe {
5463            _mm256_blendv_pd(c.into(), b.into(), _mm256_castsi256_pd(a.into())).simd_into(self)
5464        }
5465    }
5466    #[inline(always)]
5467    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
5468        f64x8 {
5469            val: crate::support::Aligned512([a.val.0, b.val.0]),
5470            simd: self,
5471        }
5472    }
5473    #[inline(always)]
5474    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
5475        unsafe {
5476            (
5477                _mm256_extractf128_pd::<0>(a.into()).simd_into(self),
5478                _mm256_extractf128_pd::<1>(a.into()).simd_into(self),
5479            )
5480        }
5481    }
5482    #[inline(always)]
5483    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
5484        unsafe { _mm256_castpd_ps(a.into()).simd_into(self) }
5485    }
5486    #[inline(always)]
5487    fn splat_mask64x4(self, val: i64) -> mask64x4<Self> {
5488        unsafe { _mm256_set1_epi64x(val).simd_into(self) }
5489    }
5490    #[inline(always)]
5491    fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
5492        mask64x4 {
5493            val: unsafe { core::mem::transmute_copy(&val) },
5494            simd: self,
5495        }
5496    }
5497    #[inline(always)]
5498    fn load_array_ref_mask64x4(self, val: &[i64; 4usize]) -> mask64x4<Self> {
5499        mask64x4 {
5500            val: unsafe { core::mem::transmute_copy(val) },
5501            simd: self,
5502        }
5503    }
5504    #[inline(always)]
5505    fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
5506        unsafe { core::mem::transmute::<__m256i, [i64; 4usize]>(a.val.0) }
5507    }
5508    #[inline(always)]
5509    fn as_array_ref_mask64x4(self, a: &mask64x4<Self>) -> &[i64; 4usize] {
5510        unsafe { core::mem::transmute::<&__m256i, &[i64; 4usize]>(&a.val.0) }
5511    }
5512    #[inline(always)]
5513    fn as_array_mut_mask64x4(self, a: &mut mask64x4<Self>) -> &mut [i64; 4usize] {
5514        unsafe { core::mem::transmute::<&mut __m256i, &mut [i64; 4usize]>(&mut a.val.0) }
5515    }
5516    #[inline(always)]
5517    fn store_array_mask64x4(self, a: mask64x4<Self>, dest: &mut [i64; 4usize]) -> () {
5518        unsafe {
5519            core::ptr::copy_nonoverlapping(
5520                (&raw const a.val.0) as *const i64,
5521                dest.as_mut_ptr(),
5522                4usize,
5523            );
5524        }
5525    }
5526    #[inline(always)]
5527    fn cvt_from_bytes_mask64x4(self, a: u8x32<Self>) -> mask64x4<Self> {
5528        unsafe {
5529            mask64x4 {
5530                val: core::mem::transmute(a.val),
5531                simd: self,
5532            }
5533        }
5534    }
5535    #[inline(always)]
5536    fn cvt_to_bytes_mask64x4(self, a: mask64x4<Self>) -> u8x32<Self> {
5537        unsafe {
5538            u8x32 {
5539                val: core::mem::transmute(a.val),
5540                simd: self,
5541            }
5542        }
5543    }
5544    #[inline(always)]
5545    fn slide_mask64x4<const SHIFT: usize>(
5546        self,
5547        a: mask64x4<Self>,
5548        b: mask64x4<Self>,
5549    ) -> mask64x4<Self> {
5550        unsafe {
5551            if SHIFT >= 4usize {
5552                return b;
5553            }
5554            let result = cross_block_alignr_256x1(
5555                self.cvt_to_bytes_mask64x4(b).val.0,
5556                self.cvt_to_bytes_mask64x4(a).val.0,
5557                SHIFT * 8usize,
5558            );
5559            self.cvt_from_bytes_mask64x4(u8x32 {
5560                val: crate::support::Aligned256(result),
5561                simd: self,
5562            })
5563        }
5564    }
5565    #[inline(always)]
5566    fn slide_within_blocks_mask64x4<const SHIFT: usize>(
5567        self,
5568        a: mask64x4<Self>,
5569        b: mask64x4<Self>,
5570    ) -> mask64x4<Self> {
5571        unsafe {
5572            if SHIFT >= 2usize {
5573                return b;
5574            }
5575            let result = dyn_alignr_256(
5576                self.cvt_to_bytes_mask64x4(b).val.0,
5577                self.cvt_to_bytes_mask64x4(a).val.0,
5578                SHIFT * 8usize,
5579            );
5580            self.cvt_from_bytes_mask64x4(u8x32 {
5581                val: crate::support::Aligned256(result),
5582                simd: self,
5583            })
5584        }
5585    }
5586    #[inline(always)]
5587    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5588        unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
5589    }
5590    #[inline(always)]
5591    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5592        unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
5593    }
5594    #[inline(always)]
5595    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5596        unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
5597    }
5598    #[inline(always)]
5599    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
5600        a ^ !0
5601    }
5602    #[inline(always)]
5603    fn select_mask64x4(
5604        self,
5605        a: mask64x4<Self>,
5606        b: mask64x4<Self>,
5607        c: mask64x4<Self>,
5608    ) -> mask64x4<Self> {
5609        unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
5610    }
5611    #[inline(always)]
5612    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5613        unsafe { _mm256_cmpeq_epi64(a.into(), b.into()).simd_into(self) }
5614    }
5615    #[inline(always)]
5616    fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
5617        unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0 }
5618    }
5619    #[inline(always)]
5620    fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
5621        unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0b1111 }
5622    }
5623    #[inline(always)]
5624    fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
5625        unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0b1111 }
5626    }
5627    #[inline(always)]
5628    fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
5629        unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0 }
5630    }
5631    #[inline(always)]
5632    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
5633        mask64x8 {
5634            val: crate::support::Aligned512([a.val.0, b.val.0]),
5635            simd: self,
5636        }
5637    }
5638    #[inline(always)]
5639    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
5640        unsafe {
5641            (
5642                _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
5643                _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
5644            )
5645        }
5646    }
5647    #[inline(always)]
5648    fn splat_f32x16(self, val: f32) -> f32x16<Self> {
5649        let half = self.splat_f32x8(val);
5650        self.combine_f32x8(half, half)
5651    }
5652    #[inline(always)]
5653    fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
5654        f32x16 {
5655            val: unsafe { core::mem::transmute_copy(&val) },
5656            simd: self,
5657        }
5658    }
5659    #[inline(always)]
5660    fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
5661        f32x16 {
5662            val: unsafe { core::mem::transmute_copy(val) },
5663            simd: self,
5664        }
5665    }
5666    #[inline(always)]
5667    fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
5668        unsafe { core::mem::transmute::<[__m256; 2usize], [f32; 16usize]>(a.val.0) }
5669    }
5670    #[inline(always)]
5671    fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
5672        unsafe { core::mem::transmute::<&[__m256; 2usize], &[f32; 16usize]>(&a.val.0) }
5673    }
5674    #[inline(always)]
5675    fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
5676        unsafe { core::mem::transmute::<&mut [__m256; 2usize], &mut [f32; 16usize]>(&mut a.val.0) }
5677    }
5678    #[inline(always)]
5679    fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
5680        unsafe {
5681            core::ptr::copy_nonoverlapping(
5682                (&raw const a.val.0) as *const f32,
5683                dest.as_mut_ptr(),
5684                16usize,
5685            );
5686        }
5687    }
5688    #[inline(always)]
5689    fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
5690        unsafe {
5691            f32x16 {
5692                val: core::mem::transmute(a.val),
5693                simd: self,
5694            }
5695        }
5696    }
5697    #[inline(always)]
5698    fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
5699        unsafe {
5700            u8x64 {
5701                val: core::mem::transmute(a.val),
5702                simd: self,
5703            }
5704        }
5705    }
5706    #[inline(always)]
5707    fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5708        unsafe {
5709            if SHIFT >= 16usize {
5710                return b;
5711            }
5712            let result = cross_block_alignr_256x2(
5713                self.cvt_to_bytes_f32x16(b).val.0,
5714                self.cvt_to_bytes_f32x16(a).val.0,
5715                SHIFT * 4usize,
5716            );
5717            self.cvt_from_bytes_f32x16(u8x64 {
5718                val: crate::support::Aligned512(result),
5719                simd: self,
5720            })
5721        }
5722    }
5723    #[inline(always)]
5724    fn slide_within_blocks_f32x16<const SHIFT: usize>(
5725        self,
5726        a: f32x16<Self>,
5727        b: f32x16<Self>,
5728    ) -> f32x16<Self> {
5729        let (a0, a1) = self.split_f32x16(a);
5730        let (b0, b1) = self.split_f32x16(b);
5731        self.combine_f32x8(
5732            self.slide_within_blocks_f32x8::<SHIFT>(a0, b0),
5733            self.slide_within_blocks_f32x8::<SHIFT>(a1, b1),
5734        )
5735    }
5736    #[inline(always)]
5737    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5738        let (a0, a1) = self.split_f32x16(a);
5739        self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
5740    }
5741    #[inline(always)]
5742    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5743        let (a0, a1) = self.split_f32x16(a);
5744        self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
5745    }
5746    #[inline(always)]
5747    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5748        let (a0, a1) = self.split_f32x16(a);
5749        self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
5750    }
5751    #[inline(always)]
5752    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5753        let (a0, a1) = self.split_f32x16(a);
5754        let (b0, b1) = self.split_f32x16(b);
5755        self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
5756    }
5757    #[inline(always)]
5758    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5759        let (a0, a1) = self.split_f32x16(a);
5760        let (b0, b1) = self.split_f32x16(b);
5761        self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
5762    }
5763    #[inline(always)]
5764    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5765        let (a0, a1) = self.split_f32x16(a);
5766        let (b0, b1) = self.split_f32x16(b);
5767        self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
5768    }
5769    #[inline(always)]
5770    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5771        let (a0, a1) = self.split_f32x16(a);
5772        let (b0, b1) = self.split_f32x16(b);
5773        self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
5774    }
5775    #[inline(always)]
5776    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5777        let (a0, a1) = self.split_f32x16(a);
5778        let (b0, b1) = self.split_f32x16(b);
5779        self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
5780    }
5781    #[inline(always)]
5782    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5783        let (a0, a1) = self.split_f32x16(a);
5784        let (b0, b1) = self.split_f32x16(b);
5785        self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
5786    }
5787    #[inline(always)]
5788    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5789        let (a0, a1) = self.split_f32x16(a);
5790        let (b0, b1) = self.split_f32x16(b);
5791        self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
5792    }
5793    #[inline(always)]
5794    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5795        let (a0, a1) = self.split_f32x16(a);
5796        let (b0, b1) = self.split_f32x16(b);
5797        self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
5798    }
5799    #[inline(always)]
5800    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5801        let (a0, a1) = self.split_f32x16(a);
5802        let (b0, b1) = self.split_f32x16(b);
5803        self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
5804    }
5805    #[inline(always)]
5806    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5807        let (a0, a1) = self.split_f32x16(a);
5808        let (b0, b1) = self.split_f32x16(b);
5809        self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
5810    }
5811    #[inline(always)]
5812    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5813        let (a0, _) = self.split_f32x16(a);
5814        let (b0, _) = self.split_f32x16(b);
5815        self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
5816    }
5817    #[inline(always)]
5818    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5819        let (_, a1) = self.split_f32x16(a);
5820        let (_, b1) = self.split_f32x16(b);
5821        self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
5822    }
5823    #[inline(always)]
5824    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5825        let (a0, a1) = self.split_f32x16(a);
5826        let (b0, b1) = self.split_f32x16(b);
5827        self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
5828    }
5829    #[inline(always)]
5830    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5831        let (a0, a1) = self.split_f32x16(a);
5832        let (b0, b1) = self.split_f32x16(b);
5833        self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
5834    }
5835    #[inline(always)]
5836    fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
5837        let (a0, a1) = self.split_f32x16(a);
5838        let (b0, b1) = self.split_f32x16(b);
5839        let lo_lo = self.zip_low_f32x8(a0, b0);
5840        let lo_hi = self.zip_high_f32x8(a0, b0);
5841        let hi_lo = self.zip_low_f32x8(a1, b1);
5842        let hi_hi = self.zip_high_f32x8(a1, b1);
5843        (
5844            self.combine_f32x8(lo_lo, lo_hi),
5845            self.combine_f32x8(hi_lo, hi_hi),
5846        )
5847    }
5848    #[inline(always)]
5849    fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
5850        let (a0, a1) = self.split_f32x16(a);
5851        let (b0, b1) = self.split_f32x16(b);
5852        let lo_even = self.unzip_low_f32x8(a0, a1);
5853        let lo_odd = self.unzip_high_f32x8(a0, a1);
5854        let hi_even = self.unzip_low_f32x8(b0, b1);
5855        let hi_odd = self.unzip_high_f32x8(b0, b1);
5856        (
5857            self.combine_f32x8(lo_even, hi_even),
5858            self.combine_f32x8(lo_odd, hi_odd),
5859        )
5860    }
5861    #[inline(always)]
5862    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5863        let (a0, a1) = self.split_f32x16(a);
5864        let (b0, b1) = self.split_f32x16(b);
5865        self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
5866    }
5867    #[inline(always)]
5868    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5869        let (a0, a1) = self.split_f32x16(a);
5870        let (b0, b1) = self.split_f32x16(b);
5871        self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
5872    }
5873    #[inline(always)]
5874    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5875        let (a0, a1) = self.split_f32x16(a);
5876        let (b0, b1) = self.split_f32x16(b);
5877        self.combine_f32x8(
5878            self.max_precise_f32x8(a0, b0),
5879            self.max_precise_f32x8(a1, b1),
5880        )
5881    }
5882    #[inline(always)]
5883    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5884        let (a0, a1) = self.split_f32x16(a);
5885        let (b0, b1) = self.split_f32x16(b);
5886        self.combine_f32x8(
5887            self.min_precise_f32x8(a0, b0),
5888            self.min_precise_f32x8(a1, b1),
5889        )
5890    }
5891    #[inline(always)]
5892    fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
5893        let (a0, a1) = self.split_f32x16(a);
5894        let (b0, b1) = self.split_f32x16(b);
5895        let (c0, c1) = self.split_f32x16(c);
5896        self.combine_f32x8(
5897            self.mul_add_f32x8(a0, b0, c0),
5898            self.mul_add_f32x8(a1, b1, c1),
5899        )
5900    }
5901    #[inline(always)]
5902    fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
5903        let (a0, a1) = self.split_f32x16(a);
5904        let (b0, b1) = self.split_f32x16(b);
5905        let (c0, c1) = self.split_f32x16(c);
5906        self.combine_f32x8(
5907            self.mul_sub_f32x8(a0, b0, c0),
5908            self.mul_sub_f32x8(a1, b1, c1),
5909        )
5910    }
5911    #[inline(always)]
5912    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5913        let (a0, a1) = self.split_f32x16(a);
5914        self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
5915    }
5916    #[inline(always)]
5917    fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5918        let (a0, a1) = self.split_f32x16(a);
5919        self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1))
5920    }
5921    #[inline(always)]
5922    fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5923        let (a0, a1) = self.split_f32x16(a);
5924        self.combine_f32x8(
5925            self.round_ties_even_f32x8(a0),
5926            self.round_ties_even_f32x8(a1),
5927        )
5928    }
5929    #[inline(always)]
5930    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5931        let (a0, a1) = self.split_f32x16(a);
5932        self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
5933    }
5934    #[inline(always)]
5935    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5936        let (a0, a1) = self.split_f32x16(a);
5937        self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
5938    }
5939    #[inline(always)]
5940    fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
5941        let (a0, a1) = self.split_mask32x16(a);
5942        let (b0, b1) = self.split_f32x16(b);
5943        let (c0, c1) = self.split_f32x16(c);
5944        self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
5945    }
5946    #[inline(always)]
5947    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
5948        (
5949            f32x8 {
5950                val: crate::support::Aligned256(a.val.0[0]),
5951                simd: self,
5952            },
5953            f32x8 {
5954                val: crate::support::Aligned256(a.val.0[1]),
5955                simd: self,
5956            },
5957        )
5958    }
5959    #[inline(always)]
5960    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
5961        let (a0, a1) = self.split_f32x16(a);
5962        self.combine_f64x4(
5963            self.reinterpret_f64_f32x8(a0),
5964            self.reinterpret_f64_f32x8(a1),
5965        )
5966    }
5967    #[inline(always)]
5968    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
5969        let (a0, a1) = self.split_f32x16(a);
5970        self.combine_i32x8(
5971            self.reinterpret_i32_f32x8(a0),
5972            self.reinterpret_i32_f32x8(a1),
5973        )
5974    }
5975    #[inline(always)]
5976    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
5977        unsafe {
5978            let v0 = _mm_loadu_ps(src.as_ptr() as *const _);
5979            let v1 = _mm_loadu_ps(src.as_ptr().add(4usize) as *const _);
5980            let v2 = _mm_loadu_ps(src.as_ptr().add(2 * 4usize) as *const _);
5981            let v3 = _mm_loadu_ps(src.as_ptr().add(3 * 4usize) as *const _);
5982            let tmp0 = _mm_unpacklo_ps(v0, v1);
5983            let tmp1 = _mm_unpackhi_ps(v0, v1);
5984            let tmp2 = _mm_unpacklo_ps(v2, v3);
5985            let tmp3 = _mm_unpackhi_ps(v2, v3);
5986            let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
5987            let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
5988            let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
5989            let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
5990            self.combine_f32x8(
5991                self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)),
5992                self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)),
5993            )
5994        }
5995    }
5996    #[inline(always)]
5997    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
5998        let (v01, v23) = self.split_f32x16(a);
5999        let (v0, v1) = self.split_f32x8(v01);
6000        let (v2, v3) = self.split_f32x8(v23);
6001        let v0 = v0.into();
6002        let v1 = v1.into();
6003        let v2 = v2.into();
6004        let v3 = v3.into();
6005        unsafe {
6006            let tmp0 = _mm_unpacklo_ps(v0, v1);
6007            let tmp1 = _mm_unpackhi_ps(v0, v1);
6008            let tmp2 = _mm_unpacklo_ps(v2, v3);
6009            let tmp3 = _mm_unpackhi_ps(v2, v3);
6010            let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
6011            let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
6012            let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
6013            let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
6014            _mm_storeu_ps(dest.as_mut_ptr() as *mut _, out0);
6015            _mm_storeu_ps(dest.as_mut_ptr().add(4usize) as *mut _, out1);
6016            _mm_storeu_ps(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
6017            _mm_storeu_ps(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
6018        }
6019    }
6020    #[inline(always)]
6021    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
6022        let (a0, a1) = self.split_f32x16(a);
6023        self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
6024    }
6025    #[inline(always)]
6026    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
6027        let (a0, a1) = self.split_f32x16(a);
6028        self.combine_u32x8(
6029            self.reinterpret_u32_f32x8(a0),
6030            self.reinterpret_u32_f32x8(a1),
6031        )
6032    }
6033    #[inline(always)]
6034    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
6035        let (a0, a1) = self.split_f32x16(a);
6036        self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
6037    }
6038    #[inline(always)]
6039    fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
6040        let (a0, a1) = self.split_f32x16(a);
6041        self.combine_u32x8(
6042            self.cvt_u32_precise_f32x8(a0),
6043            self.cvt_u32_precise_f32x8(a1),
6044        )
6045    }
6046    #[inline(always)]
6047    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
6048        let (a0, a1) = self.split_f32x16(a);
6049        self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
6050    }
6051    #[inline(always)]
6052    fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
6053        let (a0, a1) = self.split_f32x16(a);
6054        self.combine_i32x8(
6055            self.cvt_i32_precise_f32x8(a0),
6056            self.cvt_i32_precise_f32x8(a1),
6057        )
6058    }
6059    #[inline(always)]
6060    fn splat_i8x64(self, val: i8) -> i8x64<Self> {
6061        let half = self.splat_i8x32(val);
6062        self.combine_i8x32(half, half)
6063    }
6064    #[inline(always)]
6065    fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
6066        i8x64 {
6067            val: unsafe { core::mem::transmute_copy(&val) },
6068            simd: self,
6069        }
6070    }
6071    #[inline(always)]
6072    fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
6073        i8x64 {
6074            val: unsafe { core::mem::transmute_copy(val) },
6075            simd: self,
6076        }
6077    }
6078    #[inline(always)]
6079    fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
6080        unsafe { core::mem::transmute::<[__m256i; 2usize], [i8; 64usize]>(a.val.0) }
6081    }
6082    #[inline(always)]
6083    fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
6084        unsafe { core::mem::transmute::<&[__m256i; 2usize], &[i8; 64usize]>(&a.val.0) }
6085    }
6086    #[inline(always)]
6087    fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
6088        unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [i8; 64usize]>(&mut a.val.0) }
6089    }
6090    #[inline(always)]
6091    fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
6092        unsafe {
6093            core::ptr::copy_nonoverlapping(
6094                (&raw const a.val.0) as *const i8,
6095                dest.as_mut_ptr(),
6096                64usize,
6097            );
6098        }
6099    }
6100    #[inline(always)]
6101    fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
6102        unsafe {
6103            i8x64 {
6104                val: core::mem::transmute(a.val),
6105                simd: self,
6106            }
6107        }
6108    }
6109    #[inline(always)]
6110    fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
6111        unsafe {
6112            u8x64 {
6113                val: core::mem::transmute(a.val),
6114                simd: self,
6115            }
6116        }
6117    }
6118    #[inline(always)]
6119    fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6120        unsafe {
6121            if SHIFT >= 64usize {
6122                return b;
6123            }
6124            let result = cross_block_alignr_256x2(
6125                self.cvt_to_bytes_i8x64(b).val.0,
6126                self.cvt_to_bytes_i8x64(a).val.0,
6127                SHIFT,
6128            );
6129            self.cvt_from_bytes_i8x64(u8x64 {
6130                val: crate::support::Aligned512(result),
6131                simd: self,
6132            })
6133        }
6134    }
6135    #[inline(always)]
6136    fn slide_within_blocks_i8x64<const SHIFT: usize>(
6137        self,
6138        a: i8x64<Self>,
6139        b: i8x64<Self>,
6140    ) -> i8x64<Self> {
6141        let (a0, a1) = self.split_i8x64(a);
6142        let (b0, b1) = self.split_i8x64(b);
6143        self.combine_i8x32(
6144            self.slide_within_blocks_i8x32::<SHIFT>(a0, b0),
6145            self.slide_within_blocks_i8x32::<SHIFT>(a1, b1),
6146        )
6147    }
6148    #[inline(always)]
6149    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6150        let (a0, a1) = self.split_i8x64(a);
6151        let (b0, b1) = self.split_i8x64(b);
6152        self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
6153    }
6154    #[inline(always)]
6155    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6156        let (a0, a1) = self.split_i8x64(a);
6157        let (b0, b1) = self.split_i8x64(b);
6158        self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
6159    }
6160    #[inline(always)]
6161    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6162        let (a0, a1) = self.split_i8x64(a);
6163        let (b0, b1) = self.split_i8x64(b);
6164        self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
6165    }
6166    #[inline(always)]
6167    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6168        let (a0, a1) = self.split_i8x64(a);
6169        let (b0, b1) = self.split_i8x64(b);
6170        self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
6171    }
6172    #[inline(always)]
6173    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6174        let (a0, a1) = self.split_i8x64(a);
6175        let (b0, b1) = self.split_i8x64(b);
6176        self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
6177    }
6178    #[inline(always)]
6179    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6180        let (a0, a1) = self.split_i8x64(a);
6181        let (b0, b1) = self.split_i8x64(b);
6182        self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
6183    }
6184    #[inline(always)]
6185    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
6186        let (a0, a1) = self.split_i8x64(a);
6187        self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
6188    }
6189    #[inline(always)]
6190    fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
6191        let (a0, a1) = self.split_i8x64(a);
6192        self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift))
6193    }
6194    #[inline(always)]
6195    fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6196        let (a0, a1) = self.split_i8x64(a);
6197        let (b0, b1) = self.split_i8x64(b);
6198        self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1))
6199    }
6200    #[inline(always)]
6201    fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
6202        let (a0, a1) = self.split_i8x64(a);
6203        self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift))
6204    }
6205    #[inline(always)]
6206    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6207        let (a0, a1) = self.split_i8x64(a);
6208        let (b0, b1) = self.split_i8x64(b);
6209        self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
6210    }
6211    #[inline(always)]
6212    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6213        let (a0, a1) = self.split_i8x64(a);
6214        let (b0, b1) = self.split_i8x64(b);
6215        self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
6216    }
6217    #[inline(always)]
6218    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6219        let (a0, a1) = self.split_i8x64(a);
6220        let (b0, b1) = self.split_i8x64(b);
6221        self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
6222    }
6223    #[inline(always)]
6224    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6225        let (a0, a1) = self.split_i8x64(a);
6226        let (b0, b1) = self.split_i8x64(b);
6227        self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
6228    }
6229    #[inline(always)]
6230    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6231        let (a0, a1) = self.split_i8x64(a);
6232        let (b0, b1) = self.split_i8x64(b);
6233        self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
6234    }
6235    #[inline(always)]
6236    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6237        let (a0, a1) = self.split_i8x64(a);
6238        let (b0, b1) = self.split_i8x64(b);
6239        self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
6240    }
6241    #[inline(always)]
6242    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6243        let (a0, _) = self.split_i8x64(a);
6244        let (b0, _) = self.split_i8x64(b);
6245        self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
6246    }
6247    #[inline(always)]
6248    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6249        let (_, a1) = self.split_i8x64(a);
6250        let (_, b1) = self.split_i8x64(b);
6251        self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
6252    }
6253    #[inline(always)]
6254    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6255        let (a0, a1) = self.split_i8x64(a);
6256        let (b0, b1) = self.split_i8x64(b);
6257        self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
6258    }
6259    #[inline(always)]
6260    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6261        let (a0, a1) = self.split_i8x64(a);
6262        let (b0, b1) = self.split_i8x64(b);
6263        self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
6264    }
6265    #[inline(always)]
6266    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
6267        let (a0, a1) = self.split_i8x64(a);
6268        let (b0, b1) = self.split_i8x64(b);
6269        let lo_lo = self.zip_low_i8x32(a0, b0);
6270        let lo_hi = self.zip_high_i8x32(a0, b0);
6271        let hi_lo = self.zip_low_i8x32(a1, b1);
6272        let hi_hi = self.zip_high_i8x32(a1, b1);
6273        (
6274            self.combine_i8x32(lo_lo, lo_hi),
6275            self.combine_i8x32(hi_lo, hi_hi),
6276        )
6277    }
6278    #[inline(always)]
6279    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
6280        let (a0, a1) = self.split_i8x64(a);
6281        let (b0, b1) = self.split_i8x64(b);
6282        let lo_even = self.unzip_low_i8x32(a0, a1);
6283        let lo_odd = self.unzip_high_i8x32(a0, a1);
6284        let hi_even = self.unzip_low_i8x32(b0, b1);
6285        let hi_odd = self.unzip_high_i8x32(b0, b1);
6286        (
6287            self.combine_i8x32(lo_even, hi_even),
6288            self.combine_i8x32(lo_odd, hi_odd),
6289        )
6290    }
6291    #[inline(always)]
6292    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
6293        let (a0, a1) = self.split_mask8x64(a);
6294        let (b0, b1) = self.split_i8x64(b);
6295        let (c0, c1) = self.split_i8x64(c);
6296        self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
6297    }
6298    #[inline(always)]
6299    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6300        let (a0, a1) = self.split_i8x64(a);
6301        let (b0, b1) = self.split_i8x64(b);
6302        self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
6303    }
6304    #[inline(always)]
6305    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6306        let (a0, a1) = self.split_i8x64(a);
6307        let (b0, b1) = self.split_i8x64(b);
6308        self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
6309    }
6310    #[inline(always)]
6311    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
6312        (
6313            i8x32 {
6314                val: crate::support::Aligned256(a.val.0[0]),
6315                simd: self,
6316            },
6317            i8x32 {
6318                val: crate::support::Aligned256(a.val.0[1]),
6319                simd: self,
6320            },
6321        )
6322    }
6323    #[inline(always)]
6324    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
6325        let (a0, a1) = self.split_i8x64(a);
6326        self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
6327    }
6328    #[inline(always)]
6329    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
6330        let (a0, a1) = self.split_i8x64(a);
6331        self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
6332    }
6333    #[inline(always)]
6334    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
6335        let (a0, a1) = self.split_i8x64(a);
6336        self.combine_u32x8(
6337            self.reinterpret_u32_i8x32(a0),
6338            self.reinterpret_u32_i8x32(a1),
6339        )
6340    }
6341    #[inline(always)]
6342    fn splat_u8x64(self, val: u8) -> u8x64<Self> {
6343        let half = self.splat_u8x32(val);
6344        self.combine_u8x32(half, half)
6345    }
6346    #[inline(always)]
6347    fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
6348        u8x64 {
6349            val: unsafe { core::mem::transmute_copy(&val) },
6350            simd: self,
6351        }
6352    }
6353    #[inline(always)]
6354    fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
6355        u8x64 {
6356            val: unsafe { core::mem::transmute_copy(val) },
6357            simd: self,
6358        }
6359    }
6360    #[inline(always)]
6361    fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
6362        unsafe { core::mem::transmute::<[__m256i; 2usize], [u8; 64usize]>(a.val.0) }
6363    }
6364    #[inline(always)]
6365    fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
6366        unsafe { core::mem::transmute::<&[__m256i; 2usize], &[u8; 64usize]>(&a.val.0) }
6367    }
6368    #[inline(always)]
6369    fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
6370        unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [u8; 64usize]>(&mut a.val.0) }
6371    }
6372    #[inline(always)]
6373    fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
6374        unsafe {
6375            core::ptr::copy_nonoverlapping(
6376                (&raw const a.val.0) as *const u8,
6377                dest.as_mut_ptr(),
6378                64usize,
6379            );
6380        }
6381    }
6382    #[inline(always)]
6383    fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
6384        unsafe {
6385            u8x64 {
6386                val: core::mem::transmute(a.val),
6387                simd: self,
6388            }
6389        }
6390    }
6391    #[inline(always)]
6392    fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
6393        unsafe {
6394            u8x64 {
6395                val: core::mem::transmute(a.val),
6396                simd: self,
6397            }
6398        }
6399    }
6400    #[inline(always)]
6401    fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6402        unsafe {
6403            if SHIFT >= 64usize {
6404                return b;
6405            }
6406            let result = cross_block_alignr_256x2(
6407                self.cvt_to_bytes_u8x64(b).val.0,
6408                self.cvt_to_bytes_u8x64(a).val.0,
6409                SHIFT,
6410            );
6411            self.cvt_from_bytes_u8x64(u8x64 {
6412                val: crate::support::Aligned512(result),
6413                simd: self,
6414            })
6415        }
6416    }
6417    #[inline(always)]
6418    fn slide_within_blocks_u8x64<const SHIFT: usize>(
6419        self,
6420        a: u8x64<Self>,
6421        b: u8x64<Self>,
6422    ) -> u8x64<Self> {
6423        let (a0, a1) = self.split_u8x64(a);
6424        let (b0, b1) = self.split_u8x64(b);
6425        self.combine_u8x32(
6426            self.slide_within_blocks_u8x32::<SHIFT>(a0, b0),
6427            self.slide_within_blocks_u8x32::<SHIFT>(a1, b1),
6428        )
6429    }
6430    #[inline(always)]
6431    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6432        let (a0, a1) = self.split_u8x64(a);
6433        let (b0, b1) = self.split_u8x64(b);
6434        self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
6435    }
6436    #[inline(always)]
6437    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6438        let (a0, a1) = self.split_u8x64(a);
6439        let (b0, b1) = self.split_u8x64(b);
6440        self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
6441    }
6442    #[inline(always)]
6443    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6444        let (a0, a1) = self.split_u8x64(a);
6445        let (b0, b1) = self.split_u8x64(b);
6446        self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
6447    }
6448    #[inline(always)]
6449    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6450        let (a0, a1) = self.split_u8x64(a);
6451        let (b0, b1) = self.split_u8x64(b);
6452        self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
6453    }
6454    #[inline(always)]
6455    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6456        let (a0, a1) = self.split_u8x64(a);
6457        let (b0, b1) = self.split_u8x64(b);
6458        self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
6459    }
6460    #[inline(always)]
6461    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6462        let (a0, a1) = self.split_u8x64(a);
6463        let (b0, b1) = self.split_u8x64(b);
6464        self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
6465    }
6466    #[inline(always)]
6467    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
6468        let (a0, a1) = self.split_u8x64(a);
6469        self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
6470    }
6471    #[inline(always)]
6472    fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
6473        let (a0, a1) = self.split_u8x64(a);
6474        self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift))
6475    }
6476    #[inline(always)]
6477    fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6478        let (a0, a1) = self.split_u8x64(a);
6479        let (b0, b1) = self.split_u8x64(b);
6480        self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1))
6481    }
6482    #[inline(always)]
6483    fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
6484        let (a0, a1) = self.split_u8x64(a);
6485        self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift))
6486    }
6487    #[inline(always)]
6488    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6489        let (a0, a1) = self.split_u8x64(a);
6490        let (b0, b1) = self.split_u8x64(b);
6491        self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
6492    }
6493    #[inline(always)]
6494    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6495        let (a0, a1) = self.split_u8x64(a);
6496        let (b0, b1) = self.split_u8x64(b);
6497        self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
6498    }
6499    #[inline(always)]
6500    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6501        let (a0, a1) = self.split_u8x64(a);
6502        let (b0, b1) = self.split_u8x64(b);
6503        self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
6504    }
6505    #[inline(always)]
6506    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6507        let (a0, a1) = self.split_u8x64(a);
6508        let (b0, b1) = self.split_u8x64(b);
6509        self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
6510    }
6511    #[inline(always)]
6512    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6513        let (a0, a1) = self.split_u8x64(a);
6514        let (b0, b1) = self.split_u8x64(b);
6515        self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
6516    }
6517    #[inline(always)]
6518    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6519        let (a0, a1) = self.split_u8x64(a);
6520        let (b0, b1) = self.split_u8x64(b);
6521        self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
6522    }
6523    #[inline(always)]
6524    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6525        let (a0, _) = self.split_u8x64(a);
6526        let (b0, _) = self.split_u8x64(b);
6527        self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
6528    }
6529    #[inline(always)]
6530    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6531        let (_, a1) = self.split_u8x64(a);
6532        let (_, b1) = self.split_u8x64(b);
6533        self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
6534    }
6535    #[inline(always)]
6536    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6537        let (a0, a1) = self.split_u8x64(a);
6538        let (b0, b1) = self.split_u8x64(b);
6539        self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
6540    }
6541    #[inline(always)]
6542    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6543        let (a0, a1) = self.split_u8x64(a);
6544        let (b0, b1) = self.split_u8x64(b);
6545        self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
6546    }
6547    #[inline(always)]
6548    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
6549        let (a0, a1) = self.split_u8x64(a);
6550        let (b0, b1) = self.split_u8x64(b);
6551        let lo_lo = self.zip_low_u8x32(a0, b0);
6552        let lo_hi = self.zip_high_u8x32(a0, b0);
6553        let hi_lo = self.zip_low_u8x32(a1, b1);
6554        let hi_hi = self.zip_high_u8x32(a1, b1);
6555        (
6556            self.combine_u8x32(lo_lo, lo_hi),
6557            self.combine_u8x32(hi_lo, hi_hi),
6558        )
6559    }
6560    #[inline(always)]
6561    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
6562        let (a0, a1) = self.split_u8x64(a);
6563        let (b0, b1) = self.split_u8x64(b);
6564        let lo_even = self.unzip_low_u8x32(a0, a1);
6565        let lo_odd = self.unzip_high_u8x32(a0, a1);
6566        let hi_even = self.unzip_low_u8x32(b0, b1);
6567        let hi_odd = self.unzip_high_u8x32(b0, b1);
6568        (
6569            self.combine_u8x32(lo_even, hi_even),
6570            self.combine_u8x32(lo_odd, hi_odd),
6571        )
6572    }
6573    #[inline(always)]
6574    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
6575        let (a0, a1) = self.split_mask8x64(a);
6576        let (b0, b1) = self.split_u8x64(b);
6577        let (c0, c1) = self.split_u8x64(c);
6578        self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
6579    }
6580    #[inline(always)]
6581    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6582        let (a0, a1) = self.split_u8x64(a);
6583        let (b0, b1) = self.split_u8x64(b);
6584        self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
6585    }
6586    #[inline(always)]
6587    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6588        let (a0, a1) = self.split_u8x64(a);
6589        let (b0, b1) = self.split_u8x64(b);
6590        self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
6591    }
6592    #[inline(always)]
6593    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
6594        (
6595            u8x32 {
6596                val: crate::support::Aligned256(a.val.0[0]),
6597                simd: self,
6598            },
6599            u8x32 {
6600                val: crate::support::Aligned256(a.val.0[1]),
6601                simd: self,
6602            },
6603        )
6604    }
6605    #[inline(always)]
6606    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
6607        unsafe {
6608            let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
6609            let v1 = _mm_loadu_si128(src.as_ptr().add(16usize) as *const _);
6610            let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 16usize) as *const _);
6611            let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 16usize) as *const _);
6612            let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
6613            let v0 = _mm_shuffle_epi8(v0, mask);
6614            let v1 = _mm_shuffle_epi8(v1, mask);
6615            let v2 = _mm_shuffle_epi8(v2, mask);
6616            let v3 = _mm_shuffle_epi8(v3, mask);
6617            let tmp0 = _mm_unpacklo_epi32(v0, v1);
6618            let tmp1 = _mm_unpackhi_epi32(v0, v1);
6619            let tmp2 = _mm_unpacklo_epi32(v2, v3);
6620            let tmp3 = _mm_unpackhi_epi32(v2, v3);
6621            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
6622            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
6623            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
6624            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
6625            self.combine_u8x32(
6626                self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)),
6627                self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)),
6628            )
6629        }
6630    }
6631    #[inline(always)]
6632    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
6633        let (v01, v23) = self.split_u8x64(a);
6634        let (v0, v1) = self.split_u8x32(v01);
6635        let (v2, v3) = self.split_u8x32(v23);
6636        let v0 = v0.into();
6637        let v1 = v1.into();
6638        let v2 = v2.into();
6639        let v3 = v3.into();
6640        unsafe {
6641            let tmp0 = _mm_unpacklo_epi32(v0, v1);
6642            let tmp1 = _mm_unpackhi_epi32(v0, v1);
6643            let tmp2 = _mm_unpacklo_epi32(v2, v3);
6644            let tmp3 = _mm_unpackhi_epi32(v2, v3);
6645            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
6646            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
6647            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
6648            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
6649            let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
6650            let out0 = _mm_shuffle_epi8(out0, mask);
6651            let out1 = _mm_shuffle_epi8(out1, mask);
6652            let out2 = _mm_shuffle_epi8(out2, mask);
6653            let out3 = _mm_shuffle_epi8(out3, mask);
6654            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
6655            _mm_storeu_si128(dest.as_mut_ptr().add(16usize) as *mut _, out1);
6656            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 16usize) as *mut _, out2);
6657            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 16usize) as *mut _, out3);
6658        }
6659    }
6660    #[inline(always)]
6661    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
6662        let (a0, a1) = self.split_u8x64(a);
6663        self.combine_u32x8(
6664            self.reinterpret_u32_u8x32(a0),
6665            self.reinterpret_u32_u8x32(a1),
6666        )
6667    }
6668    #[inline(always)]
6669    fn splat_mask8x64(self, val: i8) -> mask8x64<Self> {
6670        let half = self.splat_mask8x32(val);
6671        self.combine_mask8x32(half, half)
6672    }
6673    #[inline(always)]
6674    fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
6675        mask8x64 {
6676            val: unsafe { core::mem::transmute_copy(&val) },
6677            simd: self,
6678        }
6679    }
6680    #[inline(always)]
6681    fn load_array_ref_mask8x64(self, val: &[i8; 64usize]) -> mask8x64<Self> {
6682        mask8x64 {
6683            val: unsafe { core::mem::transmute_copy(val) },
6684            simd: self,
6685        }
6686    }
6687    #[inline(always)]
6688    fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
6689        unsafe { core::mem::transmute::<[__m256i; 2usize], [i8; 64usize]>(a.val.0) }
6690    }
6691    #[inline(always)]
6692    fn as_array_ref_mask8x64(self, a: &mask8x64<Self>) -> &[i8; 64usize] {
6693        unsafe { core::mem::transmute::<&[__m256i; 2usize], &[i8; 64usize]>(&a.val.0) }
6694    }
6695    #[inline(always)]
6696    fn as_array_mut_mask8x64(self, a: &mut mask8x64<Self>) -> &mut [i8; 64usize] {
6697        unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [i8; 64usize]>(&mut a.val.0) }
6698    }
6699    #[inline(always)]
6700    fn store_array_mask8x64(self, a: mask8x64<Self>, dest: &mut [i8; 64usize]) -> () {
6701        unsafe {
6702            core::ptr::copy_nonoverlapping(
6703                (&raw const a.val.0) as *const i8,
6704                dest.as_mut_ptr(),
6705                64usize,
6706            );
6707        }
6708    }
6709    #[inline(always)]
6710    fn cvt_from_bytes_mask8x64(self, a: u8x64<Self>) -> mask8x64<Self> {
6711        unsafe {
6712            mask8x64 {
6713                val: core::mem::transmute(a.val),
6714                simd: self,
6715            }
6716        }
6717    }
6718    #[inline(always)]
6719    fn cvt_to_bytes_mask8x64(self, a: mask8x64<Self>) -> u8x64<Self> {
6720        unsafe {
6721            u8x64 {
6722                val: core::mem::transmute(a.val),
6723                simd: self,
6724            }
6725        }
6726    }
6727    #[inline(always)]
6728    fn slide_mask8x64<const SHIFT: usize>(
6729        self,
6730        a: mask8x64<Self>,
6731        b: mask8x64<Self>,
6732    ) -> mask8x64<Self> {
6733        unsafe {
6734            if SHIFT >= 64usize {
6735                return b;
6736            }
6737            let result = cross_block_alignr_256x2(
6738                self.cvt_to_bytes_mask8x64(b).val.0,
6739                self.cvt_to_bytes_mask8x64(a).val.0,
6740                SHIFT,
6741            );
6742            self.cvt_from_bytes_mask8x64(u8x64 {
6743                val: crate::support::Aligned512(result),
6744                simd: self,
6745            })
6746        }
6747    }
6748    #[inline(always)]
6749    fn slide_within_blocks_mask8x64<const SHIFT: usize>(
6750        self,
6751        a: mask8x64<Self>,
6752        b: mask8x64<Self>,
6753    ) -> mask8x64<Self> {
6754        let (a0, a1) = self.split_mask8x64(a);
6755        let (b0, b1) = self.split_mask8x64(b);
6756        self.combine_mask8x32(
6757            self.slide_within_blocks_mask8x32::<SHIFT>(a0, b0),
6758            self.slide_within_blocks_mask8x32::<SHIFT>(a1, b1),
6759        )
6760    }
6761    #[inline(always)]
6762    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6763        let (a0, a1) = self.split_mask8x64(a);
6764        let (b0, b1) = self.split_mask8x64(b);
6765        self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
6766    }
6767    #[inline(always)]
6768    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6769        let (a0, a1) = self.split_mask8x64(a);
6770        let (b0, b1) = self.split_mask8x64(b);
6771        self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
6772    }
6773    #[inline(always)]
6774    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6775        let (a0, a1) = self.split_mask8x64(a);
6776        let (b0, b1) = self.split_mask8x64(b);
6777        self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
6778    }
6779    #[inline(always)]
6780    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
6781        let (a0, a1) = self.split_mask8x64(a);
6782        self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
6783    }
6784    #[inline(always)]
6785    fn select_mask8x64(
6786        self,
6787        a: mask8x64<Self>,
6788        b: mask8x64<Self>,
6789        c: mask8x64<Self>,
6790    ) -> mask8x64<Self> {
6791        let (a0, a1) = self.split_mask8x64(a);
6792        let (b0, b1) = self.split_mask8x64(b);
6793        let (c0, c1) = self.split_mask8x64(c);
6794        self.combine_mask8x32(
6795            self.select_mask8x32(a0, b0, c0),
6796            self.select_mask8x32(a1, b1, c1),
6797        )
6798    }
6799    #[inline(always)]
6800    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6801        let (a0, a1) = self.split_mask8x64(a);
6802        let (b0, b1) = self.split_mask8x64(b);
6803        self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
6804    }
6805    #[inline(always)]
6806    fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
6807        let (a0, a1) = self.split_mask8x64(a);
6808        self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1)
6809    }
6810    #[inline(always)]
6811    fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
6812        let (a0, a1) = self.split_mask8x64(a);
6813        self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1)
6814    }
6815    #[inline(always)]
6816    fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
6817        let (a0, a1) = self.split_mask8x64(a);
6818        self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1)
6819    }
6820    #[inline(always)]
6821    fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
6822        let (a0, a1) = self.split_mask8x64(a);
6823        self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1)
6824    }
6825    #[inline(always)]
6826    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
6827        (
6828            mask8x32 {
6829                val: crate::support::Aligned256(a.val.0[0]),
6830                simd: self,
6831            },
6832            mask8x32 {
6833                val: crate::support::Aligned256(a.val.0[1]),
6834                simd: self,
6835            },
6836        )
6837    }
6838    #[inline(always)]
6839    fn splat_i16x32(self, val: i16) -> i16x32<Self> {
6840        let half = self.splat_i16x16(val);
6841        self.combine_i16x16(half, half)
6842    }
6843    #[inline(always)]
6844    fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
6845        i16x32 {
6846            val: unsafe { core::mem::transmute_copy(&val) },
6847            simd: self,
6848        }
6849    }
6850    #[inline(always)]
6851    fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
6852        i16x32 {
6853            val: unsafe { core::mem::transmute_copy(val) },
6854            simd: self,
6855        }
6856    }
6857    #[inline(always)]
6858    fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
6859        unsafe { core::mem::transmute::<[__m256i; 2usize], [i16; 32usize]>(a.val.0) }
6860    }
6861    #[inline(always)]
6862    fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
6863        unsafe { core::mem::transmute::<&[__m256i; 2usize], &[i16; 32usize]>(&a.val.0) }
6864    }
6865    #[inline(always)]
6866    fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
6867        unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [i16; 32usize]>(&mut a.val.0) }
6868    }
6869    #[inline(always)]
6870    fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
6871        unsafe {
6872            core::ptr::copy_nonoverlapping(
6873                (&raw const a.val.0) as *const i16,
6874                dest.as_mut_ptr(),
6875                32usize,
6876            );
6877        }
6878    }
6879    #[inline(always)]
6880    fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
6881        unsafe {
6882            i16x32 {
6883                val: core::mem::transmute(a.val),
6884                simd: self,
6885            }
6886        }
6887    }
6888    #[inline(always)]
6889    fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
6890        unsafe {
6891            u8x64 {
6892                val: core::mem::transmute(a.val),
6893                simd: self,
6894            }
6895        }
6896    }
6897    #[inline(always)]
6898    fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6899        unsafe {
6900            if SHIFT >= 32usize {
6901                return b;
6902            }
6903            let result = cross_block_alignr_256x2(
6904                self.cvt_to_bytes_i16x32(b).val.0,
6905                self.cvt_to_bytes_i16x32(a).val.0,
6906                SHIFT * 2usize,
6907            );
6908            self.cvt_from_bytes_i16x32(u8x64 {
6909                val: crate::support::Aligned512(result),
6910                simd: self,
6911            })
6912        }
6913    }
6914    #[inline(always)]
6915    fn slide_within_blocks_i16x32<const SHIFT: usize>(
6916        self,
6917        a: i16x32<Self>,
6918        b: i16x32<Self>,
6919    ) -> i16x32<Self> {
6920        let (a0, a1) = self.split_i16x32(a);
6921        let (b0, b1) = self.split_i16x32(b);
6922        self.combine_i16x16(
6923            self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
6924            self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
6925        )
6926    }
6927    #[inline(always)]
6928    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6929        let (a0, a1) = self.split_i16x32(a);
6930        let (b0, b1) = self.split_i16x32(b);
6931        self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
6932    }
6933    #[inline(always)]
6934    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6935        let (a0, a1) = self.split_i16x32(a);
6936        let (b0, b1) = self.split_i16x32(b);
6937        self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
6938    }
6939    #[inline(always)]
6940    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6941        let (a0, a1) = self.split_i16x32(a);
6942        let (b0, b1) = self.split_i16x32(b);
6943        self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
6944    }
6945    #[inline(always)]
6946    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6947        let (a0, a1) = self.split_i16x32(a);
6948        let (b0, b1) = self.split_i16x32(b);
6949        self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
6950    }
6951    #[inline(always)]
6952    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6953        let (a0, a1) = self.split_i16x32(a);
6954        let (b0, b1) = self.split_i16x32(b);
6955        self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
6956    }
6957    #[inline(always)]
6958    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6959        let (a0, a1) = self.split_i16x32(a);
6960        let (b0, b1) = self.split_i16x32(b);
6961        self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
6962    }
6963    #[inline(always)]
6964    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
6965        let (a0, a1) = self.split_i16x32(a);
6966        self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
6967    }
6968    #[inline(always)]
6969    fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
6970        let (a0, a1) = self.split_i16x32(a);
6971        self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift))
6972    }
6973    #[inline(always)]
6974    fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6975        let (a0, a1) = self.split_i16x32(a);
6976        let (b0, b1) = self.split_i16x32(b);
6977        self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1))
6978    }
6979    #[inline(always)]
6980    fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
6981        let (a0, a1) = self.split_i16x32(a);
6982        self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift))
6983    }
6984    #[inline(always)]
6985    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6986        let (a0, a1) = self.split_i16x32(a);
6987        let (b0, b1) = self.split_i16x32(b);
6988        self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
6989    }
6990    #[inline(always)]
6991    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
6992        let (a0, a1) = self.split_i16x32(a);
6993        let (b0, b1) = self.split_i16x32(b);
6994        self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
6995    }
6996    #[inline(always)]
6997    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
6998        let (a0, a1) = self.split_i16x32(a);
6999        let (b0, b1) = self.split_i16x32(b);
7000        self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
7001    }
7002    #[inline(always)]
7003    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7004        let (a0, a1) = self.split_i16x32(a);
7005        let (b0, b1) = self.split_i16x32(b);
7006        self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
7007    }
7008    #[inline(always)]
7009    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7010        let (a0, a1) = self.split_i16x32(a);
7011        let (b0, b1) = self.split_i16x32(b);
7012        self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
7013    }
7014    #[inline(always)]
7015    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7016        let (a0, a1) = self.split_i16x32(a);
7017        let (b0, b1) = self.split_i16x32(b);
7018        self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
7019    }
7020    #[inline(always)]
7021    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7022        let (a0, _) = self.split_i16x32(a);
7023        let (b0, _) = self.split_i16x32(b);
7024        self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
7025    }
7026    #[inline(always)]
7027    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7028        let (_, a1) = self.split_i16x32(a);
7029        let (_, b1) = self.split_i16x32(b);
7030        self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
7031    }
7032    #[inline(always)]
7033    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7034        let (a0, a1) = self.split_i16x32(a);
7035        let (b0, b1) = self.split_i16x32(b);
7036        self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
7037    }
7038    #[inline(always)]
7039    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7040        let (a0, a1) = self.split_i16x32(a);
7041        let (b0, b1) = self.split_i16x32(b);
7042        self.combine_i16x16(
7043            self.unzip_high_i16x16(a0, a1),
7044            self.unzip_high_i16x16(b0, b1),
7045        )
7046    }
7047    #[inline(always)]
7048    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
7049        let (a0, a1) = self.split_i16x32(a);
7050        let (b0, b1) = self.split_i16x32(b);
7051        let lo_lo = self.zip_low_i16x16(a0, b0);
7052        let lo_hi = self.zip_high_i16x16(a0, b0);
7053        let hi_lo = self.zip_low_i16x16(a1, b1);
7054        let hi_hi = self.zip_high_i16x16(a1, b1);
7055        (
7056            self.combine_i16x16(lo_lo, lo_hi),
7057            self.combine_i16x16(hi_lo, hi_hi),
7058        )
7059    }
7060    #[inline(always)]
7061    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
7062        let (a0, a1) = self.split_i16x32(a);
7063        let (b0, b1) = self.split_i16x32(b);
7064        let lo_even = self.unzip_low_i16x16(a0, a1);
7065        let lo_odd = self.unzip_high_i16x16(a0, a1);
7066        let hi_even = self.unzip_low_i16x16(b0, b1);
7067        let hi_odd = self.unzip_high_i16x16(b0, b1);
7068        (
7069            self.combine_i16x16(lo_even, hi_even),
7070            self.combine_i16x16(lo_odd, hi_odd),
7071        )
7072    }
7073    #[inline(always)]
7074    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
7075        let (a0, a1) = self.split_mask16x32(a);
7076        let (b0, b1) = self.split_i16x32(b);
7077        let (c0, c1) = self.split_i16x32(c);
7078        self.combine_i16x16(
7079            self.select_i16x16(a0, b0, c0),
7080            self.select_i16x16(a1, b1, c1),
7081        )
7082    }
7083    #[inline(always)]
7084    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7085        let (a0, a1) = self.split_i16x32(a);
7086        let (b0, b1) = self.split_i16x32(b);
7087        self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
7088    }
7089    #[inline(always)]
7090    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7091        let (a0, a1) = self.split_i16x32(a);
7092        let (b0, b1) = self.split_i16x32(b);
7093        self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
7094    }
7095    #[inline(always)]
7096    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
7097        (
7098            i16x16 {
7099                val: crate::support::Aligned256(a.val.0[0]),
7100                simd: self,
7101            },
7102            i16x16 {
7103                val: crate::support::Aligned256(a.val.0[1]),
7104                simd: self,
7105            },
7106        )
7107    }
7108    #[inline(always)]
7109    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
7110        let (a0, a1) = self.split_i16x32(a);
7111        self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
7112    }
7113    #[inline(always)]
7114    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
7115        let (a0, a1) = self.split_i16x32(a);
7116        self.combine_u8x32(
7117            self.reinterpret_u8_i16x16(a0),
7118            self.reinterpret_u8_i16x16(a1),
7119        )
7120    }
7121    #[inline(always)]
7122    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
7123        let (a0, a1) = self.split_i16x32(a);
7124        self.combine_u32x8(
7125            self.reinterpret_u32_i16x16(a0),
7126            self.reinterpret_u32_i16x16(a1),
7127        )
7128    }
7129    #[inline(always)]
7130    fn splat_u16x32(self, val: u16) -> u16x32<Self> {
7131        let half = self.splat_u16x16(val);
7132        self.combine_u16x16(half, half)
7133    }
7134    #[inline(always)]
7135    fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
7136        u16x32 {
7137            val: unsafe { core::mem::transmute_copy(&val) },
7138            simd: self,
7139        }
7140    }
7141    #[inline(always)]
7142    fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
7143        u16x32 {
7144            val: unsafe { core::mem::transmute_copy(val) },
7145            simd: self,
7146        }
7147    }
7148    #[inline(always)]
7149    fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
7150        unsafe { core::mem::transmute::<[__m256i; 2usize], [u16; 32usize]>(a.val.0) }
7151    }
7152    #[inline(always)]
7153    fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
7154        unsafe { core::mem::transmute::<&[__m256i; 2usize], &[u16; 32usize]>(&a.val.0) }
7155    }
7156    #[inline(always)]
7157    fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
7158        unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [u16; 32usize]>(&mut a.val.0) }
7159    }
7160    #[inline(always)]
7161    fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
7162        unsafe {
7163            core::ptr::copy_nonoverlapping(
7164                (&raw const a.val.0) as *const u16,
7165                dest.as_mut_ptr(),
7166                32usize,
7167            );
7168        }
7169    }
7170    #[inline(always)]
7171    fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
7172        unsafe {
7173            u16x32 {
7174                val: core::mem::transmute(a.val),
7175                simd: self,
7176            }
7177        }
7178    }
7179    #[inline(always)]
7180    fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
7181        unsafe {
7182            u8x64 {
7183                val: core::mem::transmute(a.val),
7184                simd: self,
7185            }
7186        }
7187    }
7188    #[inline(always)]
7189    fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7190        unsafe {
7191            if SHIFT >= 32usize {
7192                return b;
7193            }
7194            let result = cross_block_alignr_256x2(
7195                self.cvt_to_bytes_u16x32(b).val.0,
7196                self.cvt_to_bytes_u16x32(a).val.0,
7197                SHIFT * 2usize,
7198            );
7199            self.cvt_from_bytes_u16x32(u8x64 {
7200                val: crate::support::Aligned512(result),
7201                simd: self,
7202            })
7203        }
7204    }
7205    #[inline(always)]
7206    fn slide_within_blocks_u16x32<const SHIFT: usize>(
7207        self,
7208        a: u16x32<Self>,
7209        b: u16x32<Self>,
7210    ) -> u16x32<Self> {
7211        let (a0, a1) = self.split_u16x32(a);
7212        let (b0, b1) = self.split_u16x32(b);
7213        self.combine_u16x16(
7214            self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
7215            self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
7216        )
7217    }
7218    #[inline(always)]
7219    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7220        let (a0, a1) = self.split_u16x32(a);
7221        let (b0, b1) = self.split_u16x32(b);
7222        self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
7223    }
7224    #[inline(always)]
7225    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7226        let (a0, a1) = self.split_u16x32(a);
7227        let (b0, b1) = self.split_u16x32(b);
7228        self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
7229    }
7230    #[inline(always)]
7231    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7232        let (a0, a1) = self.split_u16x32(a);
7233        let (b0, b1) = self.split_u16x32(b);
7234        self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
7235    }
7236    #[inline(always)]
7237    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7238        let (a0, a1) = self.split_u16x32(a);
7239        let (b0, b1) = self.split_u16x32(b);
7240        self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
7241    }
7242    #[inline(always)]
7243    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7244        let (a0, a1) = self.split_u16x32(a);
7245        let (b0, b1) = self.split_u16x32(b);
7246        self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
7247    }
7248    #[inline(always)]
7249    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7250        let (a0, a1) = self.split_u16x32(a);
7251        let (b0, b1) = self.split_u16x32(b);
7252        self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
7253    }
7254    #[inline(always)]
7255    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
7256        let (a0, a1) = self.split_u16x32(a);
7257        self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
7258    }
7259    #[inline(always)]
7260    fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
7261        let (a0, a1) = self.split_u16x32(a);
7262        self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift))
7263    }
7264    #[inline(always)]
7265    fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7266        let (a0, a1) = self.split_u16x32(a);
7267        let (b0, b1) = self.split_u16x32(b);
7268        self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1))
7269    }
7270    #[inline(always)]
7271    fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
7272        let (a0, a1) = self.split_u16x32(a);
7273        self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift))
7274    }
7275    #[inline(always)]
7276    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7277        let (a0, a1) = self.split_u16x32(a);
7278        let (b0, b1) = self.split_u16x32(b);
7279        self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
7280    }
7281    #[inline(always)]
7282    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7283        let (a0, a1) = self.split_u16x32(a);
7284        let (b0, b1) = self.split_u16x32(b);
7285        self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
7286    }
7287    #[inline(always)]
7288    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7289        let (a0, a1) = self.split_u16x32(a);
7290        let (b0, b1) = self.split_u16x32(b);
7291        self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
7292    }
7293    #[inline(always)]
7294    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7295        let (a0, a1) = self.split_u16x32(a);
7296        let (b0, b1) = self.split_u16x32(b);
7297        self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
7298    }
7299    #[inline(always)]
7300    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7301        let (a0, a1) = self.split_u16x32(a);
7302        let (b0, b1) = self.split_u16x32(b);
7303        self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
7304    }
7305    #[inline(always)]
7306    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7307        let (a0, a1) = self.split_u16x32(a);
7308        let (b0, b1) = self.split_u16x32(b);
7309        self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
7310    }
7311    #[inline(always)]
7312    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7313        let (a0, _) = self.split_u16x32(a);
7314        let (b0, _) = self.split_u16x32(b);
7315        self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
7316    }
7317    #[inline(always)]
7318    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7319        let (_, a1) = self.split_u16x32(a);
7320        let (_, b1) = self.split_u16x32(b);
7321        self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
7322    }
7323    #[inline(always)]
7324    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7325        let (a0, a1) = self.split_u16x32(a);
7326        let (b0, b1) = self.split_u16x32(b);
7327        self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
7328    }
7329    #[inline(always)]
7330    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7331        let (a0, a1) = self.split_u16x32(a);
7332        let (b0, b1) = self.split_u16x32(b);
7333        self.combine_u16x16(
7334            self.unzip_high_u16x16(a0, a1),
7335            self.unzip_high_u16x16(b0, b1),
7336        )
7337    }
7338    #[inline(always)]
7339    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
7340        let (a0, a1) = self.split_u16x32(a);
7341        let (b0, b1) = self.split_u16x32(b);
7342        let lo_lo = self.zip_low_u16x16(a0, b0);
7343        let lo_hi = self.zip_high_u16x16(a0, b0);
7344        let hi_lo = self.zip_low_u16x16(a1, b1);
7345        let hi_hi = self.zip_high_u16x16(a1, b1);
7346        (
7347            self.combine_u16x16(lo_lo, lo_hi),
7348            self.combine_u16x16(hi_lo, hi_hi),
7349        )
7350    }
7351    #[inline(always)]
7352    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
7353        let (a0, a1) = self.split_u16x32(a);
7354        let (b0, b1) = self.split_u16x32(b);
7355        let lo_even = self.unzip_low_u16x16(a0, a1);
7356        let lo_odd = self.unzip_high_u16x16(a0, a1);
7357        let hi_even = self.unzip_low_u16x16(b0, b1);
7358        let hi_odd = self.unzip_high_u16x16(b0, b1);
7359        (
7360            self.combine_u16x16(lo_even, hi_even),
7361            self.combine_u16x16(lo_odd, hi_odd),
7362        )
7363    }
7364    #[inline(always)]
7365    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
7366        let (a0, a1) = self.split_mask16x32(a);
7367        let (b0, b1) = self.split_u16x32(b);
7368        let (c0, c1) = self.split_u16x32(c);
7369        self.combine_u16x16(
7370            self.select_u16x16(a0, b0, c0),
7371            self.select_u16x16(a1, b1, c1),
7372        )
7373    }
7374    #[inline(always)]
7375    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7376        let (a0, a1) = self.split_u16x32(a);
7377        let (b0, b1) = self.split_u16x32(b);
7378        self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
7379    }
7380    #[inline(always)]
7381    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7382        let (a0, a1) = self.split_u16x32(a);
7383        let (b0, b1) = self.split_u16x32(b);
7384        self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
7385    }
7386    #[inline(always)]
7387    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
7388        (
7389            u16x16 {
7390                val: crate::support::Aligned256(a.val.0[0]),
7391                simd: self,
7392            },
7393            u16x16 {
7394                val: crate::support::Aligned256(a.val.0[1]),
7395                simd: self,
7396            },
7397        )
7398    }
7399    #[inline(always)]
7400    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
7401        unsafe {
7402            let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
7403            let v1 = _mm_loadu_si128(src.as_ptr().add(8usize) as *const _);
7404            let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 8usize) as *const _);
7405            let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 8usize) as *const _);
7406            let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
7407            let v0 = _mm_shuffle_epi8(v0, mask);
7408            let v1 = _mm_shuffle_epi8(v1, mask);
7409            let v2 = _mm_shuffle_epi8(v2, mask);
7410            let v3 = _mm_shuffle_epi8(v3, mask);
7411            let tmp0 = _mm_unpacklo_epi32(v0, v1);
7412            let tmp1 = _mm_unpackhi_epi32(v0, v1);
7413            let tmp2 = _mm_unpacklo_epi32(v2, v3);
7414            let tmp3 = _mm_unpackhi_epi32(v2, v3);
7415            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
7416            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
7417            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
7418            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
7419            self.combine_u16x16(
7420                self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)),
7421                self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)),
7422            )
7423        }
7424    }
7425    #[inline(always)]
7426    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
7427        let (v01, v23) = self.split_u16x32(a);
7428        let (v0, v1) = self.split_u16x16(v01);
7429        let (v2, v3) = self.split_u16x16(v23);
7430        let v0 = v0.into();
7431        let v1 = v1.into();
7432        let v2 = v2.into();
7433        let v3 = v3.into();
7434        unsafe {
7435            let tmp0 = _mm_unpacklo_epi32(v0, v1);
7436            let tmp1 = _mm_unpackhi_epi32(v0, v1);
7437            let tmp2 = _mm_unpacklo_epi32(v2, v3);
7438            let tmp3 = _mm_unpackhi_epi32(v2, v3);
7439            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
7440            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
7441            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
7442            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
7443            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
7444            let out0 = _mm_shuffle_epi8(out0, mask);
7445            let out1 = _mm_shuffle_epi8(out1, mask);
7446            let out2 = _mm_shuffle_epi8(out2, mask);
7447            let out3 = _mm_shuffle_epi8(out3, mask);
7448            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
7449            _mm_storeu_si128(dest.as_mut_ptr().add(8usize) as *mut _, out1);
7450            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 8usize) as *mut _, out2);
7451            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 8usize) as *mut _, out3);
7452        }
7453    }
7454    #[inline(always)]
7455    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
7456        let (a, b) = self.split_u16x32(a);
7457        unsafe {
7458            let mask = _mm256_set1_epi16(0xFF);
7459            let lo_masked = _mm256_and_si256(a.into(), mask);
7460            let hi_masked = _mm256_and_si256(b.into(), mask);
7461            let result = _mm256_permute4x64_epi64::<0b_11_01_10_00>(_mm256_packus_epi16(
7462                lo_masked, hi_masked,
7463            ));
7464            result.simd_into(self)
7465        }
7466    }
7467    #[inline(always)]
7468    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
7469        let (a0, a1) = self.split_u16x32(a);
7470        self.combine_u8x32(
7471            self.reinterpret_u8_u16x16(a0),
7472            self.reinterpret_u8_u16x16(a1),
7473        )
7474    }
7475    #[inline(always)]
7476    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
7477        let (a0, a1) = self.split_u16x32(a);
7478        self.combine_u32x8(
7479            self.reinterpret_u32_u16x16(a0),
7480            self.reinterpret_u32_u16x16(a1),
7481        )
7482    }
7483    #[inline(always)]
7484    fn splat_mask16x32(self, val: i16) -> mask16x32<Self> {
7485        let half = self.splat_mask16x16(val);
7486        self.combine_mask16x16(half, half)
7487    }
7488    #[inline(always)]
7489    fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
7490        mask16x32 {
7491            val: unsafe { core::mem::transmute_copy(&val) },
7492            simd: self,
7493        }
7494    }
7495    #[inline(always)]
7496    fn load_array_ref_mask16x32(self, val: &[i16; 32usize]) -> mask16x32<Self> {
7497        mask16x32 {
7498            val: unsafe { core::mem::transmute_copy(val) },
7499            simd: self,
7500        }
7501    }
7502    #[inline(always)]
7503    fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
7504        unsafe { core::mem::transmute::<[__m256i; 2usize], [i16; 32usize]>(a.val.0) }
7505    }
7506    #[inline(always)]
7507    fn as_array_ref_mask16x32(self, a: &mask16x32<Self>) -> &[i16; 32usize] {
7508        unsafe { core::mem::transmute::<&[__m256i; 2usize], &[i16; 32usize]>(&a.val.0) }
7509    }
7510    #[inline(always)]
7511    fn as_array_mut_mask16x32(self, a: &mut mask16x32<Self>) -> &mut [i16; 32usize] {
7512        unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [i16; 32usize]>(&mut a.val.0) }
7513    }
7514    #[inline(always)]
7515    fn store_array_mask16x32(self, a: mask16x32<Self>, dest: &mut [i16; 32usize]) -> () {
7516        unsafe {
7517            core::ptr::copy_nonoverlapping(
7518                (&raw const a.val.0) as *const i16,
7519                dest.as_mut_ptr(),
7520                32usize,
7521            );
7522        }
7523    }
7524    #[inline(always)]
7525    fn cvt_from_bytes_mask16x32(self, a: u8x64<Self>) -> mask16x32<Self> {
7526        unsafe {
7527            mask16x32 {
7528                val: core::mem::transmute(a.val),
7529                simd: self,
7530            }
7531        }
7532    }
7533    #[inline(always)]
7534    fn cvt_to_bytes_mask16x32(self, a: mask16x32<Self>) -> u8x64<Self> {
7535        unsafe {
7536            u8x64 {
7537                val: core::mem::transmute(a.val),
7538                simd: self,
7539            }
7540        }
7541    }
7542    #[inline(always)]
7543    fn slide_mask16x32<const SHIFT: usize>(
7544        self,
7545        a: mask16x32<Self>,
7546        b: mask16x32<Self>,
7547    ) -> mask16x32<Self> {
7548        unsafe {
7549            if SHIFT >= 32usize {
7550                return b;
7551            }
7552            let result = cross_block_alignr_256x2(
7553                self.cvt_to_bytes_mask16x32(b).val.0,
7554                self.cvt_to_bytes_mask16x32(a).val.0,
7555                SHIFT * 2usize,
7556            );
7557            self.cvt_from_bytes_mask16x32(u8x64 {
7558                val: crate::support::Aligned512(result),
7559                simd: self,
7560            })
7561        }
7562    }
7563    #[inline(always)]
7564    fn slide_within_blocks_mask16x32<const SHIFT: usize>(
7565        self,
7566        a: mask16x32<Self>,
7567        b: mask16x32<Self>,
7568    ) -> mask16x32<Self> {
7569        let (a0, a1) = self.split_mask16x32(a);
7570        let (b0, b1) = self.split_mask16x32(b);
7571        self.combine_mask16x16(
7572            self.slide_within_blocks_mask16x16::<SHIFT>(a0, b0),
7573            self.slide_within_blocks_mask16x16::<SHIFT>(a1, b1),
7574        )
7575    }
7576    #[inline(always)]
7577    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7578        let (a0, a1) = self.split_mask16x32(a);
7579        let (b0, b1) = self.split_mask16x32(b);
7580        self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
7581    }
7582    #[inline(always)]
7583    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7584        let (a0, a1) = self.split_mask16x32(a);
7585        let (b0, b1) = self.split_mask16x32(b);
7586        self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
7587    }
7588    #[inline(always)]
7589    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7590        let (a0, a1) = self.split_mask16x32(a);
7591        let (b0, b1) = self.split_mask16x32(b);
7592        self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
7593    }
7594    #[inline(always)]
7595    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
7596        let (a0, a1) = self.split_mask16x32(a);
7597        self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
7598    }
7599    #[inline(always)]
7600    fn select_mask16x32(
7601        self,
7602        a: mask16x32<Self>,
7603        b: mask16x32<Self>,
7604        c: mask16x32<Self>,
7605    ) -> mask16x32<Self> {
7606        let (a0, a1) = self.split_mask16x32(a);
7607        let (b0, b1) = self.split_mask16x32(b);
7608        let (c0, c1) = self.split_mask16x32(c);
7609        self.combine_mask16x16(
7610            self.select_mask16x16(a0, b0, c0),
7611            self.select_mask16x16(a1, b1, c1),
7612        )
7613    }
7614    #[inline(always)]
7615    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7616        let (a0, a1) = self.split_mask16x32(a);
7617        let (b0, b1) = self.split_mask16x32(b);
7618        self.combine_mask16x16(
7619            self.simd_eq_mask16x16(a0, b0),
7620            self.simd_eq_mask16x16(a1, b1),
7621        )
7622    }
7623    #[inline(always)]
7624    fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
7625        let (a0, a1) = self.split_mask16x32(a);
7626        self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1)
7627    }
7628    #[inline(always)]
7629    fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
7630        let (a0, a1) = self.split_mask16x32(a);
7631        self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1)
7632    }
7633    #[inline(always)]
7634    fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
7635        let (a0, a1) = self.split_mask16x32(a);
7636        self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1)
7637    }
7638    #[inline(always)]
7639    fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
7640        let (a0, a1) = self.split_mask16x32(a);
7641        self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1)
7642    }
7643    #[inline(always)]
7644    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
7645        (
7646            mask16x16 {
7647                val: crate::support::Aligned256(a.val.0[0]),
7648                simd: self,
7649            },
7650            mask16x16 {
7651                val: crate::support::Aligned256(a.val.0[1]),
7652                simd: self,
7653            },
7654        )
7655    }
7656    #[inline(always)]
7657    fn splat_i32x16(self, val: i32) -> i32x16<Self> {
7658        let half = self.splat_i32x8(val);
7659        self.combine_i32x8(half, half)
7660    }
7661    #[inline(always)]
7662    fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
7663        i32x16 {
7664            val: unsafe { core::mem::transmute_copy(&val) },
7665            simd: self,
7666        }
7667    }
7668    #[inline(always)]
7669    fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
7670        i32x16 {
7671            val: unsafe { core::mem::transmute_copy(val) },
7672            simd: self,
7673        }
7674    }
7675    #[inline(always)]
7676    fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
7677        unsafe { core::mem::transmute::<[__m256i; 2usize], [i32; 16usize]>(a.val.0) }
7678    }
7679    #[inline(always)]
7680    fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
7681        unsafe { core::mem::transmute::<&[__m256i; 2usize], &[i32; 16usize]>(&a.val.0) }
7682    }
7683    #[inline(always)]
7684    fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
7685        unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [i32; 16usize]>(&mut a.val.0) }
7686    }
7687    #[inline(always)]
7688    fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
7689        unsafe {
7690            core::ptr::copy_nonoverlapping(
7691                (&raw const a.val.0) as *const i32,
7692                dest.as_mut_ptr(),
7693                16usize,
7694            );
7695        }
7696    }
7697    #[inline(always)]
7698    fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
7699        unsafe {
7700            i32x16 {
7701                val: core::mem::transmute(a.val),
7702                simd: self,
7703            }
7704        }
7705    }
7706    #[inline(always)]
7707    fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
7708        unsafe {
7709            u8x64 {
7710                val: core::mem::transmute(a.val),
7711                simd: self,
7712            }
7713        }
7714    }
7715    #[inline(always)]
7716    fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7717        unsafe {
7718            if SHIFT >= 16usize {
7719                return b;
7720            }
7721            let result = cross_block_alignr_256x2(
7722                self.cvt_to_bytes_i32x16(b).val.0,
7723                self.cvt_to_bytes_i32x16(a).val.0,
7724                SHIFT * 4usize,
7725            );
7726            self.cvt_from_bytes_i32x16(u8x64 {
7727                val: crate::support::Aligned512(result),
7728                simd: self,
7729            })
7730        }
7731    }
7732    #[inline(always)]
7733    fn slide_within_blocks_i32x16<const SHIFT: usize>(
7734        self,
7735        a: i32x16<Self>,
7736        b: i32x16<Self>,
7737    ) -> i32x16<Self> {
7738        let (a0, a1) = self.split_i32x16(a);
7739        let (b0, b1) = self.split_i32x16(b);
7740        self.combine_i32x8(
7741            self.slide_within_blocks_i32x8::<SHIFT>(a0, b0),
7742            self.slide_within_blocks_i32x8::<SHIFT>(a1, b1),
7743        )
7744    }
7745    #[inline(always)]
7746    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7747        let (a0, a1) = self.split_i32x16(a);
7748        let (b0, b1) = self.split_i32x16(b);
7749        self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
7750    }
7751    #[inline(always)]
7752    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7753        let (a0, a1) = self.split_i32x16(a);
7754        let (b0, b1) = self.split_i32x16(b);
7755        self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
7756    }
7757    #[inline(always)]
7758    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7759        let (a0, a1) = self.split_i32x16(a);
7760        let (b0, b1) = self.split_i32x16(b);
7761        self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
7762    }
7763    #[inline(always)]
7764    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7765        let (a0, a1) = self.split_i32x16(a);
7766        let (b0, b1) = self.split_i32x16(b);
7767        self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
7768    }
7769    #[inline(always)]
7770    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7771        let (a0, a1) = self.split_i32x16(a);
7772        let (b0, b1) = self.split_i32x16(b);
7773        self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
7774    }
7775    #[inline(always)]
7776    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7777        let (a0, a1) = self.split_i32x16(a);
7778        let (b0, b1) = self.split_i32x16(b);
7779        self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
7780    }
7781    #[inline(always)]
7782    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
7783        let (a0, a1) = self.split_i32x16(a);
7784        self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
7785    }
7786    #[inline(always)]
7787    fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
7788        let (a0, a1) = self.split_i32x16(a);
7789        self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift))
7790    }
7791    #[inline(always)]
7792    fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7793        let (a0, a1) = self.split_i32x16(a);
7794        let (b0, b1) = self.split_i32x16(b);
7795        self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1))
7796    }
7797    #[inline(always)]
7798    fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
7799        let (a0, a1) = self.split_i32x16(a);
7800        self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift))
7801    }
7802    #[inline(always)]
7803    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7804        let (a0, a1) = self.split_i32x16(a);
7805        let (b0, b1) = self.split_i32x16(b);
7806        self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1))
7807    }
7808    #[inline(always)]
7809    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7810        let (a0, a1) = self.split_i32x16(a);
7811        let (b0, b1) = self.split_i32x16(b);
7812        self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
7813    }
7814    #[inline(always)]
7815    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7816        let (a0, a1) = self.split_i32x16(a);
7817        let (b0, b1) = self.split_i32x16(b);
7818        self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
7819    }
7820    #[inline(always)]
7821    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7822        let (a0, a1) = self.split_i32x16(a);
7823        let (b0, b1) = self.split_i32x16(b);
7824        self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
7825    }
7826    #[inline(always)]
7827    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7828        let (a0, a1) = self.split_i32x16(a);
7829        let (b0, b1) = self.split_i32x16(b);
7830        self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
7831    }
7832    #[inline(always)]
7833    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7834        let (a0, a1) = self.split_i32x16(a);
7835        let (b0, b1) = self.split_i32x16(b);
7836        self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
7837    }
7838    #[inline(always)]
7839    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7840        let (a0, _) = self.split_i32x16(a);
7841        let (b0, _) = self.split_i32x16(b);
7842        self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
7843    }
7844    #[inline(always)]
7845    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7846        let (_, a1) = self.split_i32x16(a);
7847        let (_, b1) = self.split_i32x16(b);
7848        self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
7849    }
7850    #[inline(always)]
7851    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7852        let (a0, a1) = self.split_i32x16(a);
7853        let (b0, b1) = self.split_i32x16(b);
7854        self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
7855    }
7856    #[inline(always)]
7857    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7858        let (a0, a1) = self.split_i32x16(a);
7859        let (b0, b1) = self.split_i32x16(b);
7860        self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
7861    }
7862    #[inline(always)]
7863    fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
7864        let (a0, a1) = self.split_i32x16(a);
7865        let (b0, b1) = self.split_i32x16(b);
7866        let lo_lo = self.zip_low_i32x8(a0, b0);
7867        let lo_hi = self.zip_high_i32x8(a0, b0);
7868        let hi_lo = self.zip_low_i32x8(a1, b1);
7869        let hi_hi = self.zip_high_i32x8(a1, b1);
7870        (
7871            self.combine_i32x8(lo_lo, lo_hi),
7872            self.combine_i32x8(hi_lo, hi_hi),
7873        )
7874    }
7875    #[inline(always)]
7876    fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
7877        let (a0, a1) = self.split_i32x16(a);
7878        let (b0, b1) = self.split_i32x16(b);
7879        let lo_even = self.unzip_low_i32x8(a0, a1);
7880        let lo_odd = self.unzip_high_i32x8(a0, a1);
7881        let hi_even = self.unzip_low_i32x8(b0, b1);
7882        let hi_odd = self.unzip_high_i32x8(b0, b1);
7883        (
7884            self.combine_i32x8(lo_even, hi_even),
7885            self.combine_i32x8(lo_odd, hi_odd),
7886        )
7887    }
7888    #[inline(always)]
7889    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
7890        let (a0, a1) = self.split_mask32x16(a);
7891        let (b0, b1) = self.split_i32x16(b);
7892        let (c0, c1) = self.split_i32x16(c);
7893        self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
7894    }
7895    #[inline(always)]
7896    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7897        let (a0, a1) = self.split_i32x16(a);
7898        let (b0, b1) = self.split_i32x16(b);
7899        self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
7900    }
7901    #[inline(always)]
7902    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7903        let (a0, a1) = self.split_i32x16(a);
7904        let (b0, b1) = self.split_i32x16(b);
7905        self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
7906    }
7907    #[inline(always)]
7908    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
7909        (
7910            i32x8 {
7911                val: crate::support::Aligned256(a.val.0[0]),
7912                simd: self,
7913            },
7914            i32x8 {
7915                val: crate::support::Aligned256(a.val.0[1]),
7916                simd: self,
7917            },
7918        )
7919    }
7920    #[inline(always)]
7921    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
7922        let (a0, a1) = self.split_i32x16(a);
7923        self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1))
7924    }
7925    #[inline(always)]
7926    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
7927        let (a0, a1) = self.split_i32x16(a);
7928        self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
7929    }
7930    #[inline(always)]
7931    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
7932        let (a0, a1) = self.split_i32x16(a);
7933        self.combine_u32x8(
7934            self.reinterpret_u32_i32x8(a0),
7935            self.reinterpret_u32_i32x8(a1),
7936        )
7937    }
7938    #[inline(always)]
7939    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
7940        let (a0, a1) = self.split_i32x16(a);
7941        self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
7942    }
7943    #[inline(always)]
7944    fn splat_u32x16(self, val: u32) -> u32x16<Self> {
7945        let half = self.splat_u32x8(val);
7946        self.combine_u32x8(half, half)
7947    }
7948    #[inline(always)]
7949    fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
7950        u32x16 {
7951            val: unsafe { core::mem::transmute_copy(&val) },
7952            simd: self,
7953        }
7954    }
7955    #[inline(always)]
7956    fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
7957        u32x16 {
7958            val: unsafe { core::mem::transmute_copy(val) },
7959            simd: self,
7960        }
7961    }
7962    #[inline(always)]
7963    fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
7964        unsafe { core::mem::transmute::<[__m256i; 2usize], [u32; 16usize]>(a.val.0) }
7965    }
7966    #[inline(always)]
7967    fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
7968        unsafe { core::mem::transmute::<&[__m256i; 2usize], &[u32; 16usize]>(&a.val.0) }
7969    }
7970    #[inline(always)]
7971    fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
7972        unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [u32; 16usize]>(&mut a.val.0) }
7973    }
7974    #[inline(always)]
7975    fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
7976        unsafe {
7977            core::ptr::copy_nonoverlapping(
7978                (&raw const a.val.0) as *const u32,
7979                dest.as_mut_ptr(),
7980                16usize,
7981            );
7982        }
7983    }
7984    #[inline(always)]
7985    fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
7986        unsafe {
7987            u32x16 {
7988                val: core::mem::transmute(a.val),
7989                simd: self,
7990            }
7991        }
7992    }
7993    #[inline(always)]
7994    fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
7995        unsafe {
7996            u8x64 {
7997                val: core::mem::transmute(a.val),
7998                simd: self,
7999            }
8000        }
8001    }
8002    #[inline(always)]
8003    fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8004        unsafe {
8005            if SHIFT >= 16usize {
8006                return b;
8007            }
8008            let result = cross_block_alignr_256x2(
8009                self.cvt_to_bytes_u32x16(b).val.0,
8010                self.cvt_to_bytes_u32x16(a).val.0,
8011                SHIFT * 4usize,
8012            );
8013            self.cvt_from_bytes_u32x16(u8x64 {
8014                val: crate::support::Aligned512(result),
8015                simd: self,
8016            })
8017        }
8018    }
8019    #[inline(always)]
8020    fn slide_within_blocks_u32x16<const SHIFT: usize>(
8021        self,
8022        a: u32x16<Self>,
8023        b: u32x16<Self>,
8024    ) -> u32x16<Self> {
8025        let (a0, a1) = self.split_u32x16(a);
8026        let (b0, b1) = self.split_u32x16(b);
8027        self.combine_u32x8(
8028            self.slide_within_blocks_u32x8::<SHIFT>(a0, b0),
8029            self.slide_within_blocks_u32x8::<SHIFT>(a1, b1),
8030        )
8031    }
8032    #[inline(always)]
8033    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8034        let (a0, a1) = self.split_u32x16(a);
8035        let (b0, b1) = self.split_u32x16(b);
8036        self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
8037    }
8038    #[inline(always)]
8039    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8040        let (a0, a1) = self.split_u32x16(a);
8041        let (b0, b1) = self.split_u32x16(b);
8042        self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
8043    }
8044    #[inline(always)]
8045    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8046        let (a0, a1) = self.split_u32x16(a);
8047        let (b0, b1) = self.split_u32x16(b);
8048        self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
8049    }
8050    #[inline(always)]
8051    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8052        let (a0, a1) = self.split_u32x16(a);
8053        let (b0, b1) = self.split_u32x16(b);
8054        self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
8055    }
8056    #[inline(always)]
8057    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8058        let (a0, a1) = self.split_u32x16(a);
8059        let (b0, b1) = self.split_u32x16(b);
8060        self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
8061    }
8062    #[inline(always)]
8063    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8064        let (a0, a1) = self.split_u32x16(a);
8065        let (b0, b1) = self.split_u32x16(b);
8066        self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
8067    }
8068    #[inline(always)]
8069    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
8070        let (a0, a1) = self.split_u32x16(a);
8071        self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
8072    }
8073    #[inline(always)]
8074    fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
8075        let (a0, a1) = self.split_u32x16(a);
8076        self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift))
8077    }
8078    #[inline(always)]
8079    fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8080        let (a0, a1) = self.split_u32x16(a);
8081        let (b0, b1) = self.split_u32x16(b);
8082        self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1))
8083    }
8084    #[inline(always)]
8085    fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
8086        let (a0, a1) = self.split_u32x16(a);
8087        self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift))
8088    }
8089    #[inline(always)]
8090    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8091        let (a0, a1) = self.split_u32x16(a);
8092        let (b0, b1) = self.split_u32x16(b);
8093        self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
8094    }
8095    #[inline(always)]
8096    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8097        let (a0, a1) = self.split_u32x16(a);
8098        let (b0, b1) = self.split_u32x16(b);
8099        self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
8100    }
8101    #[inline(always)]
8102    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8103        let (a0, a1) = self.split_u32x16(a);
8104        let (b0, b1) = self.split_u32x16(b);
8105        self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
8106    }
8107    #[inline(always)]
8108    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8109        let (a0, a1) = self.split_u32x16(a);
8110        let (b0, b1) = self.split_u32x16(b);
8111        self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
8112    }
8113    #[inline(always)]
8114    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8115        let (a0, a1) = self.split_u32x16(a);
8116        let (b0, b1) = self.split_u32x16(b);
8117        self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
8118    }
8119    #[inline(always)]
8120    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8121        let (a0, a1) = self.split_u32x16(a);
8122        let (b0, b1) = self.split_u32x16(b);
8123        self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
8124    }
8125    #[inline(always)]
8126    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8127        let (a0, _) = self.split_u32x16(a);
8128        let (b0, _) = self.split_u32x16(b);
8129        self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
8130    }
8131    #[inline(always)]
8132    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8133        let (_, a1) = self.split_u32x16(a);
8134        let (_, b1) = self.split_u32x16(b);
8135        self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
8136    }
8137    #[inline(always)]
8138    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8139        let (a0, a1) = self.split_u32x16(a);
8140        let (b0, b1) = self.split_u32x16(b);
8141        self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
8142    }
8143    #[inline(always)]
8144    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8145        let (a0, a1) = self.split_u32x16(a);
8146        let (b0, b1) = self.split_u32x16(b);
8147        self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
8148    }
8149    #[inline(always)]
8150    fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
8151        let (a0, a1) = self.split_u32x16(a);
8152        let (b0, b1) = self.split_u32x16(b);
8153        let lo_lo = self.zip_low_u32x8(a0, b0);
8154        let lo_hi = self.zip_high_u32x8(a0, b0);
8155        let hi_lo = self.zip_low_u32x8(a1, b1);
8156        let hi_hi = self.zip_high_u32x8(a1, b1);
8157        (
8158            self.combine_u32x8(lo_lo, lo_hi),
8159            self.combine_u32x8(hi_lo, hi_hi),
8160        )
8161    }
8162    #[inline(always)]
8163    fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
8164        let (a0, a1) = self.split_u32x16(a);
8165        let (b0, b1) = self.split_u32x16(b);
8166        let lo_even = self.unzip_low_u32x8(a0, a1);
8167        let lo_odd = self.unzip_high_u32x8(a0, a1);
8168        let hi_even = self.unzip_low_u32x8(b0, b1);
8169        let hi_odd = self.unzip_high_u32x8(b0, b1);
8170        (
8171            self.combine_u32x8(lo_even, hi_even),
8172            self.combine_u32x8(lo_odd, hi_odd),
8173        )
8174    }
8175    #[inline(always)]
8176    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
8177        let (a0, a1) = self.split_mask32x16(a);
8178        let (b0, b1) = self.split_u32x16(b);
8179        let (c0, c1) = self.split_u32x16(c);
8180        self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
8181    }
8182    #[inline(always)]
8183    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8184        let (a0, a1) = self.split_u32x16(a);
8185        let (b0, b1) = self.split_u32x16(b);
8186        self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
8187    }
8188    #[inline(always)]
8189    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8190        let (a0, a1) = self.split_u32x16(a);
8191        let (b0, b1) = self.split_u32x16(b);
8192        self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
8193    }
8194    #[inline(always)]
8195    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
8196        (
8197            u32x8 {
8198                val: crate::support::Aligned256(a.val.0[0]),
8199                simd: self,
8200            },
8201            u32x8 {
8202                val: crate::support::Aligned256(a.val.0[1]),
8203                simd: self,
8204            },
8205        )
8206    }
8207    #[inline(always)]
8208    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
8209        unsafe {
8210            let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
8211            let v1 = _mm_loadu_si128(src.as_ptr().add(4usize) as *const _);
8212            let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 4usize) as *const _);
8213            let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 4usize) as *const _);
8214            let tmp0 = _mm_unpacklo_epi32(v0, v1);
8215            let tmp1 = _mm_unpackhi_epi32(v0, v1);
8216            let tmp2 = _mm_unpacklo_epi32(v2, v3);
8217            let tmp3 = _mm_unpackhi_epi32(v2, v3);
8218            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
8219            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
8220            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
8221            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
8222            self.combine_u32x8(
8223                self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)),
8224                self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)),
8225            )
8226        }
8227    }
8228    #[inline(always)]
8229    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
8230        let (v01, v23) = self.split_u32x16(a);
8231        let (v0, v1) = self.split_u32x8(v01);
8232        let (v2, v3) = self.split_u32x8(v23);
8233        let v0 = v0.into();
8234        let v1 = v1.into();
8235        let v2 = v2.into();
8236        let v3 = v3.into();
8237        unsafe {
8238            let tmp0 = _mm_unpacklo_epi32(v0, v1);
8239            let tmp1 = _mm_unpackhi_epi32(v0, v1);
8240            let tmp2 = _mm_unpacklo_epi32(v2, v3);
8241            let tmp3 = _mm_unpackhi_epi32(v2, v3);
8242            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
8243            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
8244            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
8245            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
8246            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
8247            _mm_storeu_si128(dest.as_mut_ptr().add(4usize) as *mut _, out1);
8248            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
8249            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
8250        }
8251    }
8252    #[inline(always)]
8253    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
8254        let (a0, a1) = self.split_u32x16(a);
8255        self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
8256    }
8257    #[inline(always)]
8258    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
8259        let (a0, a1) = self.split_u32x16(a);
8260        self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
8261    }
8262    #[inline(always)]
8263    fn splat_mask32x16(self, val: i32) -> mask32x16<Self> {
8264        let half = self.splat_mask32x8(val);
8265        self.combine_mask32x8(half, half)
8266    }
8267    #[inline(always)]
8268    fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
8269        mask32x16 {
8270            val: unsafe { core::mem::transmute_copy(&val) },
8271            simd: self,
8272        }
8273    }
8274    #[inline(always)]
8275    fn load_array_ref_mask32x16(self, val: &[i32; 16usize]) -> mask32x16<Self> {
8276        mask32x16 {
8277            val: unsafe { core::mem::transmute_copy(val) },
8278            simd: self,
8279        }
8280    }
8281    #[inline(always)]
8282    fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
8283        unsafe { core::mem::transmute::<[__m256i; 2usize], [i32; 16usize]>(a.val.0) }
8284    }
8285    #[inline(always)]
8286    fn as_array_ref_mask32x16(self, a: &mask32x16<Self>) -> &[i32; 16usize] {
8287        unsafe { core::mem::transmute::<&[__m256i; 2usize], &[i32; 16usize]>(&a.val.0) }
8288    }
8289    #[inline(always)]
8290    fn as_array_mut_mask32x16(self, a: &mut mask32x16<Self>) -> &mut [i32; 16usize] {
8291        unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [i32; 16usize]>(&mut a.val.0) }
8292    }
8293    #[inline(always)]
8294    fn store_array_mask32x16(self, a: mask32x16<Self>, dest: &mut [i32; 16usize]) -> () {
8295        unsafe {
8296            core::ptr::copy_nonoverlapping(
8297                (&raw const a.val.0) as *const i32,
8298                dest.as_mut_ptr(),
8299                16usize,
8300            );
8301        }
8302    }
8303    #[inline(always)]
8304    fn cvt_from_bytes_mask32x16(self, a: u8x64<Self>) -> mask32x16<Self> {
8305        unsafe {
8306            mask32x16 {
8307                val: core::mem::transmute(a.val),
8308                simd: self,
8309            }
8310        }
8311    }
8312    #[inline(always)]
8313    fn cvt_to_bytes_mask32x16(self, a: mask32x16<Self>) -> u8x64<Self> {
8314        unsafe {
8315            u8x64 {
8316                val: core::mem::transmute(a.val),
8317                simd: self,
8318            }
8319        }
8320    }
8321    #[inline(always)]
8322    fn slide_mask32x16<const SHIFT: usize>(
8323        self,
8324        a: mask32x16<Self>,
8325        b: mask32x16<Self>,
8326    ) -> mask32x16<Self> {
8327        unsafe {
8328            if SHIFT >= 16usize {
8329                return b;
8330            }
8331            let result = cross_block_alignr_256x2(
8332                self.cvt_to_bytes_mask32x16(b).val.0,
8333                self.cvt_to_bytes_mask32x16(a).val.0,
8334                SHIFT * 4usize,
8335            );
8336            self.cvt_from_bytes_mask32x16(u8x64 {
8337                val: crate::support::Aligned512(result),
8338                simd: self,
8339            })
8340        }
8341    }
8342    #[inline(always)]
8343    fn slide_within_blocks_mask32x16<const SHIFT: usize>(
8344        self,
8345        a: mask32x16<Self>,
8346        b: mask32x16<Self>,
8347    ) -> mask32x16<Self> {
8348        let (a0, a1) = self.split_mask32x16(a);
8349        let (b0, b1) = self.split_mask32x16(b);
8350        self.combine_mask32x8(
8351            self.slide_within_blocks_mask32x8::<SHIFT>(a0, b0),
8352            self.slide_within_blocks_mask32x8::<SHIFT>(a1, b1),
8353        )
8354    }
8355    #[inline(always)]
8356    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8357        let (a0, a1) = self.split_mask32x16(a);
8358        let (b0, b1) = self.split_mask32x16(b);
8359        self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
8360    }
8361    #[inline(always)]
8362    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8363        let (a0, a1) = self.split_mask32x16(a);
8364        let (b0, b1) = self.split_mask32x16(b);
8365        self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
8366    }
8367    #[inline(always)]
8368    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8369        let (a0, a1) = self.split_mask32x16(a);
8370        let (b0, b1) = self.split_mask32x16(b);
8371        self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
8372    }
8373    #[inline(always)]
8374    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
8375        let (a0, a1) = self.split_mask32x16(a);
8376        self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
8377    }
8378    #[inline(always)]
8379    fn select_mask32x16(
8380        self,
8381        a: mask32x16<Self>,
8382        b: mask32x16<Self>,
8383        c: mask32x16<Self>,
8384    ) -> mask32x16<Self> {
8385        let (a0, a1) = self.split_mask32x16(a);
8386        let (b0, b1) = self.split_mask32x16(b);
8387        let (c0, c1) = self.split_mask32x16(c);
8388        self.combine_mask32x8(
8389            self.select_mask32x8(a0, b0, c0),
8390            self.select_mask32x8(a1, b1, c1),
8391        )
8392    }
8393    #[inline(always)]
8394    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8395        let (a0, a1) = self.split_mask32x16(a);
8396        let (b0, b1) = self.split_mask32x16(b);
8397        self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
8398    }
8399    #[inline(always)]
8400    fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
8401        let (a0, a1) = self.split_mask32x16(a);
8402        self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1)
8403    }
8404    #[inline(always)]
8405    fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
8406        let (a0, a1) = self.split_mask32x16(a);
8407        self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1)
8408    }
8409    #[inline(always)]
8410    fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
8411        let (a0, a1) = self.split_mask32x16(a);
8412        self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1)
8413    }
8414    #[inline(always)]
8415    fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
8416        let (a0, a1) = self.split_mask32x16(a);
8417        self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1)
8418    }
8419    #[inline(always)]
8420    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
8421        (
8422            mask32x8 {
8423                val: crate::support::Aligned256(a.val.0[0]),
8424                simd: self,
8425            },
8426            mask32x8 {
8427                val: crate::support::Aligned256(a.val.0[1]),
8428                simd: self,
8429            },
8430        )
8431    }
8432    #[inline(always)]
8433    fn splat_f64x8(self, val: f64) -> f64x8<Self> {
8434        let half = self.splat_f64x4(val);
8435        self.combine_f64x4(half, half)
8436    }
8437    #[inline(always)]
8438    fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
8439        f64x8 {
8440            val: unsafe { core::mem::transmute_copy(&val) },
8441            simd: self,
8442        }
8443    }
8444    #[inline(always)]
8445    fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
8446        f64x8 {
8447            val: unsafe { core::mem::transmute_copy(val) },
8448            simd: self,
8449        }
8450    }
8451    #[inline(always)]
8452    fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
8453        unsafe { core::mem::transmute::<[__m256d; 2usize], [f64; 8usize]>(a.val.0) }
8454    }
8455    #[inline(always)]
8456    fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
8457        unsafe { core::mem::transmute::<&[__m256d; 2usize], &[f64; 8usize]>(&a.val.0) }
8458    }
8459    #[inline(always)]
8460    fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
8461        unsafe { core::mem::transmute::<&mut [__m256d; 2usize], &mut [f64; 8usize]>(&mut a.val.0) }
8462    }
8463    #[inline(always)]
8464    fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
8465        unsafe {
8466            core::ptr::copy_nonoverlapping(
8467                (&raw const a.val.0) as *const f64,
8468                dest.as_mut_ptr(),
8469                8usize,
8470            );
8471        }
8472    }
8473    #[inline(always)]
8474    fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
8475        unsafe {
8476            f64x8 {
8477                val: core::mem::transmute(a.val),
8478                simd: self,
8479            }
8480        }
8481    }
8482    #[inline(always)]
8483    fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
8484        unsafe {
8485            u8x64 {
8486                val: core::mem::transmute(a.val),
8487                simd: self,
8488            }
8489        }
8490    }
8491    #[inline(always)]
8492    fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8493        unsafe {
8494            if SHIFT >= 8usize {
8495                return b;
8496            }
8497            let result = cross_block_alignr_256x2(
8498                self.cvt_to_bytes_f64x8(b).val.0,
8499                self.cvt_to_bytes_f64x8(a).val.0,
8500                SHIFT * 8usize,
8501            );
8502            self.cvt_from_bytes_f64x8(u8x64 {
8503                val: crate::support::Aligned512(result),
8504                simd: self,
8505            })
8506        }
8507    }
8508    #[inline(always)]
8509    fn slide_within_blocks_f64x8<const SHIFT: usize>(
8510        self,
8511        a: f64x8<Self>,
8512        b: f64x8<Self>,
8513    ) -> f64x8<Self> {
8514        let (a0, a1) = self.split_f64x8(a);
8515        let (b0, b1) = self.split_f64x8(b);
8516        self.combine_f64x4(
8517            self.slide_within_blocks_f64x4::<SHIFT>(a0, b0),
8518            self.slide_within_blocks_f64x4::<SHIFT>(a1, b1),
8519        )
8520    }
8521    #[inline(always)]
8522    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8523        let (a0, a1) = self.split_f64x8(a);
8524        self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
8525    }
8526    #[inline(always)]
8527    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8528        let (a0, a1) = self.split_f64x8(a);
8529        self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
8530    }
8531    #[inline(always)]
8532    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8533        let (a0, a1) = self.split_f64x8(a);
8534        self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
8535    }
8536    #[inline(always)]
8537    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8538        let (a0, a1) = self.split_f64x8(a);
8539        let (b0, b1) = self.split_f64x8(b);
8540        self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
8541    }
8542    #[inline(always)]
8543    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8544        let (a0, a1) = self.split_f64x8(a);
8545        let (b0, b1) = self.split_f64x8(b);
8546        self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
8547    }
8548    #[inline(always)]
8549    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8550        let (a0, a1) = self.split_f64x8(a);
8551        let (b0, b1) = self.split_f64x8(b);
8552        self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
8553    }
8554    #[inline(always)]
8555    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8556        let (a0, a1) = self.split_f64x8(a);
8557        let (b0, b1) = self.split_f64x8(b);
8558        self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
8559    }
8560    #[inline(always)]
8561    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8562        let (a0, a1) = self.split_f64x8(a);
8563        let (b0, b1) = self.split_f64x8(b);
8564        self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
8565    }
8566    #[inline(always)]
8567    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8568        let (a0, a1) = self.split_f64x8(a);
8569        let (b0, b1) = self.split_f64x8(b);
8570        self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
8571    }
8572    #[inline(always)]
8573    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8574        let (a0, a1) = self.split_f64x8(a);
8575        let (b0, b1) = self.split_f64x8(b);
8576        self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
8577    }
8578    #[inline(always)]
8579    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8580        let (a0, a1) = self.split_f64x8(a);
8581        let (b0, b1) = self.split_f64x8(b);
8582        self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
8583    }
8584    #[inline(always)]
8585    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8586        let (a0, a1) = self.split_f64x8(a);
8587        let (b0, b1) = self.split_f64x8(b);
8588        self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
8589    }
8590    #[inline(always)]
8591    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8592        let (a0, a1) = self.split_f64x8(a);
8593        let (b0, b1) = self.split_f64x8(b);
8594        self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
8595    }
8596    #[inline(always)]
8597    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8598        let (a0, _) = self.split_f64x8(a);
8599        let (b0, _) = self.split_f64x8(b);
8600        self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
8601    }
8602    #[inline(always)]
8603    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8604        let (_, a1) = self.split_f64x8(a);
8605        let (_, b1) = self.split_f64x8(b);
8606        self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
8607    }
8608    #[inline(always)]
8609    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8610        let (a0, a1) = self.split_f64x8(a);
8611        let (b0, b1) = self.split_f64x8(b);
8612        self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
8613    }
8614    #[inline(always)]
8615    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8616        let (a0, a1) = self.split_f64x8(a);
8617        let (b0, b1) = self.split_f64x8(b);
8618        self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
8619    }
8620    #[inline(always)]
8621    fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
8622        let (a0, a1) = self.split_f64x8(a);
8623        let (b0, b1) = self.split_f64x8(b);
8624        let lo_lo = self.zip_low_f64x4(a0, b0);
8625        let lo_hi = self.zip_high_f64x4(a0, b0);
8626        let hi_lo = self.zip_low_f64x4(a1, b1);
8627        let hi_hi = self.zip_high_f64x4(a1, b1);
8628        (
8629            self.combine_f64x4(lo_lo, lo_hi),
8630            self.combine_f64x4(hi_lo, hi_hi),
8631        )
8632    }
8633    #[inline(always)]
8634    fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
8635        let (a0, a1) = self.split_f64x8(a);
8636        let (b0, b1) = self.split_f64x8(b);
8637        let lo_even = self.unzip_low_f64x4(a0, a1);
8638        let lo_odd = self.unzip_high_f64x4(a0, a1);
8639        let hi_even = self.unzip_low_f64x4(b0, b1);
8640        let hi_odd = self.unzip_high_f64x4(b0, b1);
8641        (
8642            self.combine_f64x4(lo_even, hi_even),
8643            self.combine_f64x4(lo_odd, hi_odd),
8644        )
8645    }
8646    #[inline(always)]
8647    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8648        let (a0, a1) = self.split_f64x8(a);
8649        let (b0, b1) = self.split_f64x8(b);
8650        self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
8651    }
8652    #[inline(always)]
8653    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8654        let (a0, a1) = self.split_f64x8(a);
8655        let (b0, b1) = self.split_f64x8(b);
8656        self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
8657    }
8658    #[inline(always)]
8659    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8660        let (a0, a1) = self.split_f64x8(a);
8661        let (b0, b1) = self.split_f64x8(b);
8662        self.combine_f64x4(
8663            self.max_precise_f64x4(a0, b0),
8664            self.max_precise_f64x4(a1, b1),
8665        )
8666    }
8667    #[inline(always)]
8668    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8669        let (a0, a1) = self.split_f64x8(a);
8670        let (b0, b1) = self.split_f64x8(b);
8671        self.combine_f64x4(
8672            self.min_precise_f64x4(a0, b0),
8673            self.min_precise_f64x4(a1, b1),
8674        )
8675    }
8676    #[inline(always)]
8677    fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
8678        let (a0, a1) = self.split_f64x8(a);
8679        let (b0, b1) = self.split_f64x8(b);
8680        let (c0, c1) = self.split_f64x8(c);
8681        self.combine_f64x4(
8682            self.mul_add_f64x4(a0, b0, c0),
8683            self.mul_add_f64x4(a1, b1, c1),
8684        )
8685    }
8686    #[inline(always)]
8687    fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
8688        let (a0, a1) = self.split_f64x8(a);
8689        let (b0, b1) = self.split_f64x8(b);
8690        let (c0, c1) = self.split_f64x8(c);
8691        self.combine_f64x4(
8692            self.mul_sub_f64x4(a0, b0, c0),
8693            self.mul_sub_f64x4(a1, b1, c1),
8694        )
8695    }
8696    #[inline(always)]
8697    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8698        let (a0, a1) = self.split_f64x8(a);
8699        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
8700    }
8701    #[inline(always)]
8702    fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8703        let (a0, a1) = self.split_f64x8(a);
8704        self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1))
8705    }
8706    #[inline(always)]
8707    fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8708        let (a0, a1) = self.split_f64x8(a);
8709        self.combine_f64x4(
8710            self.round_ties_even_f64x4(a0),
8711            self.round_ties_even_f64x4(a1),
8712        )
8713    }
8714    #[inline(always)]
8715    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8716        let (a0, a1) = self.split_f64x8(a);
8717        self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
8718    }
8719    #[inline(always)]
8720    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8721        let (a0, a1) = self.split_f64x8(a);
8722        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
8723    }
8724    #[inline(always)]
8725    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
8726        let (a0, a1) = self.split_mask64x8(a);
8727        let (b0, b1) = self.split_f64x8(b);
8728        let (c0, c1) = self.split_f64x8(c);
8729        self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
8730    }
8731    #[inline(always)]
8732    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
8733        (
8734            f64x4 {
8735                val: crate::support::Aligned256(a.val.0[0]),
8736                simd: self,
8737            },
8738            f64x4 {
8739                val: crate::support::Aligned256(a.val.0[1]),
8740                simd: self,
8741            },
8742        )
8743    }
8744    #[inline(always)]
8745    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
8746        let (a0, a1) = self.split_f64x8(a);
8747        self.combine_f32x8(
8748            self.reinterpret_f32_f64x4(a0),
8749            self.reinterpret_f32_f64x4(a1),
8750        )
8751    }
8752    #[inline(always)]
8753    fn splat_mask64x8(self, val: i64) -> mask64x8<Self> {
8754        let half = self.splat_mask64x4(val);
8755        self.combine_mask64x4(half, half)
8756    }
8757    #[inline(always)]
8758    fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
8759        mask64x8 {
8760            val: unsafe { core::mem::transmute_copy(&val) },
8761            simd: self,
8762        }
8763    }
8764    #[inline(always)]
8765    fn load_array_ref_mask64x8(self, val: &[i64; 8usize]) -> mask64x8<Self> {
8766        mask64x8 {
8767            val: unsafe { core::mem::transmute_copy(val) },
8768            simd: self,
8769        }
8770    }
8771    #[inline(always)]
8772    fn as_array_mask64x8(self, a: mask64x8<Self>) -> [i64; 8usize] {
8773        unsafe { core::mem::transmute::<[__m256i; 2usize], [i64; 8usize]>(a.val.0) }
8774    }
8775    #[inline(always)]
8776    fn as_array_ref_mask64x8(self, a: &mask64x8<Self>) -> &[i64; 8usize] {
8777        unsafe { core::mem::transmute::<&[__m256i; 2usize], &[i64; 8usize]>(&a.val.0) }
8778    }
8779    #[inline(always)]
8780    fn as_array_mut_mask64x8(self, a: &mut mask64x8<Self>) -> &mut [i64; 8usize] {
8781        unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [i64; 8usize]>(&mut a.val.0) }
8782    }
8783    #[inline(always)]
8784    fn store_array_mask64x8(self, a: mask64x8<Self>, dest: &mut [i64; 8usize]) -> () {
8785        unsafe {
8786            core::ptr::copy_nonoverlapping(
8787                (&raw const a.val.0) as *const i64,
8788                dest.as_mut_ptr(),
8789                8usize,
8790            );
8791        }
8792    }
8793    #[inline(always)]
8794    fn cvt_from_bytes_mask64x8(self, a: u8x64<Self>) -> mask64x8<Self> {
8795        unsafe {
8796            mask64x8 {
8797                val: core::mem::transmute(a.val),
8798                simd: self,
8799            }
8800        }
8801    }
8802    #[inline(always)]
8803    fn cvt_to_bytes_mask64x8(self, a: mask64x8<Self>) -> u8x64<Self> {
8804        unsafe {
8805            u8x64 {
8806                val: core::mem::transmute(a.val),
8807                simd: self,
8808            }
8809        }
8810    }
8811    #[inline(always)]
8812    fn slide_mask64x8<const SHIFT: usize>(
8813        self,
8814        a: mask64x8<Self>,
8815        b: mask64x8<Self>,
8816    ) -> mask64x8<Self> {
8817        unsafe {
8818            if SHIFT >= 8usize {
8819                return b;
8820            }
8821            let result = cross_block_alignr_256x2(
8822                self.cvt_to_bytes_mask64x8(b).val.0,
8823                self.cvt_to_bytes_mask64x8(a).val.0,
8824                SHIFT * 8usize,
8825            );
8826            self.cvt_from_bytes_mask64x8(u8x64 {
8827                val: crate::support::Aligned512(result),
8828                simd: self,
8829            })
8830        }
8831    }
8832    #[inline(always)]
8833    fn slide_within_blocks_mask64x8<const SHIFT: usize>(
8834        self,
8835        a: mask64x8<Self>,
8836        b: mask64x8<Self>,
8837    ) -> mask64x8<Self> {
8838        let (a0, a1) = self.split_mask64x8(a);
8839        let (b0, b1) = self.split_mask64x8(b);
8840        self.combine_mask64x4(
8841            self.slide_within_blocks_mask64x4::<SHIFT>(a0, b0),
8842            self.slide_within_blocks_mask64x4::<SHIFT>(a1, b1),
8843        )
8844    }
8845    #[inline(always)]
8846    fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8847        let (a0, a1) = self.split_mask64x8(a);
8848        let (b0, b1) = self.split_mask64x8(b);
8849        self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1))
8850    }
8851    #[inline(always)]
8852    fn or_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8853        let (a0, a1) = self.split_mask64x8(a);
8854        let (b0, b1) = self.split_mask64x8(b);
8855        self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, b1))
8856    }
8857    #[inline(always)]
8858    fn xor_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8859        let (a0, a1) = self.split_mask64x8(a);
8860        let (b0, b1) = self.split_mask64x8(b);
8861        self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, b1))
8862    }
8863    #[inline(always)]
8864    fn not_mask64x8(self, a: mask64x8<Self>) -> mask64x8<Self> {
8865        let (a0, a1) = self.split_mask64x8(a);
8866        self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1))
8867    }
8868    #[inline(always)]
8869    fn select_mask64x8(
8870        self,
8871        a: mask64x8<Self>,
8872        b: mask64x8<Self>,
8873        c: mask64x8<Self>,
8874    ) -> mask64x8<Self> {
8875        let (a0, a1) = self.split_mask64x8(a);
8876        let (b0, b1) = self.split_mask64x8(b);
8877        let (c0, c1) = self.split_mask64x8(c);
8878        self.combine_mask64x4(
8879            self.select_mask64x4(a0, b0, c0),
8880            self.select_mask64x4(a1, b1, c1),
8881        )
8882    }
8883    #[inline(always)]
8884    fn simd_eq_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8885        let (a0, a1) = self.split_mask64x8(a);
8886        let (b0, b1) = self.split_mask64x8(b);
8887        self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1))
8888    }
8889    #[inline(always)]
8890    fn any_true_mask64x8(self, a: mask64x8<Self>) -> bool {
8891        let (a0, a1) = self.split_mask64x8(a);
8892        self.any_true_mask64x4(a0) || self.any_true_mask64x4(a1)
8893    }
8894    #[inline(always)]
8895    fn all_true_mask64x8(self, a: mask64x8<Self>) -> bool {
8896        let (a0, a1) = self.split_mask64x8(a);
8897        self.all_true_mask64x4(a0) && self.all_true_mask64x4(a1)
8898    }
8899    #[inline(always)]
8900    fn any_false_mask64x8(self, a: mask64x8<Self>) -> bool {
8901        let (a0, a1) = self.split_mask64x8(a);
8902        self.any_false_mask64x4(a0) || self.any_false_mask64x4(a1)
8903    }
8904    #[inline(always)]
8905    fn all_false_mask64x8(self, a: mask64x8<Self>) -> bool {
8906        let (a0, a1) = self.split_mask64x8(a);
8907        self.all_false_mask64x4(a0) && self.all_false_mask64x4(a1)
8908    }
8909    #[inline(always)]
8910    fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) {
8911        (
8912            mask64x4 {
8913                val: crate::support::Aligned256(a.val.0[0]),
8914                simd: self,
8915            },
8916            mask64x4 {
8917                val: crate::support::Aligned256(a.val.0[1]),
8918                simd: self,
8919            },
8920        )
8921    }
8922}
8923impl<S: Simd> SimdFrom<__m256, S> for f32x8<S> {
8924    #[inline(always)]
8925    fn simd_from(simd: S, arch: __m256) -> Self {
8926        Self {
8927            val: unsafe { core::mem::transmute_copy(&arch) },
8928            simd,
8929        }
8930    }
8931}
8932impl<S: Simd> From<f32x8<S>> for __m256 {
8933    #[inline(always)]
8934    fn from(value: f32x8<S>) -> Self {
8935        unsafe { core::mem::transmute_copy(&value.val) }
8936    }
8937}
8938impl<S: Simd> SimdFrom<__m256i, S> for i8x32<S> {
8939    #[inline(always)]
8940    fn simd_from(simd: S, arch: __m256i) -> Self {
8941        Self {
8942            val: unsafe { core::mem::transmute_copy(&arch) },
8943            simd,
8944        }
8945    }
8946}
8947impl<S: Simd> From<i8x32<S>> for __m256i {
8948    #[inline(always)]
8949    fn from(value: i8x32<S>) -> Self {
8950        unsafe { core::mem::transmute_copy(&value.val) }
8951    }
8952}
8953impl<S: Simd> SimdFrom<__m256i, S> for u8x32<S> {
8954    #[inline(always)]
8955    fn simd_from(simd: S, arch: __m256i) -> Self {
8956        Self {
8957            val: unsafe { core::mem::transmute_copy(&arch) },
8958            simd,
8959        }
8960    }
8961}
8962impl<S: Simd> From<u8x32<S>> for __m256i {
8963    #[inline(always)]
8964    fn from(value: u8x32<S>) -> Self {
8965        unsafe { core::mem::transmute_copy(&value.val) }
8966    }
8967}
8968impl<S: Simd> SimdFrom<__m256i, S> for mask8x32<S> {
8969    #[inline(always)]
8970    fn simd_from(simd: S, arch: __m256i) -> Self {
8971        Self {
8972            val: unsafe { core::mem::transmute_copy(&arch) },
8973            simd,
8974        }
8975    }
8976}
8977impl<S: Simd> From<mask8x32<S>> for __m256i {
8978    #[inline(always)]
8979    fn from(value: mask8x32<S>) -> Self {
8980        unsafe { core::mem::transmute_copy(&value.val) }
8981    }
8982}
8983impl<S: Simd> SimdFrom<__m256i, S> for i16x16<S> {
8984    #[inline(always)]
8985    fn simd_from(simd: S, arch: __m256i) -> Self {
8986        Self {
8987            val: unsafe { core::mem::transmute_copy(&arch) },
8988            simd,
8989        }
8990    }
8991}
8992impl<S: Simd> From<i16x16<S>> for __m256i {
8993    #[inline(always)]
8994    fn from(value: i16x16<S>) -> Self {
8995        unsafe { core::mem::transmute_copy(&value.val) }
8996    }
8997}
8998impl<S: Simd> SimdFrom<__m256i, S> for u16x16<S> {
8999    #[inline(always)]
9000    fn simd_from(simd: S, arch: __m256i) -> Self {
9001        Self {
9002            val: unsafe { core::mem::transmute_copy(&arch) },
9003            simd,
9004        }
9005    }
9006}
9007impl<S: Simd> From<u16x16<S>> for __m256i {
9008    #[inline(always)]
9009    fn from(value: u16x16<S>) -> Self {
9010        unsafe { core::mem::transmute_copy(&value.val) }
9011    }
9012}
9013impl<S: Simd> SimdFrom<__m256i, S> for mask16x16<S> {
9014    #[inline(always)]
9015    fn simd_from(simd: S, arch: __m256i) -> Self {
9016        Self {
9017            val: unsafe { core::mem::transmute_copy(&arch) },
9018            simd,
9019        }
9020    }
9021}
9022impl<S: Simd> From<mask16x16<S>> for __m256i {
9023    #[inline(always)]
9024    fn from(value: mask16x16<S>) -> Self {
9025        unsafe { core::mem::transmute_copy(&value.val) }
9026    }
9027}
9028impl<S: Simd> SimdFrom<__m256i, S> for i32x8<S> {
9029    #[inline(always)]
9030    fn simd_from(simd: S, arch: __m256i) -> Self {
9031        Self {
9032            val: unsafe { core::mem::transmute_copy(&arch) },
9033            simd,
9034        }
9035    }
9036}
9037impl<S: Simd> From<i32x8<S>> for __m256i {
9038    #[inline(always)]
9039    fn from(value: i32x8<S>) -> Self {
9040        unsafe { core::mem::transmute_copy(&value.val) }
9041    }
9042}
9043impl<S: Simd> SimdFrom<__m256i, S> for u32x8<S> {
9044    #[inline(always)]
9045    fn simd_from(simd: S, arch: __m256i) -> Self {
9046        Self {
9047            val: unsafe { core::mem::transmute_copy(&arch) },
9048            simd,
9049        }
9050    }
9051}
9052impl<S: Simd> From<u32x8<S>> for __m256i {
9053    #[inline(always)]
9054    fn from(value: u32x8<S>) -> Self {
9055        unsafe { core::mem::transmute_copy(&value.val) }
9056    }
9057}
9058impl<S: Simd> SimdFrom<__m256i, S> for mask32x8<S> {
9059    #[inline(always)]
9060    fn simd_from(simd: S, arch: __m256i) -> Self {
9061        Self {
9062            val: unsafe { core::mem::transmute_copy(&arch) },
9063            simd,
9064        }
9065    }
9066}
9067impl<S: Simd> From<mask32x8<S>> for __m256i {
9068    #[inline(always)]
9069    fn from(value: mask32x8<S>) -> Self {
9070        unsafe { core::mem::transmute_copy(&value.val) }
9071    }
9072}
9073impl<S: Simd> SimdFrom<__m256d, S> for f64x4<S> {
9074    #[inline(always)]
9075    fn simd_from(simd: S, arch: __m256d) -> Self {
9076        Self {
9077            val: unsafe { core::mem::transmute_copy(&arch) },
9078            simd,
9079        }
9080    }
9081}
9082impl<S: Simd> From<f64x4<S>> for __m256d {
9083    #[inline(always)]
9084    fn from(value: f64x4<S>) -> Self {
9085        unsafe { core::mem::transmute_copy(&value.val) }
9086    }
9087}
9088impl<S: Simd> SimdFrom<__m256i, S> for mask64x4<S> {
9089    #[inline(always)]
9090    fn simd_from(simd: S, arch: __m256i) -> Self {
9091        Self {
9092            val: unsafe { core::mem::transmute_copy(&arch) },
9093            simd,
9094        }
9095    }
9096}
9097impl<S: Simd> From<mask64x4<S>> for __m256i {
9098    #[inline(always)]
9099    fn from(value: mask64x4<S>) -> Self {
9100        unsafe { core::mem::transmute_copy(&value.val) }
9101    }
9102}
/// `_mm_alignr_epi8` with the byte shift passed as a runtime value.
///
/// The intrinsic only accepts its shift as a const generic, and Rust does not
/// currently allow arithmetic on const generics, so this wrapper dispatches on
/// `shift` to the matching instantiation. Callers are expected to pass a value
/// that is constant in practice, letting the optimizer remove the `match`.
///
/// # Panics
///
/// Hits `unreachable!()` when `shift` is greater than 15.
///
/// # Safety
///
/// The CPU must support the instruction behind `_mm_alignr_epi8` (SSSE3).
#[inline(always)]
unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i {
    // Expands to one `match` arm per listed literal; the literal is used both
    // as the `usize` pattern and as the intrinsic's `i32` const argument.
    macro_rules! alignr_by {
        ($($bytes:literal)+) => {
            match shift {
                $($bytes => _mm_alignr_epi8::<{ $bytes }>(a, b),)+
                _ => unreachable!(),
            }
        };
    }
    // SAFETY: the caller guarantees the required CPU feature is available.
    unsafe { alignr_by!(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) }
}
/// Runtime-dispatched wrapper around `_mm256_alignr_epi8`, whose byte shift is normally a const
/// generic.
///
/// `shift` is expected to be a compile-time constant at every call site, so after inlining the
/// `match` collapses to a single intrinsic call. The wrapper exists because Rust does not yet
/// allow arithmetic on const generic parameters. As with the underlying intrinsic, the shift is
/// applied to each 128-bit half of the registers independently.
///
/// # Panics
///
/// Panics (via `unreachable!()`) if `shift` is not in `0..=15`.
///
/// # Safety
///
/// The CPU must support the instruction set required by `_mm256_alignr_epi8` (AVX2).
#[inline(always)]
unsafe fn dyn_alignr_256(a: __m256i, b: __m256i, shift: usize) -> __m256i {
    // Expands to one `match` arm per listed byte count, each forwarding to the
    // intrinsic with that count as its const generic argument.
    macro_rules! alignr_by {
        ($($bytes:literal)*) => {
            match shift {
                $($bytes => _mm256_alignr_epi8::<{ $bytes }>(a, b),)*
                _ => unreachable!(),
            }
        };
    }
    unsafe { alignr_by!(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) }
}
9157#[doc = r" Computes one output __m256i for `cross_block_alignr_*` operations."]
9158#[doc = r""]
9159#[doc = r" Given an array of registers, each containing two 128-bit blocks, extracts two adjacent blocks (`lo_idx` and"]
9160#[doc = r" `hi_idx` = `lo_idx + 1`) and performs `alignr` with `intra_shift`."]
9161#[inline(always)]
9162unsafe fn cross_block_alignr_one(
9163    regs: &[__m256i],
9164    block_idx: usize,
9165    shift_bytes: usize,
9166) -> __m256i {
9167    let lo_idx = block_idx + (shift_bytes / 16);
9168    let intra_shift = shift_bytes % 16;
9169    let lo_blocks = if lo_idx & 1 == 0 {
9170        regs[lo_idx / 2]
9171    } else {
9172        unsafe { _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1]) }
9173    };
9174    let hi_idx = lo_idx + 1;
9175    let hi_blocks = if hi_idx & 1 == 0 {
9176        regs[hi_idx / 2]
9177    } else {
9178        unsafe { _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1]) }
9179    };
9180    unsafe { dyn_alignr_256(hi_blocks, lo_blocks, intra_shift) }
9181}
9182#[doc = r" Concatenates `b` and `a` (each 2 x __m256i = 4 blocks) and extracts 4 blocks starting at byte offset"]
9183#[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."]
9184#[inline(always)]
9185unsafe fn cross_block_alignr_256x2(
9186    a: [__m256i; 2],
9187    b: [__m256i; 2],
9188    shift_bytes: usize,
9189) -> [__m256i; 2] {
9190    let regs = [b[0], b[1], a[0], a[1]];
9191    unsafe {
9192        [
9193            cross_block_alignr_one(&regs, 0, shift_bytes),
9194            cross_block_alignr_one(&regs, 2, shift_bytes),
9195        ]
9196    }
9197}
9198#[doc = r" Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset"]
9199#[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."]
9200#[inline(always)]
9201unsafe fn cross_block_alignr_256x1(a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i {
9202    let regs = [b, a];
9203    unsafe { cross_block_alignr_one(&regs, 0, shift_bytes) }
9204}