Skip to main content

fearless_simd/generated/
sse4_2.rs

1// Copyright 2025 the Fearless_SIMD Authors
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4// This file is autogenerated by fearless_simd_gen
5
6use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
7use crate::{
8    f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
9    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
10    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
11    u32x4, u32x8, u32x16,
12};
13#[cfg(target_arch = "x86")]
14use core::arch::x86::*;
15#[cfg(target_arch = "x86_64")]
16use core::arch::x86_64::*;
17#[doc = "The SIMD token for the x86-64-v2 level."]
18#[derive(Clone, Copy, Debug)]
19pub struct Sse4_2 {
20    pub sse4_2: crate::core_arch::x86::Sse4_2,
21}
22impl Sse4_2 {
23    #[doc = r" Create a SIMD token."]
24    #[doc = r""]
25    #[doc = r" # Safety"]
26    #[doc = r""]
27    #[doc = r" The `sse4.2`, `cmpxchg16b`, and `popcnt` CPU features must"]
28    #[doc = r" be available."]
29    #[inline]
30    pub const unsafe fn new_unchecked() -> Self {
31        Sse4_2 {
32            sse4_2: unsafe { crate::core_arch::x86::Sse4_2::new_unchecked() },
33        }
34    }
35}
36impl Seal for Sse4_2 {}
37impl ArchTypes for Sse4_2 {
38    type f32x4 = crate::support::Aligned128<__m128>;
39    type i8x16 = crate::support::Aligned128<__m128i>;
40    type u8x16 = crate::support::Aligned128<__m128i>;
41    type mask8x16 = crate::support::Aligned128<__m128i>;
42    type i16x8 = crate::support::Aligned128<__m128i>;
43    type u16x8 = crate::support::Aligned128<__m128i>;
44    type mask16x8 = crate::support::Aligned128<__m128i>;
45    type i32x4 = crate::support::Aligned128<__m128i>;
46    type u32x4 = crate::support::Aligned128<__m128i>;
47    type mask32x4 = crate::support::Aligned128<__m128i>;
48    type f64x2 = crate::support::Aligned128<__m128d>;
49    type mask64x2 = crate::support::Aligned128<__m128i>;
50    type f32x8 = crate::support::Aligned256<[__m128; 2usize]>;
51    type i8x32 = crate::support::Aligned256<[__m128i; 2usize]>;
52    type u8x32 = crate::support::Aligned256<[__m128i; 2usize]>;
53    type mask8x32 = crate::support::Aligned256<[__m128i; 2usize]>;
54    type i16x16 = crate::support::Aligned256<[__m128i; 2usize]>;
55    type u16x16 = crate::support::Aligned256<[__m128i; 2usize]>;
56    type mask16x16 = crate::support::Aligned256<[__m128i; 2usize]>;
57    type i32x8 = crate::support::Aligned256<[__m128i; 2usize]>;
58    type u32x8 = crate::support::Aligned256<[__m128i; 2usize]>;
59    type mask32x8 = crate::support::Aligned256<[__m128i; 2usize]>;
60    type f64x4 = crate::support::Aligned256<[__m128d; 2usize]>;
61    type mask64x4 = crate::support::Aligned256<[__m128i; 2usize]>;
62    type f32x16 = crate::support::Aligned512<[__m128; 4usize]>;
63    type i8x64 = crate::support::Aligned512<[__m128i; 4usize]>;
64    type u8x64 = crate::support::Aligned512<[__m128i; 4usize]>;
65    type mask8x64 = crate::support::Aligned512<[__m128i; 4usize]>;
66    type i16x32 = crate::support::Aligned512<[__m128i; 4usize]>;
67    type u16x32 = crate::support::Aligned512<[__m128i; 4usize]>;
68    type mask16x32 = crate::support::Aligned512<[__m128i; 4usize]>;
69    type i32x16 = crate::support::Aligned512<[__m128i; 4usize]>;
70    type u32x16 = crate::support::Aligned512<[__m128i; 4usize]>;
71    type mask32x16 = crate::support::Aligned512<[__m128i; 4usize]>;
72    type f64x8 = crate::support::Aligned512<[__m128d; 4usize]>;
73    type mask64x8 = crate::support::Aligned512<[__m128i; 4usize]>;
74}
75impl Simd for Sse4_2 {
76    type f32s = f32x4<Self>;
77    type f64s = f64x2<Self>;
78    type u8s = u8x16<Self>;
79    type i8s = i8x16<Self>;
80    type u16s = u16x8<Self>;
81    type i16s = i16x8<Self>;
82    type u32s = u32x4<Self>;
83    type i32s = i32x4<Self>;
84    type mask8s = mask8x16<Self>;
85    type mask16s = mask16x8<Self>;
86    type mask32s = mask32x4<Self>;
87    type mask64s = mask64x2<Self>;
88    #[inline(always)]
89    fn level(self) -> Level {
90        #[cfg(not(all(
91            target_feature = "avx2",
92            target_feature = "bmi1",
93            target_feature = "bmi2",
94            target_feature = "cmpxchg16b",
95            target_feature = "f16c",
96            target_feature = "fma",
97            target_feature = "lzcnt",
98            target_feature = "movbe",
99            target_feature = "popcnt",
100            target_feature = "xsave"
101        )))]
102        return Level::Sse4_2(self);
103        #[cfg(all(
104            target_feature = "avx2",
105            target_feature = "bmi1",
106            target_feature = "bmi2",
107            target_feature = "cmpxchg16b",
108            target_feature = "f16c",
109            target_feature = "fma",
110            target_feature = "lzcnt",
111            target_feature = "movbe",
112            target_feature = "popcnt",
113            target_feature = "xsave"
114        ))]
115        {
116            Level::baseline()
117        }
118    }
119    #[inline]
120    fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R {
121        #[target_feature(enable = "sse4.2,cmpxchg16b,popcnt")]
122        unsafe fn vectorize_sse4_2<F: FnOnce() -> R, R>(f: F) -> R {
123            f()
124        }
125        unsafe { vectorize_sse4_2(f) }
126    }
127    #[inline(always)]
128    fn splat_f32x4(self, val: f32) -> f32x4<Self> {
129        unsafe { _mm_set1_ps(val).simd_into(self) }
130    }
131    #[inline(always)]
132    fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
133        f32x4 {
134            val: unsafe { core::mem::transmute_copy(&val) },
135            simd: self,
136        }
137    }
138    #[inline(always)]
139    fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
140        f32x4 {
141            val: unsafe { core::mem::transmute_copy(val) },
142            simd: self,
143        }
144    }
145    #[inline(always)]
146    fn as_array_f32x4(self, a: f32x4<Self>) -> [f32; 4usize] {
147        unsafe { core::mem::transmute::<__m128, [f32; 4usize]>(a.val.0) }
148    }
149    #[inline(always)]
150    fn as_array_ref_f32x4(self, a: &f32x4<Self>) -> &[f32; 4usize] {
151        unsafe { core::mem::transmute::<&__m128, &[f32; 4usize]>(&a.val.0) }
152    }
153    #[inline(always)]
154    fn as_array_mut_f32x4(self, a: &mut f32x4<Self>) -> &mut [f32; 4usize] {
155        unsafe { core::mem::transmute::<&mut __m128, &mut [f32; 4usize]>(&mut a.val.0) }
156    }
157    #[inline(always)]
158    fn store_array_f32x4(self, a: f32x4<Self>, dest: &mut [f32; 4usize]) -> () {
159        unsafe {
160            core::ptr::copy_nonoverlapping(
161                (&raw const a.val.0) as *const f32,
162                dest.as_mut_ptr(),
163                4usize,
164            );
165        }
166    }
167    #[inline(always)]
168    fn cvt_from_bytes_f32x4(self, a: u8x16<Self>) -> f32x4<Self> {
169        unsafe {
170            f32x4 {
171                val: core::mem::transmute(a.val),
172                simd: self,
173            }
174        }
175    }
176    #[inline(always)]
177    fn cvt_to_bytes_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
178        unsafe {
179            u8x16 {
180                val: core::mem::transmute(a.val),
181                simd: self,
182            }
183        }
184    }
185    #[inline(always)]
186    fn slide_f32x4<const SHIFT: usize>(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
187        unsafe {
188            if SHIFT >= 4usize {
189                return b;
190            }
191            let result = dyn_alignr_128(
192                self.cvt_to_bytes_f32x4(b).val.0,
193                self.cvt_to_bytes_f32x4(a).val.0,
194                SHIFT * 4usize,
195            );
196            self.cvt_from_bytes_f32x4(u8x16 {
197                val: crate::support::Aligned128(result),
198                simd: self,
199            })
200        }
201    }
202    #[inline(always)]
203    fn slide_within_blocks_f32x4<const SHIFT: usize>(
204        self,
205        a: f32x4<Self>,
206        b: f32x4<Self>,
207    ) -> f32x4<Self> {
208        self.slide_f32x4::<SHIFT>(a, b)
209    }
210    #[inline(always)]
211    fn abs_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
212        unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) }
213    }
214    #[inline(always)]
215    fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
216        unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) }
217    }
218    #[inline(always)]
219    fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
220        unsafe { _mm_sqrt_ps(a.into()).simd_into(self) }
221    }
222    #[inline(always)]
223    fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
224        unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) }
225    }
226    #[inline(always)]
227    fn sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
228        unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) }
229    }
230    #[inline(always)]
231    fn mul_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
232        unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) }
233    }
234    #[inline(always)]
235    fn div_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
236        unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) }
237    }
238    #[inline(always)]
239    fn copysign_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
240        unsafe {
241            let mask = _mm_set1_ps(-0.0);
242            _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self)
243        }
244    }
245    #[inline(always)]
246    fn simd_eq_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
247        unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) }
248    }
249    #[inline(always)]
250    fn simd_lt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
251        unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) }
252    }
253    #[inline(always)]
254    fn simd_le_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
255        unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) }
256    }
257    #[inline(always)]
258    fn simd_ge_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
259        unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) }
260    }
261    #[inline(always)]
262    fn simd_gt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
263        unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) }
264    }
265    #[inline(always)]
266    fn zip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
267        unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) }
268    }
269    #[inline(always)]
270    fn zip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
271        unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) }
272    }
273    #[inline(always)]
274    fn unzip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
275        unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) }
276    }
277    #[inline(always)]
278    fn unzip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
279        unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) }
280    }
281    #[inline(always)]
282    fn interleave_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> (f32x4<Self>, f32x4<Self>) {
283        (self.zip_low_f32x4(a, b), self.zip_high_f32x4(a, b))
284    }
285    #[inline(always)]
286    fn deinterleave_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> (f32x4<Self>, f32x4<Self>) {
287        (self.unzip_low_f32x4(a, b), self.unzip_high_f32x4(a, b))
288    }
289    #[inline(always)]
290    fn max_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
291        unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
292    }
293    #[inline(always)]
294    fn min_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
295        unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
296    }
297    #[inline(always)]
298    fn max_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
299        unsafe {
300            let intermediate = _mm_max_ps(a.into(), b.into());
301            let b_is_nan = _mm_cmpunord_ps(b.into(), b.into());
302            _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self)
303        }
304    }
305    #[inline(always)]
306    fn min_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
307        unsafe {
308            let intermediate = _mm_min_ps(a.into(), b.into());
309            let b_is_nan = _mm_cmpunord_ps(b.into(), b.into());
310            _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self)
311        }
312    }
313    #[inline(always)]
314    fn mul_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
315        a * b + c
316    }
317    #[inline(always)]
318    fn mul_sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
319        a * b - c
320    }
321    #[inline(always)]
322    fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
323        unsafe {
324            _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
325        }
326    }
327    #[inline(always)]
328    fn ceil_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
329        unsafe {
330            _mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
331        }
332    }
333    #[inline(always)]
334    fn round_ties_even_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
335        unsafe {
336            _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
337                .simd_into(self)
338        }
339    }
340    #[inline(always)]
341    fn fract_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
342        a - self.trunc_f32x4(a)
343    }
344    #[inline(always)]
345    fn trunc_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
346        unsafe {
347            _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
348        }
349    }
350    #[inline(always)]
351    fn select_f32x4(self, a: mask32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
352        unsafe { _mm_blendv_ps(c.into(), b.into(), _mm_castsi128_ps(a.into())).simd_into(self) }
353    }
354    #[inline(always)]
355    fn combine_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x8<Self> {
356        f32x8 {
357            val: crate::support::Aligned256([a.val.0, b.val.0]),
358            simd: self,
359        }
360    }
361    #[inline(always)]
362    fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> {
363        unsafe { _mm_castps_pd(a.into()).simd_into(self) }
364    }
365    #[inline(always)]
366    fn reinterpret_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
367        unsafe { _mm_castps_si128(a.into()).simd_into(self) }
368    }
369    #[inline(always)]
370    fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
371        unsafe { _mm_castps_si128(a.into()).simd_into(self) }
372    }
373    #[inline(always)]
374    fn reinterpret_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
375        unsafe { _mm_castps_si128(a.into()).simd_into(self) }
376    }
377    #[inline(always)]
378    fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
379        unsafe {
380            let mut converted = _mm_cvttps_epi32(a.into());
381            let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0));
382            let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
383            if !all_in_range {
384                let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0));
385                let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess));
386                converted = _mm_add_epi32(converted, excess_converted);
387            }
388            converted.simd_into(self)
389        }
390    }
391    #[inline(always)]
392    fn cvt_u32_precise_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
393        unsafe {
394            let a = _mm_max_ps(a.into(), _mm_setzero_ps());
395            let mut converted = _mm_cvttps_epi32(a);
396            let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0));
397            let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
398            if !all_in_range {
399                let exceeds_unsigned_range =
400                    _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a));
401                let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0));
402                let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess));
403                converted = _mm_add_epi32(converted, excess_converted);
404                converted = _mm_blendv_epi8(
405                    converted,
406                    _mm_set1_epi32(u32::MAX.cast_signed()),
407                    exceeds_unsigned_range,
408                );
409            }
410            converted.simd_into(self)
411        }
412    }
413    #[inline(always)]
414    fn cvt_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
415        unsafe { _mm_cvttps_epi32(a.into()).simd_into(self) }
416    }
417    #[inline(always)]
418    fn cvt_i32_precise_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
419        unsafe {
420            let a = a.into();
421            let mut converted = _mm_cvttps_epi32(a);
422            let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0));
423            let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
424            if !all_in_range {
425                converted = _mm_blendv_epi8(
426                    _mm_set1_epi32(i32::MAX),
427                    converted,
428                    _mm_castps_si128(in_range),
429                );
430                let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a));
431                converted = _mm_and_si128(converted, is_not_nan);
432            }
433            converted.simd_into(self)
434        }
435    }
436    #[inline(always)]
437    fn splat_i8x16(self, val: i8) -> i8x16<Self> {
438        unsafe { _mm_set1_epi8(val).simd_into(self) }
439    }
440    #[inline(always)]
441    fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
442        i8x16 {
443            val: unsafe { core::mem::transmute_copy(&val) },
444            simd: self,
445        }
446    }
447    #[inline(always)]
448    fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
449        i8x16 {
450            val: unsafe { core::mem::transmute_copy(val) },
451            simd: self,
452        }
453    }
454    #[inline(always)]
455    fn as_array_i8x16(self, a: i8x16<Self>) -> [i8; 16usize] {
456        unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) }
457    }
458    #[inline(always)]
459    fn as_array_ref_i8x16(self, a: &i8x16<Self>) -> &[i8; 16usize] {
460        unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) }
461    }
462    #[inline(always)]
463    fn as_array_mut_i8x16(self, a: &mut i8x16<Self>) -> &mut [i8; 16usize] {
464        unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) }
465    }
466    #[inline(always)]
467    fn store_array_i8x16(self, a: i8x16<Self>, dest: &mut [i8; 16usize]) -> () {
468        unsafe {
469            core::ptr::copy_nonoverlapping(
470                (&raw const a.val.0) as *const i8,
471                dest.as_mut_ptr(),
472                16usize,
473            );
474        }
475    }
476    #[inline(always)]
477    fn cvt_from_bytes_i8x16(self, a: u8x16<Self>) -> i8x16<Self> {
478        unsafe {
479            i8x16 {
480                val: core::mem::transmute(a.val),
481                simd: self,
482            }
483        }
484    }
485    #[inline(always)]
486    fn cvt_to_bytes_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
487        unsafe {
488            u8x16 {
489                val: core::mem::transmute(a.val),
490                simd: self,
491            }
492        }
493    }
494    #[inline(always)]
495    fn slide_i8x16<const SHIFT: usize>(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
496        unsafe {
497            if SHIFT >= 16usize {
498                return b;
499            }
500            let result = dyn_alignr_128(
501                self.cvt_to_bytes_i8x16(b).val.0,
502                self.cvt_to_bytes_i8x16(a).val.0,
503                SHIFT,
504            );
505            self.cvt_from_bytes_i8x16(u8x16 {
506                val: crate::support::Aligned128(result),
507                simd: self,
508            })
509        }
510    }
511    #[inline(always)]
512    fn slide_within_blocks_i8x16<const SHIFT: usize>(
513        self,
514        a: i8x16<Self>,
515        b: i8x16<Self>,
516    ) -> i8x16<Self> {
517        self.slide_i8x16::<SHIFT>(a, b)
518    }
519    #[inline(always)]
520    fn add_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
521        unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
522    }
523    #[inline(always)]
524    fn sub_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
525        unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
526    }
527    #[inline(always)]
528    fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
529        unsafe {
530            let dst_even = _mm_mullo_epi16(a.into(), b.into());
531            let dst_odd =
532                _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
533            _mm_or_si128(
534                _mm_slli_epi16(dst_odd, 8),
535                _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
536            )
537            .simd_into(self)
538        }
539    }
540    #[inline(always)]
541    fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
542        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
543    }
544    #[inline(always)]
545    fn or_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
546        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
547    }
548    #[inline(always)]
549    fn xor_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
550        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
551    }
552    #[inline(always)]
553    fn not_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
554        a ^ !0
555    }
556    #[inline(always)]
557    fn shl_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
558        unsafe {
559            let val = a.into();
560            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
561            let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
562            let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
563            let lo_shifted = _mm_sll_epi16(lo_16, shift_count);
564            let hi_shifted = _mm_sll_epi16(hi_16, shift_count);
565            _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
566        }
567    }
568    #[inline(always)]
569    fn shlv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
570        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
571    }
572    #[inline(always)]
573    fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
574        unsafe {
575            let val = a.into();
576            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
577            let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
578            let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
579            let lo_shifted = _mm_sra_epi16(lo_16, shift_count);
580            let hi_shifted = _mm_sra_epi16(hi_16, shift_count);
581            _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
582        }
583    }
584    #[inline(always)]
585    fn shrv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
586        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
587    }
588    #[inline(always)]
589    fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
590        unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
591    }
592    #[inline(always)]
593    fn simd_lt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
594        unsafe { _mm_cmpgt_epi8(b.into(), a.into()).simd_into(self) }
595    }
596    #[inline(always)]
597    fn simd_le_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
598        unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) }
599    }
600    #[inline(always)]
601    fn simd_ge_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
602        unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) }
603    }
604    #[inline(always)]
605    fn simd_gt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
606        unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) }
607    }
608    #[inline(always)]
609    fn zip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
610        unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
611    }
612    #[inline(always)]
613    fn zip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
614        unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
615    }
616    #[inline(always)]
617    fn unzip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
618        unsafe {
619            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
620            let t1 = _mm_shuffle_epi8(a.into(), mask);
621            let t2 = _mm_shuffle_epi8(b.into(), mask);
622            _mm_unpacklo_epi64(t1, t2).simd_into(self)
623        }
624    }
625    #[inline(always)]
626    fn unzip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
627        unsafe {
628            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
629            let t1 = _mm_shuffle_epi8(a.into(), mask);
630            let t2 = _mm_shuffle_epi8(b.into(), mask);
631            _mm_unpackhi_epi64(t1, t2).simd_into(self)
632        }
633    }
634    #[inline(always)]
635    fn interleave_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> (i8x16<Self>, i8x16<Self>) {
636        (self.zip_low_i8x16(a, b), self.zip_high_i8x16(a, b))
637    }
638    #[inline(always)]
639    fn deinterleave_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> (i8x16<Self>, i8x16<Self>) {
640        (self.unzip_low_i8x16(a, b), self.unzip_high_i8x16(a, b))
641    }
642    #[inline(always)]
643    fn select_i8x16(self, a: mask8x16<Self>, b: i8x16<Self>, c: i8x16<Self>) -> i8x16<Self> {
644        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
645    }
646    #[inline(always)]
647    fn min_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
648        unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) }
649    }
650    #[inline(always)]
651    fn max_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
652        unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) }
653    }
654    #[inline(always)]
655    fn combine_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x32<Self> {
656        i8x32 {
657            val: crate::support::Aligned256([a.val.0, b.val.0]),
658            simd: self,
659        }
660    }
661    #[inline(always)]
662    fn neg_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
663        unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) }
664    }
665    #[inline(always)]
666    fn reinterpret_u8_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
667        __m128i::from(a).simd_into(self)
668    }
669    #[inline(always)]
670    fn reinterpret_u32_i8x16(self, a: i8x16<Self>) -> u32x4<Self> {
671        __m128i::from(a).simd_into(self)
672    }
673    #[inline(always)]
674    fn splat_u8x16(self, val: u8) -> u8x16<Self> {
675        unsafe { _mm_set1_epi8(val.cast_signed()).simd_into(self) }
676    }
677    #[inline(always)]
678    fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
679        u8x16 {
680            val: unsafe { core::mem::transmute_copy(&val) },
681            simd: self,
682        }
683    }
684    #[inline(always)]
685    fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
686        u8x16 {
687            val: unsafe { core::mem::transmute_copy(val) },
688            simd: self,
689        }
690    }
691    #[inline(always)]
692    fn as_array_u8x16(self, a: u8x16<Self>) -> [u8; 16usize] {
693        unsafe { core::mem::transmute::<__m128i, [u8; 16usize]>(a.val.0) }
694    }
695    #[inline(always)]
696    fn as_array_ref_u8x16(self, a: &u8x16<Self>) -> &[u8; 16usize] {
697        unsafe { core::mem::transmute::<&__m128i, &[u8; 16usize]>(&a.val.0) }
698    }
699    #[inline(always)]
700    fn as_array_mut_u8x16(self, a: &mut u8x16<Self>) -> &mut [u8; 16usize] {
701        unsafe { core::mem::transmute::<&mut __m128i, &mut [u8; 16usize]>(&mut a.val.0) }
702    }
703    #[inline(always)]
704    fn store_array_u8x16(self, a: u8x16<Self>, dest: &mut [u8; 16usize]) -> () {
705        unsafe {
706            core::ptr::copy_nonoverlapping(
707                (&raw const a.val.0) as *const u8,
708                dest.as_mut_ptr(),
709                16usize,
710            );
711        }
712    }
713    #[inline(always)]
714    fn cvt_from_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
715        unsafe {
716            u8x16 {
717                val: core::mem::transmute(a.val),
718                simd: self,
719            }
720        }
721    }
722    #[inline(always)]
723    fn cvt_to_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
724        unsafe {
725            u8x16 {
726                val: core::mem::transmute(a.val),
727                simd: self,
728            }
729        }
730    }
731    #[inline(always)]
732    fn slide_u8x16<const SHIFT: usize>(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
733        unsafe {
734            if SHIFT >= 16usize {
735                return b;
736            }
737            let result = dyn_alignr_128(
738                self.cvt_to_bytes_u8x16(b).val.0,
739                self.cvt_to_bytes_u8x16(a).val.0,
740                SHIFT,
741            );
742            self.cvt_from_bytes_u8x16(u8x16 {
743                val: crate::support::Aligned128(result),
744                simd: self,
745            })
746        }
747    }
748    #[inline(always)]
749    fn slide_within_blocks_u8x16<const SHIFT: usize>(
750        self,
751        a: u8x16<Self>,
752        b: u8x16<Self>,
753    ) -> u8x16<Self> {
754        self.slide_u8x16::<SHIFT>(a, b)
755    }
756    #[inline(always)]
757    fn add_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
758        unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
759    }
760    #[inline(always)]
761    fn sub_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
762        unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
763    }
764    #[inline(always)]
765    fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
766        unsafe {
767            let dst_even = _mm_mullo_epi16(a.into(), b.into());
768            let dst_odd =
769                _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
770            _mm_or_si128(
771                _mm_slli_epi16(dst_odd, 8),
772                _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
773            )
774            .simd_into(self)
775        }
776    }
777    #[inline(always)]
778    fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
779        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
780    }
781    #[inline(always)]
782    fn or_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
783        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
784    }
785    #[inline(always)]
786    fn xor_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
787        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
788    }
789    #[inline(always)]
790    fn not_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
791        a ^ !0
792    }
793    #[inline(always)]
794    fn shl_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
795        unsafe {
796            let val = a.into();
797            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
798            let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
799            let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
800            let lo_shifted = _mm_sll_epi16(lo_16, shift_count);
801            let hi_shifted = _mm_sll_epi16(hi_16, shift_count);
802            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
803        }
804    }
805    #[inline(always)]
806    fn shlv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
807        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
808    }
809    #[inline(always)]
810    fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
811        unsafe {
812            let val = a.into();
813            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
814            let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
815            let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
816            let lo_shifted = _mm_srl_epi16(lo_16, shift_count);
817            let hi_shifted = _mm_srl_epi16(hi_16, shift_count);
818            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
819        }
820    }
821    #[inline(always)]
822    fn shrv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
823        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
824    }
825    #[inline(always)]
826    fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
827        unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
828    }
829    #[inline(always)]
830    fn simd_lt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
831        unsafe {
832            let sign_bit = _mm_set1_epi8(0x80u8.cast_signed());
833            let a_signed = _mm_xor_si128(a.into(), sign_bit);
834            let b_signed = _mm_xor_si128(b.into(), sign_bit);
835            _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self)
836        }
837    }
838    #[inline(always)]
839    fn simd_le_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
840        unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) }
841    }
842    #[inline(always)]
843    fn simd_ge_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
844        unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) }
845    }
846    #[inline(always)]
847    fn simd_gt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
848        unsafe {
849            let sign_bit = _mm_set1_epi8(0x80u8.cast_signed());
850            let a_signed = _mm_xor_si128(a.into(), sign_bit);
851            let b_signed = _mm_xor_si128(b.into(), sign_bit);
852            _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self)
853        }
854    }
855    #[inline(always)]
856    fn zip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
857        unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
858    }
859    #[inline(always)]
860    fn zip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
861        unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
862    }
863    #[inline(always)]
864    fn unzip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
865        unsafe {
866            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
867            let t1 = _mm_shuffle_epi8(a.into(), mask);
868            let t2 = _mm_shuffle_epi8(b.into(), mask);
869            _mm_unpacklo_epi64(t1, t2).simd_into(self)
870        }
871    }
872    #[inline(always)]
873    fn unzip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
874        unsafe {
875            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
876            let t1 = _mm_shuffle_epi8(a.into(), mask);
877            let t2 = _mm_shuffle_epi8(b.into(), mask);
878            _mm_unpackhi_epi64(t1, t2).simd_into(self)
879        }
880    }
881    #[inline(always)]
882    fn interleave_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> (u8x16<Self>, u8x16<Self>) {
883        (self.zip_low_u8x16(a, b), self.zip_high_u8x16(a, b))
884    }
885    #[inline(always)]
886    fn deinterleave_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> (u8x16<Self>, u8x16<Self>) {
887        (self.unzip_low_u8x16(a, b), self.unzip_high_u8x16(a, b))
888    }
889    #[inline(always)]
890    fn select_u8x16(self, a: mask8x16<Self>, b: u8x16<Self>, c: u8x16<Self>) -> u8x16<Self> {
891        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
892    }
893    #[inline(always)]
894    fn min_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
895        unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) }
896    }
897    #[inline(always)]
898    fn max_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
899        unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) }
900    }
901    #[inline(always)]
902    fn combine_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x32<Self> {
903        u8x32 {
904            val: crate::support::Aligned256([a.val.0, b.val.0]),
905            simd: self,
906        }
907    }
908    #[inline(always)]
909    fn widen_u8x16(self, a: u8x16<Self>) -> u16x16<Self> {
910        unsafe {
911            let raw = a.into();
912            let high = _mm_cvtepu8_epi16(raw).simd_into(self);
913            let low = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(raw)).simd_into(self);
914            self.combine_u16x8(high, low)
915        }
916    }
917    #[inline(always)]
918    fn reinterpret_u32_u8x16(self, a: u8x16<Self>) -> u32x4<Self> {
919        __m128i::from(a).simd_into(self)
920    }
921    #[inline(always)]
922    fn splat_mask8x16(self, val: i8) -> mask8x16<Self> {
923        unsafe { _mm_set1_epi8(val).simd_into(self) }
924    }
925    #[inline(always)]
926    fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
927        mask8x16 {
928            val: unsafe { core::mem::transmute_copy(&val) },
929            simd: self,
930        }
931    }
932    #[inline(always)]
933    fn load_array_ref_mask8x16(self, val: &[i8; 16usize]) -> mask8x16<Self> {
934        mask8x16 {
935            val: unsafe { core::mem::transmute_copy(val) },
936            simd: self,
937        }
938    }
939    #[inline(always)]
940    fn as_array_mask8x16(self, a: mask8x16<Self>) -> [i8; 16usize] {
941        unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) }
942    }
943    #[inline(always)]
944    fn as_array_ref_mask8x16(self, a: &mask8x16<Self>) -> &[i8; 16usize] {
945        unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) }
946    }
947    #[inline(always)]
948    fn as_array_mut_mask8x16(self, a: &mut mask8x16<Self>) -> &mut [i8; 16usize] {
949        unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) }
950    }
951    #[inline(always)]
952    fn store_array_mask8x16(self, a: mask8x16<Self>, dest: &mut [i8; 16usize]) -> () {
953        unsafe {
954            core::ptr::copy_nonoverlapping(
955                (&raw const a.val.0) as *const i8,
956                dest.as_mut_ptr(),
957                16usize,
958            );
959        }
960    }
961    #[inline(always)]
962    fn cvt_from_bytes_mask8x16(self, a: u8x16<Self>) -> mask8x16<Self> {
963        unsafe {
964            mask8x16 {
965                val: core::mem::transmute(a.val),
966                simd: self,
967            }
968        }
969    }
970    #[inline(always)]
971    fn cvt_to_bytes_mask8x16(self, a: mask8x16<Self>) -> u8x16<Self> {
972        unsafe {
973            u8x16 {
974                val: core::mem::transmute(a.val),
975                simd: self,
976            }
977        }
978    }
979    #[inline(always)]
980    fn slide_mask8x16<const SHIFT: usize>(
981        self,
982        a: mask8x16<Self>,
983        b: mask8x16<Self>,
984    ) -> mask8x16<Self> {
985        unsafe {
986            if SHIFT >= 16usize {
987                return b;
988            }
989            let result = dyn_alignr_128(
990                self.cvt_to_bytes_mask8x16(b).val.0,
991                self.cvt_to_bytes_mask8x16(a).val.0,
992                SHIFT,
993            );
994            self.cvt_from_bytes_mask8x16(u8x16 {
995                val: crate::support::Aligned128(result),
996                simd: self,
997            })
998        }
999    }
1000    #[inline(always)]
1001    fn slide_within_blocks_mask8x16<const SHIFT: usize>(
1002        self,
1003        a: mask8x16<Self>,
1004        b: mask8x16<Self>,
1005    ) -> mask8x16<Self> {
1006        self.slide_mask8x16::<SHIFT>(a, b)
1007    }
1008    #[inline(always)]
1009    fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
1010        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1011    }
1012    #[inline(always)]
1013    fn or_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
1014        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1015    }
1016    #[inline(always)]
1017    fn xor_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
1018        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1019    }
1020    #[inline(always)]
1021    fn not_mask8x16(self, a: mask8x16<Self>) -> mask8x16<Self> {
1022        a ^ !0
1023    }
1024    #[inline(always)]
1025    fn select_mask8x16(
1026        self,
1027        a: mask8x16<Self>,
1028        b: mask8x16<Self>,
1029        c: mask8x16<Self>,
1030    ) -> mask8x16<Self> {
1031        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1032    }
1033    #[inline(always)]
1034    fn simd_eq_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
1035        unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
1036    }
1037    #[inline(always)]
1038    fn any_true_mask8x16(self, a: mask8x16<Self>) -> bool {
1039        unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 }
1040    }
1041    #[inline(always)]
1042    fn all_true_mask8x16(self, a: mask8x16<Self>) -> bool {
1043        unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff }
1044    }
1045    #[inline(always)]
1046    fn any_false_mask8x16(self, a: mask8x16<Self>) -> bool {
1047        unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff }
1048    }
1049    #[inline(always)]
1050    fn all_false_mask8x16(self, a: mask8x16<Self>) -> bool {
1051        unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 }
1052    }
1053    #[inline(always)]
1054    fn combine_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x32<Self> {
1055        mask8x32 {
1056            val: crate::support::Aligned256([a.val.0, b.val.0]),
1057            simd: self,
1058        }
1059    }
1060    #[inline(always)]
1061    fn splat_i16x8(self, val: i16) -> i16x8<Self> {
1062        unsafe { _mm_set1_epi16(val).simd_into(self) }
1063    }
1064    #[inline(always)]
1065    fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
1066        i16x8 {
1067            val: unsafe { core::mem::transmute_copy(&val) },
1068            simd: self,
1069        }
1070    }
1071    #[inline(always)]
1072    fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
1073        i16x8 {
1074            val: unsafe { core::mem::transmute_copy(val) },
1075            simd: self,
1076        }
1077    }
1078    #[inline(always)]
1079    fn as_array_i16x8(self, a: i16x8<Self>) -> [i16; 8usize] {
1080        unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) }
1081    }
1082    #[inline(always)]
1083    fn as_array_ref_i16x8(self, a: &i16x8<Self>) -> &[i16; 8usize] {
1084        unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) }
1085    }
1086    #[inline(always)]
1087    fn as_array_mut_i16x8(self, a: &mut i16x8<Self>) -> &mut [i16; 8usize] {
1088        unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) }
1089    }
1090    #[inline(always)]
1091    fn store_array_i16x8(self, a: i16x8<Self>, dest: &mut [i16; 8usize]) -> () {
1092        unsafe {
1093            core::ptr::copy_nonoverlapping(
1094                (&raw const a.val.0) as *const i16,
1095                dest.as_mut_ptr(),
1096                8usize,
1097            );
1098        }
1099    }
1100    #[inline(always)]
1101    fn cvt_from_bytes_i16x8(self, a: u8x16<Self>) -> i16x8<Self> {
1102        unsafe {
1103            i16x8 {
1104                val: core::mem::transmute(a.val),
1105                simd: self,
1106            }
1107        }
1108    }
1109    #[inline(always)]
1110    fn cvt_to_bytes_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
1111        unsafe {
1112            u8x16 {
1113                val: core::mem::transmute(a.val),
1114                simd: self,
1115            }
1116        }
1117    }
1118    #[inline(always)]
1119    fn slide_i16x8<const SHIFT: usize>(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1120        unsafe {
1121            if SHIFT >= 8usize {
1122                return b;
1123            }
1124            let result = dyn_alignr_128(
1125                self.cvt_to_bytes_i16x8(b).val.0,
1126                self.cvt_to_bytes_i16x8(a).val.0,
1127                SHIFT * 2usize,
1128            );
1129            self.cvt_from_bytes_i16x8(u8x16 {
1130                val: crate::support::Aligned128(result),
1131                simd: self,
1132            })
1133        }
1134    }
1135    #[inline(always)]
1136    fn slide_within_blocks_i16x8<const SHIFT: usize>(
1137        self,
1138        a: i16x8<Self>,
1139        b: i16x8<Self>,
1140    ) -> i16x8<Self> {
1141        self.slide_i16x8::<SHIFT>(a, b)
1142    }
1143    #[inline(always)]
1144    fn add_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1145        unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
1146    }
1147    #[inline(always)]
1148    fn sub_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1149        unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
1150    }
1151    #[inline(always)]
1152    fn mul_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1153        unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
1154    }
1155    #[inline(always)]
1156    fn and_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1157        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1158    }
1159    #[inline(always)]
1160    fn or_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1161        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1162    }
1163    #[inline(always)]
1164    fn xor_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1165        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1166    }
1167    #[inline(always)]
1168    fn not_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
1169        a ^ !0
1170    }
1171    #[inline(always)]
1172    fn shl_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
1173        unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1174    }
1175    #[inline(always)]
1176    fn shlv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1177        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
1178    }
1179    #[inline(always)]
1180    fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
1181        unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1182    }
1183    #[inline(always)]
1184    fn shrv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1185        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
1186    }
1187    #[inline(always)]
1188    fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1189        unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
1190    }
1191    #[inline(always)]
1192    fn simd_lt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1193        unsafe { _mm_cmpgt_epi16(b.into(), a.into()).simd_into(self) }
1194    }
1195    #[inline(always)]
1196    fn simd_le_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1197        unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) }
1198    }
1199    #[inline(always)]
1200    fn simd_ge_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1201        unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) }
1202    }
1203    #[inline(always)]
1204    fn simd_gt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1205        unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) }
1206    }
1207    #[inline(always)]
1208    fn zip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1209        unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
1210    }
1211    #[inline(always)]
1212    fn zip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1213        unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
1214    }
1215    #[inline(always)]
1216    fn unzip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1217        unsafe {
1218            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1219            let t1 = _mm_shuffle_epi8(a.into(), mask);
1220            let t2 = _mm_shuffle_epi8(b.into(), mask);
1221            _mm_unpacklo_epi64(t1, t2).simd_into(self)
1222        }
1223    }
1224    #[inline(always)]
1225    fn unzip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1226        unsafe {
1227            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1228            let t1 = _mm_shuffle_epi8(a.into(), mask);
1229            let t2 = _mm_shuffle_epi8(b.into(), mask);
1230            _mm_unpackhi_epi64(t1, t2).simd_into(self)
1231        }
1232    }
1233    #[inline(always)]
1234    fn interleave_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> (i16x8<Self>, i16x8<Self>) {
1235        (self.zip_low_i16x8(a, b), self.zip_high_i16x8(a, b))
1236    }
1237    #[inline(always)]
1238    fn deinterleave_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> (i16x8<Self>, i16x8<Self>) {
1239        (self.unzip_low_i16x8(a, b), self.unzip_high_i16x8(a, b))
1240    }
1241    #[inline(always)]
1242    fn select_i16x8(self, a: mask16x8<Self>, b: i16x8<Self>, c: i16x8<Self>) -> i16x8<Self> {
1243        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1244    }
1245    #[inline(always)]
1246    fn min_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1247        unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) }
1248    }
1249    #[inline(always)]
1250    fn max_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1251        unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) }
1252    }
1253    #[inline(always)]
1254    fn combine_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x16<Self> {
1255        i16x16 {
1256            val: crate::support::Aligned256([a.val.0, b.val.0]),
1257            simd: self,
1258        }
1259    }
1260    #[inline(always)]
1261    fn neg_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
1262        unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) }
1263    }
1264    #[inline(always)]
1265    fn reinterpret_u8_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
1266        __m128i::from(a).simd_into(self)
1267    }
1268    #[inline(always)]
1269    fn reinterpret_u32_i16x8(self, a: i16x8<Self>) -> u32x4<Self> {
1270        __m128i::from(a).simd_into(self)
1271    }
1272    #[inline(always)]
1273    fn splat_u16x8(self, val: u16) -> u16x8<Self> {
1274        unsafe { _mm_set1_epi16(val.cast_signed()).simd_into(self) }
1275    }
1276    #[inline(always)]
1277    fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
1278        u16x8 {
1279            val: unsafe { core::mem::transmute_copy(&val) },
1280            simd: self,
1281        }
1282    }
1283    #[inline(always)]
1284    fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
1285        u16x8 {
1286            val: unsafe { core::mem::transmute_copy(val) },
1287            simd: self,
1288        }
1289    }
1290    #[inline(always)]
1291    fn as_array_u16x8(self, a: u16x8<Self>) -> [u16; 8usize] {
1292        unsafe { core::mem::transmute::<__m128i, [u16; 8usize]>(a.val.0) }
1293    }
1294    #[inline(always)]
1295    fn as_array_ref_u16x8(self, a: &u16x8<Self>) -> &[u16; 8usize] {
1296        unsafe { core::mem::transmute::<&__m128i, &[u16; 8usize]>(&a.val.0) }
1297    }
1298    #[inline(always)]
1299    fn as_array_mut_u16x8(self, a: &mut u16x8<Self>) -> &mut [u16; 8usize] {
1300        unsafe { core::mem::transmute::<&mut __m128i, &mut [u16; 8usize]>(&mut a.val.0) }
1301    }
1302    #[inline(always)]
1303    fn store_array_u16x8(self, a: u16x8<Self>, dest: &mut [u16; 8usize]) -> () {
1304        unsafe {
1305            core::ptr::copy_nonoverlapping(
1306                (&raw const a.val.0) as *const u16,
1307                dest.as_mut_ptr(),
1308                8usize,
1309            );
1310        }
1311    }
1312    #[inline(always)]
1313    fn cvt_from_bytes_u16x8(self, a: u8x16<Self>) -> u16x8<Self> {
1314        unsafe {
1315            u16x8 {
1316                val: core::mem::transmute(a.val),
1317                simd: self,
1318            }
1319        }
1320    }
1321    #[inline(always)]
1322    fn cvt_to_bytes_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
1323        unsafe {
1324            u8x16 {
1325                val: core::mem::transmute(a.val),
1326                simd: self,
1327            }
1328        }
1329    }
1330    #[inline(always)]
1331    fn slide_u16x8<const SHIFT: usize>(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1332        unsafe {
1333            if SHIFT >= 8usize {
1334                return b;
1335            }
1336            let result = dyn_alignr_128(
1337                self.cvt_to_bytes_u16x8(b).val.0,
1338                self.cvt_to_bytes_u16x8(a).val.0,
1339                SHIFT * 2usize,
1340            );
1341            self.cvt_from_bytes_u16x8(u8x16 {
1342                val: crate::support::Aligned128(result),
1343                simd: self,
1344            })
1345        }
1346    }
1347    #[inline(always)]
1348    fn slide_within_blocks_u16x8<const SHIFT: usize>(
1349        self,
1350        a: u16x8<Self>,
1351        b: u16x8<Self>,
1352    ) -> u16x8<Self> {
1353        self.slide_u16x8::<SHIFT>(a, b)
1354    }
1355    #[inline(always)]
1356    fn add_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1357        unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
1358    }
1359    #[inline(always)]
1360    fn sub_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1361        unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
1362    }
1363    #[inline(always)]
1364    fn mul_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1365        unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
1366    }
1367    #[inline(always)]
1368    fn and_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1369        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1370    }
1371    #[inline(always)]
1372    fn or_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1373        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1374    }
1375    #[inline(always)]
1376    fn xor_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1377        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1378    }
1379    #[inline(always)]
1380    fn not_u16x8(self, a: u16x8<Self>) -> u16x8<Self> {
1381        a ^ !0
1382    }
1383    #[inline(always)]
1384    fn shl_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
1385        unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1386    }
1387    #[inline(always)]
1388    fn shlv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1389        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
1390    }
1391    #[inline(always)]
1392    fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
1393        unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1394    }
1395    #[inline(always)]
1396    fn shrv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1397        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
1398    }
1399    #[inline(always)]
1400    fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1401        unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
1402    }
1403    #[inline(always)]
1404    fn simd_lt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1405        unsafe {
1406            let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed());
1407            let a_signed = _mm_xor_si128(a.into(), sign_bit);
1408            let b_signed = _mm_xor_si128(b.into(), sign_bit);
1409            _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self)
1410        }
1411    }
1412    #[inline(always)]
1413    fn simd_le_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1414        unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) }
1415    }
1416    #[inline(always)]
1417    fn simd_ge_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1418        unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) }
1419    }
1420    #[inline(always)]
1421    fn simd_gt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1422        unsafe {
1423            let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed());
1424            let a_signed = _mm_xor_si128(a.into(), sign_bit);
1425            let b_signed = _mm_xor_si128(b.into(), sign_bit);
1426            _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self)
1427        }
1428    }
1429    #[inline(always)]
1430    fn zip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1431        unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
1432    }
1433    #[inline(always)]
1434    fn zip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1435        unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
1436    }
1437    #[inline(always)]
1438    fn unzip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1439        unsafe {
1440            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1441            let t1 = _mm_shuffle_epi8(a.into(), mask);
1442            let t2 = _mm_shuffle_epi8(b.into(), mask);
1443            _mm_unpacklo_epi64(t1, t2).simd_into(self)
1444        }
1445    }
1446    #[inline(always)]
1447    fn unzip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1448        unsafe {
1449            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1450            let t1 = _mm_shuffle_epi8(a.into(), mask);
1451            let t2 = _mm_shuffle_epi8(b.into(), mask);
1452            _mm_unpackhi_epi64(t1, t2).simd_into(self)
1453        }
1454    }
1455    #[inline(always)]
1456    fn interleave_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> (u16x8<Self>, u16x8<Self>) {
1457        (self.zip_low_u16x8(a, b), self.zip_high_u16x8(a, b))
1458    }
1459    #[inline(always)]
1460    fn deinterleave_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> (u16x8<Self>, u16x8<Self>) {
1461        (self.unzip_low_u16x8(a, b), self.unzip_high_u16x8(a, b))
1462    }
1463    #[inline(always)]
1464    fn select_u16x8(self, a: mask16x8<Self>, b: u16x8<Self>, c: u16x8<Self>) -> u16x8<Self> {
1465        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1466    }
1467    #[inline(always)]
1468    fn min_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1469        unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) }
1470    }
1471    #[inline(always)]
1472    fn max_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1473        unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) }
1474    }
1475    #[inline(always)]
1476    fn combine_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x16<Self> {
1477        u16x16 {
1478            val: crate::support::Aligned256([a.val.0, b.val.0]),
1479            simd: self,
1480        }
1481    }
1482    #[inline(always)]
1483    fn reinterpret_u8_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
1484        __m128i::from(a).simd_into(self)
1485    }
1486    #[inline(always)]
1487    fn reinterpret_u32_u16x8(self, a: u16x8<Self>) -> u32x4<Self> {
1488        __m128i::from(a).simd_into(self)
1489    }
1490    #[inline(always)]
1491    fn splat_mask16x8(self, val: i16) -> mask16x8<Self> {
1492        unsafe { _mm_set1_epi16(val).simd_into(self) }
1493    }
1494    #[inline(always)]
1495    fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
1496        mask16x8 {
1497            val: unsafe { core::mem::transmute_copy(&val) },
1498            simd: self,
1499        }
1500    }
1501    #[inline(always)]
1502    fn load_array_ref_mask16x8(self, val: &[i16; 8usize]) -> mask16x8<Self> {
1503        mask16x8 {
1504            val: unsafe { core::mem::transmute_copy(val) },
1505            simd: self,
1506        }
1507    }
1508    #[inline(always)]
1509    fn as_array_mask16x8(self, a: mask16x8<Self>) -> [i16; 8usize] {
1510        unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) }
1511    }
1512    #[inline(always)]
1513    fn as_array_ref_mask16x8(self, a: &mask16x8<Self>) -> &[i16; 8usize] {
1514        unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) }
1515    }
1516    #[inline(always)]
1517    fn as_array_mut_mask16x8(self, a: &mut mask16x8<Self>) -> &mut [i16; 8usize] {
1518        unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) }
1519    }
1520    #[inline(always)]
1521    fn store_array_mask16x8(self, a: mask16x8<Self>, dest: &mut [i16; 8usize]) -> () {
1522        unsafe {
1523            core::ptr::copy_nonoverlapping(
1524                (&raw const a.val.0) as *const i16,
1525                dest.as_mut_ptr(),
1526                8usize,
1527            );
1528        }
1529    }
1530    #[inline(always)]
1531    fn cvt_from_bytes_mask16x8(self, a: u8x16<Self>) -> mask16x8<Self> {
1532        unsafe {
1533            mask16x8 {
1534                val: core::mem::transmute(a.val),
1535                simd: self,
1536            }
1537        }
1538    }
1539    #[inline(always)]
1540    fn cvt_to_bytes_mask16x8(self, a: mask16x8<Self>) -> u8x16<Self> {
1541        unsafe {
1542            u8x16 {
1543                val: core::mem::transmute(a.val),
1544                simd: self,
1545            }
1546        }
1547    }
1548    #[inline(always)]
1549    fn slide_mask16x8<const SHIFT: usize>(
1550        self,
1551        a: mask16x8<Self>,
1552        b: mask16x8<Self>,
1553    ) -> mask16x8<Self> {
1554        unsafe {
1555            if SHIFT >= 8usize {
1556                return b;
1557            }
1558            let result = dyn_alignr_128(
1559                self.cvt_to_bytes_mask16x8(b).val.0,
1560                self.cvt_to_bytes_mask16x8(a).val.0,
1561                SHIFT * 2usize,
1562            );
1563            self.cvt_from_bytes_mask16x8(u8x16 {
1564                val: crate::support::Aligned128(result),
1565                simd: self,
1566            })
1567        }
1568    }
1569    #[inline(always)]
1570    fn slide_within_blocks_mask16x8<const SHIFT: usize>(
1571        self,
1572        a: mask16x8<Self>,
1573        b: mask16x8<Self>,
1574    ) -> mask16x8<Self> {
1575        self.slide_mask16x8::<SHIFT>(a, b)
1576    }
1577    #[inline(always)]
1578    fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1579        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1580    }
1581    #[inline(always)]
1582    fn or_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1583        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1584    }
1585    #[inline(always)]
1586    fn xor_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1587        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1588    }
1589    #[inline(always)]
1590    fn not_mask16x8(self, a: mask16x8<Self>) -> mask16x8<Self> {
1591        a ^ !0
1592    }
1593    #[inline(always)]
1594    fn select_mask16x8(
1595        self,
1596        a: mask16x8<Self>,
1597        b: mask16x8<Self>,
1598        c: mask16x8<Self>,
1599    ) -> mask16x8<Self> {
1600        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1601    }
1602    #[inline(always)]
1603    fn simd_eq_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1604        unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
1605    }
1606    #[inline(always)]
1607    fn any_true_mask16x8(self, a: mask16x8<Self>) -> bool {
1608        unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 }
1609    }
1610    #[inline(always)]
1611    fn all_true_mask16x8(self, a: mask16x8<Self>) -> bool {
1612        unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff }
1613    }
1614    #[inline(always)]
1615    fn any_false_mask16x8(self, a: mask16x8<Self>) -> bool {
1616        unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff }
1617    }
1618    #[inline(always)]
1619    fn all_false_mask16x8(self, a: mask16x8<Self>) -> bool {
1620        unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 }
1621    }
1622    #[inline(always)]
1623    fn combine_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x16<Self> {
1624        mask16x16 {
1625            val: crate::support::Aligned256([a.val.0, b.val.0]),
1626            simd: self,
1627        }
1628    }
1629    #[inline(always)]
1630    fn splat_i32x4(self, val: i32) -> i32x4<Self> {
1631        unsafe { _mm_set1_epi32(val).simd_into(self) }
1632    }
1633    #[inline(always)]
1634    fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
1635        i32x4 {
1636            val: unsafe { core::mem::transmute_copy(&val) },
1637            simd: self,
1638        }
1639    }
1640    #[inline(always)]
1641    fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
1642        i32x4 {
1643            val: unsafe { core::mem::transmute_copy(val) },
1644            simd: self,
1645        }
1646    }
1647    #[inline(always)]
1648    fn as_array_i32x4(self, a: i32x4<Self>) -> [i32; 4usize] {
1649        unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) }
1650    }
1651    #[inline(always)]
1652    fn as_array_ref_i32x4(self, a: &i32x4<Self>) -> &[i32; 4usize] {
1653        unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) }
1654    }
1655    #[inline(always)]
1656    fn as_array_mut_i32x4(self, a: &mut i32x4<Self>) -> &mut [i32; 4usize] {
1657        unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) }
1658    }
1659    #[inline(always)]
1660    fn store_array_i32x4(self, a: i32x4<Self>, dest: &mut [i32; 4usize]) -> () {
1661        unsafe {
1662            core::ptr::copy_nonoverlapping(
1663                (&raw const a.val.0) as *const i32,
1664                dest.as_mut_ptr(),
1665                4usize,
1666            );
1667        }
1668    }
1669    #[inline(always)]
1670    fn cvt_from_bytes_i32x4(self, a: u8x16<Self>) -> i32x4<Self> {
1671        unsafe {
1672            i32x4 {
1673                val: core::mem::transmute(a.val),
1674                simd: self,
1675            }
1676        }
1677    }
1678    #[inline(always)]
1679    fn cvt_to_bytes_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
1680        unsafe {
1681            u8x16 {
1682                val: core::mem::transmute(a.val),
1683                simd: self,
1684            }
1685        }
1686    }
1687    #[inline(always)]
1688    fn slide_i32x4<const SHIFT: usize>(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1689        unsafe {
1690            if SHIFT >= 4usize {
1691                return b;
1692            }
1693            let result = dyn_alignr_128(
1694                self.cvt_to_bytes_i32x4(b).val.0,
1695                self.cvt_to_bytes_i32x4(a).val.0,
1696                SHIFT * 4usize,
1697            );
1698            self.cvt_from_bytes_i32x4(u8x16 {
1699                val: crate::support::Aligned128(result),
1700                simd: self,
1701            })
1702        }
1703    }
1704    #[inline(always)]
1705    fn slide_within_blocks_i32x4<const SHIFT: usize>(
1706        self,
1707        a: i32x4<Self>,
1708        b: i32x4<Self>,
1709    ) -> i32x4<Self> {
1710        self.slide_i32x4::<SHIFT>(a, b)
1711    }
1712    #[inline(always)]
1713    fn add_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1714        unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
1715    }
1716    #[inline(always)]
1717    fn sub_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1718        unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
1719    }
1720    #[inline(always)]
1721    fn mul_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1722        unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
1723    }
1724    #[inline(always)]
1725    fn and_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1726        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1727    }
1728    #[inline(always)]
1729    fn or_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1730        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1731    }
1732    #[inline(always)]
1733    fn xor_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1734        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1735    }
1736    #[inline(always)]
1737    fn not_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
1738        a ^ !0
1739    }
1740    #[inline(always)]
1741    fn shl_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
1742        unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1743    }
1744    #[inline(always)]
1745    fn shlv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1746        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
1747    }
1748    #[inline(always)]
1749    fn shr_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
1750        unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1751    }
1752    #[inline(always)]
1753    fn shrv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1754        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
1755    }
1756    #[inline(always)]
1757    fn simd_eq_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1758        unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
1759    }
1760    #[inline(always)]
1761    fn simd_lt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1762        unsafe { _mm_cmpgt_epi32(b.into(), a.into()).simd_into(self) }
1763    }
1764    #[inline(always)]
1765    fn simd_le_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1766        unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) }
1767    }
1768    #[inline(always)]
1769    fn simd_ge_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1770        unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) }
1771    }
1772    #[inline(always)]
1773    fn simd_gt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1774        unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) }
1775    }
1776    #[inline(always)]
1777    fn zip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1778        unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
1779    }
1780    #[inline(always)]
1781    fn zip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1782        unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
1783    }
1784    #[inline(always)]
1785    fn unzip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1786        unsafe {
1787            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1788            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1789            _mm_unpacklo_epi64(t1, t2).simd_into(self)
1790        }
1791    }
1792    #[inline(always)]
1793    fn unzip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1794        unsafe {
1795            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1796            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1797            _mm_unpackhi_epi64(t1, t2).simd_into(self)
1798        }
1799    }
1800    #[inline(always)]
1801    fn interleave_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> (i32x4<Self>, i32x4<Self>) {
1802        (self.zip_low_i32x4(a, b), self.zip_high_i32x4(a, b))
1803    }
1804    #[inline(always)]
1805    fn deinterleave_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> (i32x4<Self>, i32x4<Self>) {
1806        (self.unzip_low_i32x4(a, b), self.unzip_high_i32x4(a, b))
1807    }
1808    #[inline(always)]
1809    fn select_i32x4(self, a: mask32x4<Self>, b: i32x4<Self>, c: i32x4<Self>) -> i32x4<Self> {
1810        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1811    }
1812    #[inline(always)]
1813    fn min_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1814        unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) }
1815    }
1816    #[inline(always)]
1817    fn max_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1818        unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) }
1819    }
1820    #[inline(always)]
1821    fn combine_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x8<Self> {
1822        i32x8 {
1823            val: crate::support::Aligned256([a.val.0, b.val.0]),
1824            simd: self,
1825        }
1826    }
1827    #[inline(always)]
1828    fn neg_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
1829        unsafe { _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(self) }
1830    }
1831    #[inline(always)]
1832    fn reinterpret_u8_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
1833        __m128i::from(a).simd_into(self)
1834    }
1835    #[inline(always)]
1836    fn reinterpret_u32_i32x4(self, a: i32x4<Self>) -> u32x4<Self> {
1837        __m128i::from(a).simd_into(self)
1838    }
1839    #[inline(always)]
1840    fn cvt_f32_i32x4(self, a: i32x4<Self>) -> f32x4<Self> {
1841        unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) }
1842    }
1843    #[inline(always)]
1844    fn splat_u32x4(self, val: u32) -> u32x4<Self> {
1845        unsafe { _mm_set1_epi32(val.cast_signed()).simd_into(self) }
1846    }
1847    #[inline(always)]
1848    fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
1849        u32x4 {
1850            val: unsafe { core::mem::transmute_copy(&val) },
1851            simd: self,
1852        }
1853    }
1854    #[inline(always)]
1855    fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
1856        u32x4 {
1857            val: unsafe { core::mem::transmute_copy(val) },
1858            simd: self,
1859        }
1860    }
1861    #[inline(always)]
1862    fn as_array_u32x4(self, a: u32x4<Self>) -> [u32; 4usize] {
1863        unsafe { core::mem::transmute::<__m128i, [u32; 4usize]>(a.val.0) }
1864    }
1865    #[inline(always)]
1866    fn as_array_ref_u32x4(self, a: &u32x4<Self>) -> &[u32; 4usize] {
1867        unsafe { core::mem::transmute::<&__m128i, &[u32; 4usize]>(&a.val.0) }
1868    }
1869    #[inline(always)]
1870    fn as_array_mut_u32x4(self, a: &mut u32x4<Self>) -> &mut [u32; 4usize] {
1871        unsafe { core::mem::transmute::<&mut __m128i, &mut [u32; 4usize]>(&mut a.val.0) }
1872    }
1873    #[inline(always)]
1874    fn store_array_u32x4(self, a: u32x4<Self>, dest: &mut [u32; 4usize]) -> () {
1875        unsafe {
1876            core::ptr::copy_nonoverlapping(
1877                (&raw const a.val.0) as *const u32,
1878                dest.as_mut_ptr(),
1879                4usize,
1880            );
1881        }
1882    }
1883    #[inline(always)]
1884    fn cvt_from_bytes_u32x4(self, a: u8x16<Self>) -> u32x4<Self> {
1885        unsafe {
1886            u32x4 {
1887                val: core::mem::transmute(a.val),
1888                simd: self,
1889            }
1890        }
1891    }
1892    #[inline(always)]
1893    fn cvt_to_bytes_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
1894        unsafe {
1895            u8x16 {
1896                val: core::mem::transmute(a.val),
1897                simd: self,
1898            }
1899        }
1900    }
1901    #[inline(always)]
1902    fn slide_u32x4<const SHIFT: usize>(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1903        unsafe {
1904            if SHIFT >= 4usize {
1905                return b;
1906            }
1907            let result = dyn_alignr_128(
1908                self.cvt_to_bytes_u32x4(b).val.0,
1909                self.cvt_to_bytes_u32x4(a).val.0,
1910                SHIFT * 4usize,
1911            );
1912            self.cvt_from_bytes_u32x4(u8x16 {
1913                val: crate::support::Aligned128(result),
1914                simd: self,
1915            })
1916        }
1917    }
1918    #[inline(always)]
1919    fn slide_within_blocks_u32x4<const SHIFT: usize>(
1920        self,
1921        a: u32x4<Self>,
1922        b: u32x4<Self>,
1923    ) -> u32x4<Self> {
1924        self.slide_u32x4::<SHIFT>(a, b)
1925    }
1926    #[inline(always)]
1927    fn add_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1928        unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
1929    }
1930    #[inline(always)]
1931    fn sub_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1932        unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
1933    }
1934    #[inline(always)]
1935    fn mul_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1936        unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
1937    }
1938    #[inline(always)]
1939    fn and_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1940        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1941    }
1942    #[inline(always)]
1943    fn or_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1944        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1945    }
1946    #[inline(always)]
1947    fn xor_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1948        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1949    }
1950    #[inline(always)]
1951    fn not_u32x4(self, a: u32x4<Self>) -> u32x4<Self> {
1952        a ^ !0
1953    }
1954    #[inline(always)]
1955    fn shl_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
1956        unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1957    }
1958    #[inline(always)]
1959    fn shlv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1960        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
1961    }
1962    #[inline(always)]
1963    fn shr_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
1964        unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1965    }
1966    #[inline(always)]
1967    fn shrv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1968        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
1969    }
1970    #[inline(always)]
1971    fn simd_eq_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1972        unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
1973    }
1974    #[inline(always)]
1975    fn simd_lt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1976        unsafe {
1977            let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed());
1978            let a_signed = _mm_xor_si128(a.into(), sign_bit);
1979            let b_signed = _mm_xor_si128(b.into(), sign_bit);
1980            _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self)
1981        }
1982    }
1983    #[inline(always)]
1984    fn simd_le_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1985        unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1986    }
1987    #[inline(always)]
1988    fn simd_ge_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1989        unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1990    }
1991    #[inline(always)]
1992    fn simd_gt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1993        unsafe {
1994            let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed());
1995            let a_signed = _mm_xor_si128(a.into(), sign_bit);
1996            let b_signed = _mm_xor_si128(b.into(), sign_bit);
1997            _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self)
1998        }
1999    }
2000    #[inline(always)]
2001    fn zip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
2002        unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
2003    }
2004    #[inline(always)]
2005    fn zip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
2006        unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
2007    }
2008    #[inline(always)]
2009    fn unzip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
2010        unsafe {
2011            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
2012            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
2013            _mm_unpacklo_epi64(t1, t2).simd_into(self)
2014        }
2015    }
2016    #[inline(always)]
2017    fn unzip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
2018        unsafe {
2019            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
2020            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
2021            _mm_unpackhi_epi64(t1, t2).simd_into(self)
2022        }
2023    }
2024    #[inline(always)]
2025    fn interleave_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> (u32x4<Self>, u32x4<Self>) {
2026        (self.zip_low_u32x4(a, b), self.zip_high_u32x4(a, b))
2027    }
2028    #[inline(always)]
2029    fn deinterleave_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> (u32x4<Self>, u32x4<Self>) {
2030        (self.unzip_low_u32x4(a, b), self.unzip_high_u32x4(a, b))
2031    }
2032    #[inline(always)]
2033    fn select_u32x4(self, a: mask32x4<Self>, b: u32x4<Self>, c: u32x4<Self>) -> u32x4<Self> {
2034        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
2035    }
2036    #[inline(always)]
2037    fn min_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
2038        unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) }
2039    }
2040    #[inline(always)]
2041    fn max_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
2042        unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) }
2043    }
2044    #[inline(always)]
2045    fn combine_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x8<Self> {
2046        u32x8 {
2047            val: crate::support::Aligned256([a.val.0, b.val.0]),
2048            simd: self,
2049        }
2050    }
2051    #[inline(always)]
2052    fn reinterpret_u8_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
2053        __m128i::from(a).simd_into(self)
2054    }
2055    #[inline(always)]
2056    fn cvt_f32_u32x4(self, a: u32x4<Self>) -> f32x4<Self> {
2057        unsafe {
2058            let a = a.into();
2059            let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000));
2060            let hi = _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000));
2061            let fhi = _mm_sub_ps(
2062                _mm_castsi128_ps(hi),
2063                _mm_set1_ps(f32::from_bits(0x53000080)),
2064            );
2065            let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi);
2066            result.simd_into(self)
2067        }
2068    }
2069    #[inline(always)]
2070    fn splat_mask32x4(self, val: i32) -> mask32x4<Self> {
2071        unsafe { _mm_set1_epi32(val).simd_into(self) }
2072    }
2073    #[inline(always)]
2074    fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
2075        mask32x4 {
2076            val: unsafe { core::mem::transmute_copy(&val) },
2077            simd: self,
2078        }
2079    }
2080    #[inline(always)]
2081    fn load_array_ref_mask32x4(self, val: &[i32; 4usize]) -> mask32x4<Self> {
2082        mask32x4 {
2083            val: unsafe { core::mem::transmute_copy(val) },
2084            simd: self,
2085        }
2086    }
2087    #[inline(always)]
2088    fn as_array_mask32x4(self, a: mask32x4<Self>) -> [i32; 4usize] {
2089        unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) }
2090    }
2091    #[inline(always)]
2092    fn as_array_ref_mask32x4(self, a: &mask32x4<Self>) -> &[i32; 4usize] {
2093        unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) }
2094    }
2095    #[inline(always)]
2096    fn as_array_mut_mask32x4(self, a: &mut mask32x4<Self>) -> &mut [i32; 4usize] {
2097        unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) }
2098    }
2099    #[inline(always)]
2100    fn store_array_mask32x4(self, a: mask32x4<Self>, dest: &mut [i32; 4usize]) -> () {
2101        unsafe {
2102            core::ptr::copy_nonoverlapping(
2103                (&raw const a.val.0) as *const i32,
2104                dest.as_mut_ptr(),
2105                4usize,
2106            );
2107        }
2108    }
2109    #[inline(always)]
2110    fn cvt_from_bytes_mask32x4(self, a: u8x16<Self>) -> mask32x4<Self> {
2111        unsafe {
2112            mask32x4 {
2113                val: core::mem::transmute(a.val),
2114                simd: self,
2115            }
2116        }
2117    }
2118    #[inline(always)]
2119    fn cvt_to_bytes_mask32x4(self, a: mask32x4<Self>) -> u8x16<Self> {
2120        unsafe {
2121            u8x16 {
2122                val: core::mem::transmute(a.val),
2123                simd: self,
2124            }
2125        }
2126    }
2127    #[inline(always)]
2128    fn slide_mask32x4<const SHIFT: usize>(
2129        self,
2130        a: mask32x4<Self>,
2131        b: mask32x4<Self>,
2132    ) -> mask32x4<Self> {
2133        unsafe {
2134            if SHIFT >= 4usize {
2135                return b;
2136            }
2137            let result = dyn_alignr_128(
2138                self.cvt_to_bytes_mask32x4(b).val.0,
2139                self.cvt_to_bytes_mask32x4(a).val.0,
2140                SHIFT * 4usize,
2141            );
2142            self.cvt_from_bytes_mask32x4(u8x16 {
2143                val: crate::support::Aligned128(result),
2144                simd: self,
2145            })
2146        }
2147    }
2148    #[inline(always)]
2149    fn slide_within_blocks_mask32x4<const SHIFT: usize>(
2150        self,
2151        a: mask32x4<Self>,
2152        b: mask32x4<Self>,
2153    ) -> mask32x4<Self> {
2154        self.slide_mask32x4::<SHIFT>(a, b)
2155    }
2156    #[inline(always)]
2157    fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2158        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
2159    }
2160    #[inline(always)]
2161    fn or_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2162        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
2163    }
2164    #[inline(always)]
2165    fn xor_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2166        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
2167    }
2168    #[inline(always)]
2169    fn not_mask32x4(self, a: mask32x4<Self>) -> mask32x4<Self> {
2170        a ^ !0
2171    }
2172    #[inline(always)]
2173    fn select_mask32x4(
2174        self,
2175        a: mask32x4<Self>,
2176        b: mask32x4<Self>,
2177        c: mask32x4<Self>,
2178    ) -> mask32x4<Self> {
2179        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
2180    }
2181    #[inline(always)]
2182    fn simd_eq_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2183        unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
2184    }
2185    #[inline(always)]
2186    fn any_true_mask32x4(self, a: mask32x4<Self>) -> bool {
2187        unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0 }
2188    }
2189    #[inline(always)]
2190    fn all_true_mask32x4(self, a: mask32x4<Self>) -> bool {
2191        unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0b1111 }
2192    }
2193    #[inline(always)]
2194    fn any_false_mask32x4(self, a: mask32x4<Self>) -> bool {
2195        unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0b1111 }
2196    }
2197    #[inline(always)]
2198    fn all_false_mask32x4(self, a: mask32x4<Self>) -> bool {
2199        unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0 }
2200    }
2201    #[inline(always)]
2202    fn combine_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x8<Self> {
2203        mask32x8 {
2204            val: crate::support::Aligned256([a.val.0, b.val.0]),
2205            simd: self,
2206        }
2207    }
2208    #[inline(always)]
2209    fn splat_f64x2(self, val: f64) -> f64x2<Self> {
2210        unsafe { _mm_set1_pd(val).simd_into(self) }
2211    }
2212    #[inline(always)]
2213    fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
2214        f64x2 {
2215            val: unsafe { core::mem::transmute_copy(&val) },
2216            simd: self,
2217        }
2218    }
2219    #[inline(always)]
2220    fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
2221        f64x2 {
2222            val: unsafe { core::mem::transmute_copy(val) },
2223            simd: self,
2224        }
2225    }
2226    #[inline(always)]
2227    fn as_array_f64x2(self, a: f64x2<Self>) -> [f64; 2usize] {
2228        unsafe { core::mem::transmute::<__m128d, [f64; 2usize]>(a.val.0) }
2229    }
2230    #[inline(always)]
2231    fn as_array_ref_f64x2(self, a: &f64x2<Self>) -> &[f64; 2usize] {
2232        unsafe { core::mem::transmute::<&__m128d, &[f64; 2usize]>(&a.val.0) }
2233    }
2234    #[inline(always)]
2235    fn as_array_mut_f64x2(self, a: &mut f64x2<Self>) -> &mut [f64; 2usize] {
2236        unsafe { core::mem::transmute::<&mut __m128d, &mut [f64; 2usize]>(&mut a.val.0) }
2237    }
2238    #[inline(always)]
2239    fn store_array_f64x2(self, a: f64x2<Self>, dest: &mut [f64; 2usize]) -> () {
2240        unsafe {
2241            core::ptr::copy_nonoverlapping(
2242                (&raw const a.val.0) as *const f64,
2243                dest.as_mut_ptr(),
2244                2usize,
2245            );
2246        }
2247    }
2248    #[inline(always)]
2249    fn cvt_from_bytes_f64x2(self, a: u8x16<Self>) -> f64x2<Self> {
2250        unsafe {
2251            f64x2 {
2252                val: core::mem::transmute(a.val),
2253                simd: self,
2254            }
2255        }
2256    }
2257    #[inline(always)]
2258    fn cvt_to_bytes_f64x2(self, a: f64x2<Self>) -> u8x16<Self> {
2259        unsafe {
2260            u8x16 {
2261                val: core::mem::transmute(a.val),
2262                simd: self,
2263            }
2264        }
2265    }
2266    #[inline(always)]
2267    fn slide_f64x2<const SHIFT: usize>(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2268        unsafe {
2269            if SHIFT >= 2usize {
2270                return b;
2271            }
2272            let result = dyn_alignr_128(
2273                self.cvt_to_bytes_f64x2(b).val.0,
2274                self.cvt_to_bytes_f64x2(a).val.0,
2275                SHIFT * 8usize,
2276            );
2277            self.cvt_from_bytes_f64x2(u8x16 {
2278                val: crate::support::Aligned128(result),
2279                simd: self,
2280            })
2281        }
2282    }
2283    #[inline(always)]
2284    fn slide_within_blocks_f64x2<const SHIFT: usize>(
2285        self,
2286        a: f64x2<Self>,
2287        b: f64x2<Self>,
2288    ) -> f64x2<Self> {
2289        self.slide_f64x2::<SHIFT>(a, b)
2290    }
2291    #[inline(always)]
2292    fn abs_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2293        unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) }
2294    }
2295    #[inline(always)]
2296    fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2297        unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) }
2298    }
2299    #[inline(always)]
2300    fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2301        unsafe { _mm_sqrt_pd(a.into()).simd_into(self) }
2302    }
2303    #[inline(always)]
2304    fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2305        unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) }
2306    }
2307    #[inline(always)]
2308    fn sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2309        unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) }
2310    }
2311    #[inline(always)]
2312    fn mul_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2313        unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) }
2314    }
2315    #[inline(always)]
2316    fn div_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2317        unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) }
2318    }
2319    #[inline(always)]
2320    fn copysign_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2321        unsafe {
2322            let mask = _mm_set1_pd(-0.0);
2323            _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self)
2324        }
2325    }
2326    #[inline(always)]
2327    fn simd_eq_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2328        unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) }
2329    }
2330    #[inline(always)]
2331    fn simd_lt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2332        unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) }
2333    }
2334    #[inline(always)]
2335    fn simd_le_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2336        unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) }
2337    }
2338    #[inline(always)]
2339    fn simd_ge_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2340        unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) }
2341    }
2342    #[inline(always)]
2343    fn simd_gt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2344        unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) }
2345    }
2346    #[inline(always)]
2347    fn zip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2348        unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) }
2349    }
2350    #[inline(always)]
2351    fn zip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2352        unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) }
2353    }
2354    #[inline(always)]
2355    fn unzip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2356        unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) }
2357    }
2358    #[inline(always)]
2359    fn unzip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2360        unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) }
2361    }
2362    #[inline(always)]
2363    fn interleave_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> (f64x2<Self>, f64x2<Self>) {
2364        (self.zip_low_f64x2(a, b), self.zip_high_f64x2(a, b))
2365    }
2366    #[inline(always)]
2367    fn deinterleave_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> (f64x2<Self>, f64x2<Self>) {
2368        (self.unzip_low_f64x2(a, b), self.unzip_high_f64x2(a, b))
2369    }
2370    #[inline(always)]
2371    fn max_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2372        unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) }
2373    }
2374    #[inline(always)]
2375    fn min_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2376        unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) }
2377    }
2378    #[inline(always)]
2379    fn max_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2380        unsafe {
2381            let intermediate = _mm_max_pd(a.into(), b.into());
2382            let b_is_nan = _mm_cmpunord_pd(b.into(), b.into());
2383            _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self)
2384        }
2385    }
2386    #[inline(always)]
2387    fn min_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2388        unsafe {
2389            let intermediate = _mm_min_pd(a.into(), b.into());
2390            let b_is_nan = _mm_cmpunord_pd(b.into(), b.into());
2391            _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self)
2392        }
2393    }
2394    #[inline(always)]
2395    fn mul_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
2396        a * b + c
2397    }
2398    #[inline(always)]
2399    fn mul_sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
2400        a * b - c
2401    }
2402    #[inline(always)]
2403    fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2404        unsafe {
2405            _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
2406        }
2407    }
2408    #[inline(always)]
2409    fn ceil_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2410        unsafe {
2411            _mm_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
2412        }
2413    }
2414    #[inline(always)]
2415    fn round_ties_even_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2416        unsafe {
2417            _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
2418                .simd_into(self)
2419        }
2420    }
2421    #[inline(always)]
2422    fn fract_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2423        a - self.trunc_f64x2(a)
2424    }
2425    #[inline(always)]
2426    fn trunc_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2427        unsafe {
2428            _mm_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
2429        }
2430    }
2431    #[inline(always)]
2432    fn select_f64x2(self, a: mask64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
2433        unsafe { _mm_blendv_pd(c.into(), b.into(), _mm_castsi128_pd(a.into())).simd_into(self) }
2434    }
2435    #[inline(always)]
2436    fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self> {
2437        f64x4 {
2438            val: crate::support::Aligned256([a.val.0, b.val.0]),
2439            simd: self,
2440        }
2441    }
2442    #[inline(always)]
2443    fn reinterpret_f32_f64x2(self, a: f64x2<Self>) -> f32x4<Self> {
2444        unsafe { _mm_castpd_ps(a.into()).simd_into(self) }
2445    }
2446    #[inline(always)]
2447    fn splat_mask64x2(self, val: i64) -> mask64x2<Self> {
2448        unsafe { _mm_set1_epi64x(val).simd_into(self) }
2449    }
2450    #[inline(always)]
2451    fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
2452        mask64x2 {
2453            val: unsafe { core::mem::transmute_copy(&val) },
2454            simd: self,
2455        }
2456    }
2457    #[inline(always)]
2458    fn load_array_ref_mask64x2(self, val: &[i64; 2usize]) -> mask64x2<Self> {
2459        mask64x2 {
2460            val: unsafe { core::mem::transmute_copy(val) },
2461            simd: self,
2462        }
2463    }
2464    #[inline(always)]
2465    fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
2466        unsafe { core::mem::transmute::<__m128i, [i64; 2usize]>(a.val.0) }
2467    }
2468    #[inline(always)]
2469    fn as_array_ref_mask64x2(self, a: &mask64x2<Self>) -> &[i64; 2usize] {
2470        unsafe { core::mem::transmute::<&__m128i, &[i64; 2usize]>(&a.val.0) }
2471    }
2472    #[inline(always)]
2473    fn as_array_mut_mask64x2(self, a: &mut mask64x2<Self>) -> &mut [i64; 2usize] {
2474        unsafe { core::mem::transmute::<&mut __m128i, &mut [i64; 2usize]>(&mut a.val.0) }
2475    }
2476    #[inline(always)]
2477    fn store_array_mask64x2(self, a: mask64x2<Self>, dest: &mut [i64; 2usize]) -> () {
2478        unsafe {
2479            core::ptr::copy_nonoverlapping(
2480                (&raw const a.val.0) as *const i64,
2481                dest.as_mut_ptr(),
2482                2usize,
2483            );
2484        }
2485    }
2486    #[inline(always)]
2487    fn cvt_from_bytes_mask64x2(self, a: u8x16<Self>) -> mask64x2<Self> {
2488        unsafe {
2489            mask64x2 {
2490                val: core::mem::transmute(a.val),
2491                simd: self,
2492            }
2493        }
2494    }
2495    #[inline(always)]
2496    fn cvt_to_bytes_mask64x2(self, a: mask64x2<Self>) -> u8x16<Self> {
2497        unsafe {
2498            u8x16 {
2499                val: core::mem::transmute(a.val),
2500                simd: self,
2501            }
2502        }
2503    }
2504    #[inline(always)]
2505    fn slide_mask64x2<const SHIFT: usize>(
2506        self,
2507        a: mask64x2<Self>,
2508        b: mask64x2<Self>,
2509    ) -> mask64x2<Self> {
2510        unsafe {
2511            if SHIFT >= 2usize {
2512                return b;
2513            }
2514            let result = dyn_alignr_128(
2515                self.cvt_to_bytes_mask64x2(b).val.0,
2516                self.cvt_to_bytes_mask64x2(a).val.0,
2517                SHIFT * 8usize,
2518            );
2519            self.cvt_from_bytes_mask64x2(u8x16 {
2520                val: crate::support::Aligned128(result),
2521                simd: self,
2522            })
2523        }
2524    }
2525    #[inline(always)]
2526    fn slide_within_blocks_mask64x2<const SHIFT: usize>(
2527        self,
2528        a: mask64x2<Self>,
2529        b: mask64x2<Self>,
2530    ) -> mask64x2<Self> {
2531        self.slide_mask64x2::<SHIFT>(a, b)
2532    }
2533    #[inline(always)]
2534    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2535        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
2536    }
2537    #[inline(always)]
2538    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2539        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
2540    }
2541    #[inline(always)]
2542    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2543        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
2544    }
    /// Bitwise NOT of a mask, expressed as XOR with an all-ones scalar.
    #[inline(always)]
    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
        // `!0` is the all-ones integer; XOR with it flips every bit.
        a ^ !0
    }
2549    #[inline(always)]
2550    fn select_mask64x2(
2551        self,
2552        a: mask64x2<Self>,
2553        b: mask64x2<Self>,
2554        c: mask64x2<Self>,
2555    ) -> mask64x2<Self> {
2556        unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
2557    }
2558    #[inline(always)]
2559    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2560        unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) }
2561    }
2562    #[inline(always)]
2563    fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
2564        unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 }
2565    }
2566    #[inline(always)]
2567    fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
2568        unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 }
2569    }
2570    #[inline(always)]
2571    fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
2572        unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 }
2573    }
2574    #[inline(always)]
2575    fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
2576        unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 }
2577    }
2578    #[inline(always)]
2579    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
2580        mask64x4 {
2581            val: crate::support::Aligned256([a.val.0, b.val.0]),
2582            simd: self,
2583        }
2584    }
2585    #[inline(always)]
2586    fn splat_f32x8(self, val: f32) -> f32x8<Self> {
2587        let half = self.splat_f32x4(val);
2588        self.combine_f32x4(half, half)
2589    }
2590    #[inline(always)]
2591    fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
2592        f32x8 {
2593            val: unsafe { core::mem::transmute_copy(&val) },
2594            simd: self,
2595        }
2596    }
2597    #[inline(always)]
2598    fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
2599        f32x8 {
2600            val: unsafe { core::mem::transmute_copy(val) },
2601            simd: self,
2602        }
2603    }
2604    #[inline(always)]
2605    fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
2606        unsafe { core::mem::transmute::<[__m128; 2usize], [f32; 8usize]>(a.val.0) }
2607    }
2608    #[inline(always)]
2609    fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
2610        unsafe { core::mem::transmute::<&[__m128; 2usize], &[f32; 8usize]>(&a.val.0) }
2611    }
2612    #[inline(always)]
2613    fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
2614        unsafe { core::mem::transmute::<&mut [__m128; 2usize], &mut [f32; 8usize]>(&mut a.val.0) }
2615    }
2616    #[inline(always)]
2617    fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
2618        unsafe {
2619            core::ptr::copy_nonoverlapping(
2620                (&raw const a.val.0) as *const f32,
2621                dest.as_mut_ptr(),
2622                8usize,
2623            );
2624        }
2625    }
2626    #[inline(always)]
2627    fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
2628        unsafe {
2629            f32x8 {
2630                val: core::mem::transmute(a.val),
2631                simd: self,
2632            }
2633        }
2634    }
2635    #[inline(always)]
2636    fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
2637        unsafe {
2638            u8x32 {
2639                val: core::mem::transmute(a.val),
2640                simd: self,
2641            }
2642        }
2643    }
2644    #[inline(always)]
2645    fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2646        unsafe {
2647            if SHIFT >= 8usize {
2648                return b;
2649            }
2650            let result = cross_block_alignr_128x2(
2651                self.cvt_to_bytes_f32x8(b).val.0,
2652                self.cvt_to_bytes_f32x8(a).val.0,
2653                SHIFT * 4usize,
2654            );
2655            self.cvt_from_bytes_f32x8(u8x32 {
2656                val: crate::support::Aligned256(result),
2657                simd: self,
2658            })
2659        }
2660    }
2661    #[inline(always)]
2662    fn slide_within_blocks_f32x8<const SHIFT: usize>(
2663        self,
2664        a: f32x8<Self>,
2665        b: f32x8<Self>,
2666    ) -> f32x8<Self> {
2667        let (a0, a1) = self.split_f32x8(a);
2668        let (b0, b1) = self.split_f32x8(b);
2669        self.combine_f32x4(
2670            self.slide_within_blocks_f32x4::<SHIFT>(a0, b0),
2671            self.slide_within_blocks_f32x4::<SHIFT>(a1, b1),
2672        )
2673    }
2674    #[inline(always)]
2675    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2676        let (a0, a1) = self.split_f32x8(a);
2677        self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1))
2678    }
2679    #[inline(always)]
2680    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2681        let (a0, a1) = self.split_f32x8(a);
2682        self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1))
2683    }
2684    #[inline(always)]
2685    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2686        let (a0, a1) = self.split_f32x8(a);
2687        self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
2688    }
2689    #[inline(always)]
2690    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2691        let (a0, a1) = self.split_f32x8(a);
2692        let (b0, b1) = self.split_f32x8(b);
2693        self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1))
2694    }
2695    #[inline(always)]
2696    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2697        let (a0, a1) = self.split_f32x8(a);
2698        let (b0, b1) = self.split_f32x8(b);
2699        self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1))
2700    }
2701    #[inline(always)]
2702    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2703        let (a0, a1) = self.split_f32x8(a);
2704        let (b0, b1) = self.split_f32x8(b);
2705        self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1))
2706    }
2707    #[inline(always)]
2708    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2709        let (a0, a1) = self.split_f32x8(a);
2710        let (b0, b1) = self.split_f32x8(b);
2711        self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1))
2712    }
2713    #[inline(always)]
2714    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2715        let (a0, a1) = self.split_f32x8(a);
2716        let (b0, b1) = self.split_f32x8(b);
2717        self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1))
2718    }
2719    #[inline(always)]
2720    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2721        let (a0, a1) = self.split_f32x8(a);
2722        let (b0, b1) = self.split_f32x8(b);
2723        self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1))
2724    }
2725    #[inline(always)]
2726    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2727        let (a0, a1) = self.split_f32x8(a);
2728        let (b0, b1) = self.split_f32x8(b);
2729        self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1))
2730    }
2731    #[inline(always)]
2732    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2733        let (a0, a1) = self.split_f32x8(a);
2734        let (b0, b1) = self.split_f32x8(b);
2735        self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1))
2736    }
2737    #[inline(always)]
2738    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2739        let (a0, a1) = self.split_f32x8(a);
2740        let (b0, b1) = self.split_f32x8(b);
2741        self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1))
2742    }
2743    #[inline(always)]
2744    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2745        let (a0, a1) = self.split_f32x8(a);
2746        let (b0, b1) = self.split_f32x8(b);
2747        self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1))
2748    }
2749    #[inline(always)]
2750    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2751        let (a0, _) = self.split_f32x8(a);
2752        let (b0, _) = self.split_f32x8(b);
2753        self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0))
2754    }
2755    #[inline(always)]
2756    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2757        let (_, a1) = self.split_f32x8(a);
2758        let (_, b1) = self.split_f32x8(b);
2759        self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1))
2760    }
2761    #[inline(always)]
2762    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2763        let (a0, a1) = self.split_f32x8(a);
2764        let (b0, b1) = self.split_f32x8(b);
2765        self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1))
2766    }
2767    #[inline(always)]
2768    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2769        let (a0, a1) = self.split_f32x8(a);
2770        let (b0, b1) = self.split_f32x8(b);
2771        self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1))
2772    }
2773    #[inline(always)]
2774    fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
2775        let (a0, a1) = self.split_f32x8(a);
2776        let (b0, b1) = self.split_f32x8(b);
2777        let lo_lo = self.zip_low_f32x4(a0, b0);
2778        let lo_hi = self.zip_high_f32x4(a0, b0);
2779        let hi_lo = self.zip_low_f32x4(a1, b1);
2780        let hi_hi = self.zip_high_f32x4(a1, b1);
2781        (
2782            self.combine_f32x4(lo_lo, lo_hi),
2783            self.combine_f32x4(hi_lo, hi_hi),
2784        )
2785    }
2786    #[inline(always)]
2787    fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
2788        let (a0, a1) = self.split_f32x8(a);
2789        let (b0, b1) = self.split_f32x8(b);
2790        let lo_even = self.unzip_low_f32x4(a0, a1);
2791        let lo_odd = self.unzip_high_f32x4(a0, a1);
2792        let hi_even = self.unzip_low_f32x4(b0, b1);
2793        let hi_odd = self.unzip_high_f32x4(b0, b1);
2794        (
2795            self.combine_f32x4(lo_even, hi_even),
2796            self.combine_f32x4(lo_odd, hi_odd),
2797        )
2798    }
2799    #[inline(always)]
2800    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2801        let (a0, a1) = self.split_f32x8(a);
2802        let (b0, b1) = self.split_f32x8(b);
2803        self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1))
2804    }
2805    #[inline(always)]
2806    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2807        let (a0, a1) = self.split_f32x8(a);
2808        let (b0, b1) = self.split_f32x8(b);
2809        self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1))
2810    }
2811    #[inline(always)]
2812    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2813        let (a0, a1) = self.split_f32x8(a);
2814        let (b0, b1) = self.split_f32x8(b);
2815        self.combine_f32x4(
2816            self.max_precise_f32x4(a0, b0),
2817            self.max_precise_f32x4(a1, b1),
2818        )
2819    }
2820    #[inline(always)]
2821    fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2822        let (a0, a1) = self.split_f32x8(a);
2823        let (b0, b1) = self.split_f32x8(b);
2824        self.combine_f32x4(
2825            self.min_precise_f32x4(a0, b0),
2826            self.min_precise_f32x4(a1, b1),
2827        )
2828    }
2829    #[inline(always)]
2830    fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
2831        let (a0, a1) = self.split_f32x8(a);
2832        let (b0, b1) = self.split_f32x8(b);
2833        let (c0, c1) = self.split_f32x8(c);
2834        self.combine_f32x4(
2835            self.mul_add_f32x4(a0, b0, c0),
2836            self.mul_add_f32x4(a1, b1, c1),
2837        )
2838    }
2839    #[inline(always)]
2840    fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
2841        let (a0, a1) = self.split_f32x8(a);
2842        let (b0, b1) = self.split_f32x8(b);
2843        let (c0, c1) = self.split_f32x8(c);
2844        self.combine_f32x4(
2845            self.mul_sub_f32x4(a0, b0, c0),
2846            self.mul_sub_f32x4(a1, b1, c1),
2847        )
2848    }
2849    #[inline(always)]
2850    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2851        let (a0, a1) = self.split_f32x8(a);
2852        self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
2853    }
2854    #[inline(always)]
2855    fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2856        let (a0, a1) = self.split_f32x8(a);
2857        self.combine_f32x4(self.ceil_f32x4(a0), self.ceil_f32x4(a1))
2858    }
2859    #[inline(always)]
2860    fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2861        let (a0, a1) = self.split_f32x8(a);
2862        self.combine_f32x4(
2863            self.round_ties_even_f32x4(a0),
2864            self.round_ties_even_f32x4(a1),
2865        )
2866    }
2867    #[inline(always)]
2868    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2869        let (a0, a1) = self.split_f32x8(a);
2870        self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1))
2871    }
2872    #[inline(always)]
2873    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2874        let (a0, a1) = self.split_f32x8(a);
2875        self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1))
2876    }
2877    #[inline(always)]
2878    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
2879        let (a0, a1) = self.split_mask32x8(a);
2880        let (b0, b1) = self.split_f32x8(b);
2881        let (c0, c1) = self.split_f32x8(c);
2882        self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1))
2883    }
2884    #[inline(always)]
2885    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
2886        f32x16 {
2887            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
2888            simd: self,
2889        }
2890    }
2891    #[inline(always)]
2892    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
2893        (
2894            f32x4 {
2895                val: crate::support::Aligned128(a.val.0[0]),
2896                simd: self,
2897            },
2898            f32x4 {
2899                val: crate::support::Aligned128(a.val.0[1]),
2900                simd: self,
2901            },
2902        )
2903    }
2904    #[inline(always)]
2905    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
2906        let (a0, a1) = self.split_f32x8(a);
2907        self.combine_f64x2(
2908            self.reinterpret_f64_f32x4(a0),
2909            self.reinterpret_f64_f32x4(a1),
2910        )
2911    }
2912    #[inline(always)]
2913    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
2914        let (a0, a1) = self.split_f32x8(a);
2915        self.combine_i32x4(
2916            self.reinterpret_i32_f32x4(a0),
2917            self.reinterpret_i32_f32x4(a1),
2918        )
2919    }
2920    #[inline(always)]
2921    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
2922        let (a0, a1) = self.split_f32x8(a);
2923        self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1))
2924    }
2925    #[inline(always)]
2926    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
2927        let (a0, a1) = self.split_f32x8(a);
2928        self.combine_u32x4(
2929            self.reinterpret_u32_f32x4(a0),
2930            self.reinterpret_u32_f32x4(a1),
2931        )
2932    }
2933    #[inline(always)]
2934    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
2935        let (a0, a1) = self.split_f32x8(a);
2936        self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1))
2937    }
2938    #[inline(always)]
2939    fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
2940        let (a0, a1) = self.split_f32x8(a);
2941        self.combine_u32x4(
2942            self.cvt_u32_precise_f32x4(a0),
2943            self.cvt_u32_precise_f32x4(a1),
2944        )
2945    }
2946    #[inline(always)]
2947    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
2948        let (a0, a1) = self.split_f32x8(a);
2949        self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1))
2950    }
2951    #[inline(always)]
2952    fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
2953        let (a0, a1) = self.split_f32x8(a);
2954        self.combine_i32x4(
2955            self.cvt_i32_precise_f32x4(a0),
2956            self.cvt_i32_precise_f32x4(a1),
2957        )
2958    }
2959    #[inline(always)]
2960    fn splat_i8x32(self, val: i8) -> i8x32<Self> {
2961        let half = self.splat_i8x16(val);
2962        self.combine_i8x16(half, half)
2963    }
2964    #[inline(always)]
2965    fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
2966        i8x32 {
2967            val: unsafe { core::mem::transmute_copy(&val) },
2968            simd: self,
2969        }
2970    }
2971    #[inline(always)]
2972    fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
2973        i8x32 {
2974            val: unsafe { core::mem::transmute_copy(val) },
2975            simd: self,
2976        }
2977    }
2978    #[inline(always)]
2979    fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
2980        unsafe { core::mem::transmute::<[__m128i; 2usize], [i8; 32usize]>(a.val.0) }
2981    }
2982    #[inline(always)]
2983    fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
2984        unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i8; 32usize]>(&a.val.0) }
2985    }
2986    #[inline(always)]
2987    fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
2988        unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i8; 32usize]>(&mut a.val.0) }
2989    }
2990    #[inline(always)]
2991    fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
2992        unsafe {
2993            core::ptr::copy_nonoverlapping(
2994                (&raw const a.val.0) as *const i8,
2995                dest.as_mut_ptr(),
2996                32usize,
2997            );
2998        }
2999    }
3000    #[inline(always)]
3001    fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
3002        unsafe {
3003            i8x32 {
3004                val: core::mem::transmute(a.val),
3005                simd: self,
3006            }
3007        }
3008    }
3009    #[inline(always)]
3010    fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
3011        unsafe {
3012            u8x32 {
3013                val: core::mem::transmute(a.val),
3014                simd: self,
3015            }
3016        }
3017    }
3018    #[inline(always)]
3019    fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3020        unsafe {
3021            if SHIFT >= 32usize {
3022                return b;
3023            }
3024            let result = cross_block_alignr_128x2(
3025                self.cvt_to_bytes_i8x32(b).val.0,
3026                self.cvt_to_bytes_i8x32(a).val.0,
3027                SHIFT,
3028            );
3029            self.cvt_from_bytes_i8x32(u8x32 {
3030                val: crate::support::Aligned256(result),
3031                simd: self,
3032            })
3033        }
3034    }
3035    #[inline(always)]
3036    fn slide_within_blocks_i8x32<const SHIFT: usize>(
3037        self,
3038        a: i8x32<Self>,
3039        b: i8x32<Self>,
3040    ) -> i8x32<Self> {
3041        let (a0, a1) = self.split_i8x32(a);
3042        let (b0, b1) = self.split_i8x32(b);
3043        self.combine_i8x16(
3044            self.slide_within_blocks_i8x16::<SHIFT>(a0, b0),
3045            self.slide_within_blocks_i8x16::<SHIFT>(a1, b1),
3046        )
3047    }
3048    #[inline(always)]
3049    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3050        let (a0, a1) = self.split_i8x32(a);
3051        let (b0, b1) = self.split_i8x32(b);
3052        self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1))
3053    }
3054    #[inline(always)]
3055    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3056        let (a0, a1) = self.split_i8x32(a);
3057        let (b0, b1) = self.split_i8x32(b);
3058        self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1))
3059    }
3060    #[inline(always)]
3061    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3062        let (a0, a1) = self.split_i8x32(a);
3063        let (b0, b1) = self.split_i8x32(b);
3064        self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1))
3065    }
3066    #[inline(always)]
3067    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3068        let (a0, a1) = self.split_i8x32(a);
3069        let (b0, b1) = self.split_i8x32(b);
3070        self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1))
3071    }
3072    #[inline(always)]
3073    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3074        let (a0, a1) = self.split_i8x32(a);
3075        let (b0, b1) = self.split_i8x32(b);
3076        self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1))
3077    }
3078    #[inline(always)]
3079    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3080        let (a0, a1) = self.split_i8x32(a);
3081        let (b0, b1) = self.split_i8x32(b);
3082        self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1))
3083    }
3084    #[inline(always)]
3085    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
3086        let (a0, a1) = self.split_i8x32(a);
3087        self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1))
3088    }
3089    #[inline(always)]
3090    fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
3091        let (a0, a1) = self.split_i8x32(a);
3092        self.combine_i8x16(self.shl_i8x16(a0, shift), self.shl_i8x16(a1, shift))
3093    }
3094    #[inline(always)]
3095    fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3096        let (a0, a1) = self.split_i8x32(a);
3097        let (b0, b1) = self.split_i8x32(b);
3098        self.combine_i8x16(self.shlv_i8x16(a0, b0), self.shlv_i8x16(a1, b1))
3099    }
3100    #[inline(always)]
3101    fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
3102        let (a0, a1) = self.split_i8x32(a);
3103        self.combine_i8x16(self.shr_i8x16(a0, shift), self.shr_i8x16(a1, shift))
3104    }
3105    #[inline(always)]
3106    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3107        let (a0, a1) = self.split_i8x32(a);
3108        let (b0, b1) = self.split_i8x32(b);
3109        self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1))
3110    }
3111    #[inline(always)]
3112    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3113        let (a0, a1) = self.split_i8x32(a);
3114        let (b0, b1) = self.split_i8x32(b);
3115        self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1))
3116    }
3117    #[inline(always)]
3118    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3119        let (a0, a1) = self.split_i8x32(a);
3120        let (b0, b1) = self.split_i8x32(b);
3121        self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1))
3122    }
3123    #[inline(always)]
3124    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3125        let (a0, a1) = self.split_i8x32(a);
3126        let (b0, b1) = self.split_i8x32(b);
3127        self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1))
3128    }
3129    #[inline(always)]
3130    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3131        let (a0, a1) = self.split_i8x32(a);
3132        let (b0, b1) = self.split_i8x32(b);
3133        self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1))
3134    }
3135    #[inline(always)]
3136    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3137        let (a0, a1) = self.split_i8x32(a);
3138        let (b0, b1) = self.split_i8x32(b);
3139        self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1))
3140    }
3141    #[inline(always)]
3142    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3143        let (a0, _) = self.split_i8x32(a);
3144        let (b0, _) = self.split_i8x32(b);
3145        self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0))
3146    }
3147    #[inline(always)]
3148    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3149        let (_, a1) = self.split_i8x32(a);
3150        let (_, b1) = self.split_i8x32(b);
3151        self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1))
3152    }
3153    #[inline(always)]
3154    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3155        let (a0, a1) = self.split_i8x32(a);
3156        let (b0, b1) = self.split_i8x32(b);
3157        self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1))
3158    }
3159    #[inline(always)]
3160    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3161        let (a0, a1) = self.split_i8x32(a);
3162        let (b0, b1) = self.split_i8x32(b);
3163        self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1))
3164    }
3165    #[inline(always)]
3166    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
3167        let (a0, a1) = self.split_i8x32(a);
3168        let (b0, b1) = self.split_i8x32(b);
3169        let lo_lo = self.zip_low_i8x16(a0, b0);
3170        let lo_hi = self.zip_high_i8x16(a0, b0);
3171        let hi_lo = self.zip_low_i8x16(a1, b1);
3172        let hi_hi = self.zip_high_i8x16(a1, b1);
3173        (
3174            self.combine_i8x16(lo_lo, lo_hi),
3175            self.combine_i8x16(hi_lo, hi_hi),
3176        )
3177    }
3178    #[inline(always)]
3179    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
3180        let (a0, a1) = self.split_i8x32(a);
3181        let (b0, b1) = self.split_i8x32(b);
3182        let lo_even = self.unzip_low_i8x16(a0, a1);
3183        let lo_odd = self.unzip_high_i8x16(a0, a1);
3184        let hi_even = self.unzip_low_i8x16(b0, b1);
3185        let hi_odd = self.unzip_high_i8x16(b0, b1);
3186        (
3187            self.combine_i8x16(lo_even, hi_even),
3188            self.combine_i8x16(lo_odd, hi_odd),
3189        )
3190    }
3191    #[inline(always)]
3192    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
3193        let (a0, a1) = self.split_mask8x32(a);
3194        let (b0, b1) = self.split_i8x32(b);
3195        let (c0, c1) = self.split_i8x32(c);
3196        self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1))
3197    }
3198    #[inline(always)]
3199    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3200        let (a0, a1) = self.split_i8x32(a);
3201        let (b0, b1) = self.split_i8x32(b);
3202        self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1))
3203    }
3204    #[inline(always)]
3205    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3206        let (a0, a1) = self.split_i8x32(a);
3207        let (b0, b1) = self.split_i8x32(b);
3208        self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1))
3209    }
3210    #[inline(always)]
3211    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
3212        i8x64 {
3213            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
3214            simd: self,
3215        }
3216    }
3217    #[inline(always)]
3218    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
3219        (
3220            i8x16 {
3221                val: crate::support::Aligned128(a.val.0[0]),
3222                simd: self,
3223            },
3224            i8x16 {
3225                val: crate::support::Aligned128(a.val.0[1]),
3226                simd: self,
3227            },
3228        )
3229    }
3230    #[inline(always)]
3231    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
3232        let (a0, a1) = self.split_i8x32(a);
3233        self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1))
3234    }
3235    #[inline(always)]
3236    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
3237        let (a0, a1) = self.split_i8x32(a);
3238        self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1))
3239    }
3240    #[inline(always)]
3241    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
3242        let (a0, a1) = self.split_i8x32(a);
3243        self.combine_u32x4(
3244            self.reinterpret_u32_i8x16(a0),
3245            self.reinterpret_u32_i8x16(a1),
3246        )
3247    }
3248    #[inline(always)]
3249    fn splat_u8x32(self, val: u8) -> u8x32<Self> {
3250        let half = self.splat_u8x16(val);
3251        self.combine_u8x16(half, half)
3252    }
3253    #[inline(always)]
3254    fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
3255        u8x32 {
3256            val: unsafe { core::mem::transmute_copy(&val) },
3257            simd: self,
3258        }
3259    }
3260    #[inline(always)]
3261    fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
3262        u8x32 {
3263            val: unsafe { core::mem::transmute_copy(val) },
3264            simd: self,
3265        }
3266    }
3267    #[inline(always)]
3268    fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
3269        unsafe { core::mem::transmute::<[__m128i; 2usize], [u8; 32usize]>(a.val.0) }
3270    }
3271    #[inline(always)]
3272    fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
3273        unsafe { core::mem::transmute::<&[__m128i; 2usize], &[u8; 32usize]>(&a.val.0) }
3274    }
3275    #[inline(always)]
3276    fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
3277        unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [u8; 32usize]>(&mut a.val.0) }
3278    }
3279    #[inline(always)]
3280    fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
3281        unsafe {
3282            core::ptr::copy_nonoverlapping(
3283                (&raw const a.val.0) as *const u8,
3284                dest.as_mut_ptr(),
3285                32usize,
3286            );
3287        }
3288    }
3289    #[inline(always)]
3290    fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
3291        unsafe {
3292            u8x32 {
3293                val: core::mem::transmute(a.val),
3294                simd: self,
3295            }
3296        }
3297    }
3298    #[inline(always)]
3299    fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
3300        unsafe {
3301            u8x32 {
3302                val: core::mem::transmute(a.val),
3303                simd: self,
3304            }
3305        }
3306    }
3307    #[inline(always)]
3308    fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3309        unsafe {
3310            if SHIFT >= 32usize {
3311                return b;
3312            }
3313            let result = cross_block_alignr_128x2(
3314                self.cvt_to_bytes_u8x32(b).val.0,
3315                self.cvt_to_bytes_u8x32(a).val.0,
3316                SHIFT,
3317            );
3318            self.cvt_from_bytes_u8x32(u8x32 {
3319                val: crate::support::Aligned256(result),
3320                simd: self,
3321            })
3322        }
3323    }
3324    #[inline(always)]
3325    fn slide_within_blocks_u8x32<const SHIFT: usize>(
3326        self,
3327        a: u8x32<Self>,
3328        b: u8x32<Self>,
3329    ) -> u8x32<Self> {
3330        let (a0, a1) = self.split_u8x32(a);
3331        let (b0, b1) = self.split_u8x32(b);
3332        self.combine_u8x16(
3333            self.slide_within_blocks_u8x16::<SHIFT>(a0, b0),
3334            self.slide_within_blocks_u8x16::<SHIFT>(a1, b1),
3335        )
3336    }
3337    #[inline(always)]
3338    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3339        let (a0, a1) = self.split_u8x32(a);
3340        let (b0, b1) = self.split_u8x32(b);
3341        self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1))
3342    }
3343    #[inline(always)]
3344    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3345        let (a0, a1) = self.split_u8x32(a);
3346        let (b0, b1) = self.split_u8x32(b);
3347        self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1))
3348    }
3349    #[inline(always)]
3350    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3351        let (a0, a1) = self.split_u8x32(a);
3352        let (b0, b1) = self.split_u8x32(b);
3353        self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1))
3354    }
3355    #[inline(always)]
3356    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3357        let (a0, a1) = self.split_u8x32(a);
3358        let (b0, b1) = self.split_u8x32(b);
3359        self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1))
3360    }
3361    #[inline(always)]
3362    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3363        let (a0, a1) = self.split_u8x32(a);
3364        let (b0, b1) = self.split_u8x32(b);
3365        self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1))
3366    }
3367    #[inline(always)]
3368    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3369        let (a0, a1) = self.split_u8x32(a);
3370        let (b0, b1) = self.split_u8x32(b);
3371        self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1))
3372    }
3373    #[inline(always)]
3374    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
3375        let (a0, a1) = self.split_u8x32(a);
3376        self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1))
3377    }
3378    #[inline(always)]
3379    fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
3380        let (a0, a1) = self.split_u8x32(a);
3381        self.combine_u8x16(self.shl_u8x16(a0, shift), self.shl_u8x16(a1, shift))
3382    }
3383    #[inline(always)]
3384    fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3385        let (a0, a1) = self.split_u8x32(a);
3386        let (b0, b1) = self.split_u8x32(b);
3387        self.combine_u8x16(self.shlv_u8x16(a0, b0), self.shlv_u8x16(a1, b1))
3388    }
3389    #[inline(always)]
3390    fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
3391        let (a0, a1) = self.split_u8x32(a);
3392        self.combine_u8x16(self.shr_u8x16(a0, shift), self.shr_u8x16(a1, shift))
3393    }
3394    #[inline(always)]
3395    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3396        let (a0, a1) = self.split_u8x32(a);
3397        let (b0, b1) = self.split_u8x32(b);
3398        self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1))
3399    }
3400    #[inline(always)]
3401    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3402        let (a0, a1) = self.split_u8x32(a);
3403        let (b0, b1) = self.split_u8x32(b);
3404        self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1))
3405    }
3406    #[inline(always)]
3407    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3408        let (a0, a1) = self.split_u8x32(a);
3409        let (b0, b1) = self.split_u8x32(b);
3410        self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1))
3411    }
3412    #[inline(always)]
3413    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3414        let (a0, a1) = self.split_u8x32(a);
3415        let (b0, b1) = self.split_u8x32(b);
3416        self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1))
3417    }
3418    #[inline(always)]
3419    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3420        let (a0, a1) = self.split_u8x32(a);
3421        let (b0, b1) = self.split_u8x32(b);
3422        self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1))
3423    }
3424    #[inline(always)]
3425    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3426        let (a0, a1) = self.split_u8x32(a);
3427        let (b0, b1) = self.split_u8x32(b);
3428        self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1))
3429    }
3430    #[inline(always)]
3431    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3432        let (a0, _) = self.split_u8x32(a);
3433        let (b0, _) = self.split_u8x32(b);
3434        self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0))
3435    }
3436    #[inline(always)]
3437    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3438        let (_, a1) = self.split_u8x32(a);
3439        let (_, b1) = self.split_u8x32(b);
3440        self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1))
3441    }
3442    #[inline(always)]
3443    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3444        let (a0, a1) = self.split_u8x32(a);
3445        let (b0, b1) = self.split_u8x32(b);
3446        self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1))
3447    }
3448    #[inline(always)]
3449    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3450        let (a0, a1) = self.split_u8x32(a);
3451        let (b0, b1) = self.split_u8x32(b);
3452        self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1))
3453    }
3454    #[inline(always)]
3455    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
3456        let (a0, a1) = self.split_u8x32(a);
3457        let (b0, b1) = self.split_u8x32(b);
3458        let lo_lo = self.zip_low_u8x16(a0, b0);
3459        let lo_hi = self.zip_high_u8x16(a0, b0);
3460        let hi_lo = self.zip_low_u8x16(a1, b1);
3461        let hi_hi = self.zip_high_u8x16(a1, b1);
3462        (
3463            self.combine_u8x16(lo_lo, lo_hi),
3464            self.combine_u8x16(hi_lo, hi_hi),
3465        )
3466    }
3467    #[inline(always)]
3468    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
3469        let (a0, a1) = self.split_u8x32(a);
3470        let (b0, b1) = self.split_u8x32(b);
3471        let lo_even = self.unzip_low_u8x16(a0, a1);
3472        let lo_odd = self.unzip_high_u8x16(a0, a1);
3473        let hi_even = self.unzip_low_u8x16(b0, b1);
3474        let hi_odd = self.unzip_high_u8x16(b0, b1);
3475        (
3476            self.combine_u8x16(lo_even, hi_even),
3477            self.combine_u8x16(lo_odd, hi_odd),
3478        )
3479    }
3480    #[inline(always)]
3481    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
3482        let (a0, a1) = self.split_mask8x32(a);
3483        let (b0, b1) = self.split_u8x32(b);
3484        let (c0, c1) = self.split_u8x32(c);
3485        self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1))
3486    }
3487    #[inline(always)]
3488    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3489        let (a0, a1) = self.split_u8x32(a);
3490        let (b0, b1) = self.split_u8x32(b);
3491        self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1))
3492    }
3493    #[inline(always)]
3494    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3495        let (a0, a1) = self.split_u8x32(a);
3496        let (b0, b1) = self.split_u8x32(b);
3497        self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1))
3498    }
3499    #[inline(always)]
3500    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
3501        u8x64 {
3502            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
3503            simd: self,
3504        }
3505    }
3506    #[inline(always)]
3507    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
3508        (
3509            u8x16 {
3510                val: crate::support::Aligned128(a.val.0[0]),
3511                simd: self,
3512            },
3513            u8x16 {
3514                val: crate::support::Aligned128(a.val.0[1]),
3515                simd: self,
3516            },
3517        )
3518    }
3519    #[inline(always)]
3520    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
3521        let (a0, a1) = self.split_u8x32(a);
3522        self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1))
3523    }
3524    #[inline(always)]
3525    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
3526        let (a0, a1) = self.split_u8x32(a);
3527        self.combine_u32x4(
3528            self.reinterpret_u32_u8x16(a0),
3529            self.reinterpret_u32_u8x16(a1),
3530        )
3531    }
3532    #[inline(always)]
3533    fn splat_mask8x32(self, val: i8) -> mask8x32<Self> {
3534        let half = self.splat_mask8x16(val);
3535        self.combine_mask8x16(half, half)
3536    }
3537    #[inline(always)]
3538    fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
3539        mask8x32 {
3540            val: unsafe { core::mem::transmute_copy(&val) },
3541            simd: self,
3542        }
3543    }
3544    #[inline(always)]
3545    fn load_array_ref_mask8x32(self, val: &[i8; 32usize]) -> mask8x32<Self> {
3546        mask8x32 {
3547            val: unsafe { core::mem::transmute_copy(val) },
3548            simd: self,
3549        }
3550    }
3551    #[inline(always)]
3552    fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
3553        unsafe { core::mem::transmute::<[__m128i; 2usize], [i8; 32usize]>(a.val.0) }
3554    }
3555    #[inline(always)]
3556    fn as_array_ref_mask8x32(self, a: &mask8x32<Self>) -> &[i8; 32usize] {
3557        unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i8; 32usize]>(&a.val.0) }
3558    }
3559    #[inline(always)]
3560    fn as_array_mut_mask8x32(self, a: &mut mask8x32<Self>) -> &mut [i8; 32usize] {
3561        unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i8; 32usize]>(&mut a.val.0) }
3562    }
3563    #[inline(always)]
3564    fn store_array_mask8x32(self, a: mask8x32<Self>, dest: &mut [i8; 32usize]) -> () {
3565        unsafe {
3566            core::ptr::copy_nonoverlapping(
3567                (&raw const a.val.0) as *const i8,
3568                dest.as_mut_ptr(),
3569                32usize,
3570            );
3571        }
3572    }
3573    #[inline(always)]
3574    fn cvt_from_bytes_mask8x32(self, a: u8x32<Self>) -> mask8x32<Self> {
3575        unsafe {
3576            mask8x32 {
3577                val: core::mem::transmute(a.val),
3578                simd: self,
3579            }
3580        }
3581    }
3582    #[inline(always)]
3583    fn cvt_to_bytes_mask8x32(self, a: mask8x32<Self>) -> u8x32<Self> {
3584        unsafe {
3585            u8x32 {
3586                val: core::mem::transmute(a.val),
3587                simd: self,
3588            }
3589        }
3590    }
3591    #[inline(always)]
3592    fn slide_mask8x32<const SHIFT: usize>(
3593        self,
3594        a: mask8x32<Self>,
3595        b: mask8x32<Self>,
3596    ) -> mask8x32<Self> {
3597        unsafe {
3598            if SHIFT >= 32usize {
3599                return b;
3600            }
3601            let result = cross_block_alignr_128x2(
3602                self.cvt_to_bytes_mask8x32(b).val.0,
3603                self.cvt_to_bytes_mask8x32(a).val.0,
3604                SHIFT,
3605            );
3606            self.cvt_from_bytes_mask8x32(u8x32 {
3607                val: crate::support::Aligned256(result),
3608                simd: self,
3609            })
3610        }
3611    }
3612    #[inline(always)]
3613    fn slide_within_blocks_mask8x32<const SHIFT: usize>(
3614        self,
3615        a: mask8x32<Self>,
3616        b: mask8x32<Self>,
3617    ) -> mask8x32<Self> {
3618        let (a0, a1) = self.split_mask8x32(a);
3619        let (b0, b1) = self.split_mask8x32(b);
3620        self.combine_mask8x16(
3621            self.slide_within_blocks_mask8x16::<SHIFT>(a0, b0),
3622            self.slide_within_blocks_mask8x16::<SHIFT>(a1, b1),
3623        )
3624    }
3625    #[inline(always)]
3626    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3627        let (a0, a1) = self.split_mask8x32(a);
3628        let (b0, b1) = self.split_mask8x32(b);
3629        self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1))
3630    }
3631    #[inline(always)]
3632    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3633        let (a0, a1) = self.split_mask8x32(a);
3634        let (b0, b1) = self.split_mask8x32(b);
3635        self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1))
3636    }
3637    #[inline(always)]
3638    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3639        let (a0, a1) = self.split_mask8x32(a);
3640        let (b0, b1) = self.split_mask8x32(b);
3641        self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1))
3642    }
3643    #[inline(always)]
3644    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
3645        let (a0, a1) = self.split_mask8x32(a);
3646        self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1))
3647    }
3648    #[inline(always)]
3649    fn select_mask8x32(
3650        self,
3651        a: mask8x32<Self>,
3652        b: mask8x32<Self>,
3653        c: mask8x32<Self>,
3654    ) -> mask8x32<Self> {
3655        let (a0, a1) = self.split_mask8x32(a);
3656        let (b0, b1) = self.split_mask8x32(b);
3657        let (c0, c1) = self.split_mask8x32(c);
3658        self.combine_mask8x16(
3659            self.select_mask8x16(a0, b0, c0),
3660            self.select_mask8x16(a1, b1, c1),
3661        )
3662    }
3663    #[inline(always)]
3664    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3665        let (a0, a1) = self.split_mask8x32(a);
3666        let (b0, b1) = self.split_mask8x32(b);
3667        self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1))
3668    }
3669    #[inline(always)]
3670    fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
3671        let (a0, a1) = self.split_mask8x32(a);
3672        self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1)
3673    }
3674    #[inline(always)]
3675    fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
3676        let (a0, a1) = self.split_mask8x32(a);
3677        self.all_true_mask8x16(a0) && self.all_true_mask8x16(a1)
3678    }
3679    #[inline(always)]
3680    fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
3681        let (a0, a1) = self.split_mask8x32(a);
3682        self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1)
3683    }
3684    #[inline(always)]
3685    fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
3686        let (a0, a1) = self.split_mask8x32(a);
3687        self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1)
3688    }
3689    #[inline(always)]
3690    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
3691        mask8x64 {
3692            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
3693            simd: self,
3694        }
3695    }
3696    #[inline(always)]
3697    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
3698        (
3699            mask8x16 {
3700                val: crate::support::Aligned128(a.val.0[0]),
3701                simd: self,
3702            },
3703            mask8x16 {
3704                val: crate::support::Aligned128(a.val.0[1]),
3705                simd: self,
3706            },
3707        )
3708    }
3709    #[inline(always)]
3710    fn splat_i16x16(self, val: i16) -> i16x16<Self> {
3711        let half = self.splat_i16x8(val);
3712        self.combine_i16x8(half, half)
3713    }
3714    #[inline(always)]
3715    fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
3716        i16x16 {
3717            val: unsafe { core::mem::transmute_copy(&val) },
3718            simd: self,
3719        }
3720    }
3721    #[inline(always)]
3722    fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
3723        i16x16 {
3724            val: unsafe { core::mem::transmute_copy(val) },
3725            simd: self,
3726        }
3727    }
3728    #[inline(always)]
3729    fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
3730        unsafe { core::mem::transmute::<[__m128i; 2usize], [i16; 16usize]>(a.val.0) }
3731    }
3732    #[inline(always)]
3733    fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
3734        unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i16; 16usize]>(&a.val.0) }
3735    }
3736    #[inline(always)]
3737    fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
3738        unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i16; 16usize]>(&mut a.val.0) }
3739    }
3740    #[inline(always)]
3741    fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
3742        unsafe {
3743            core::ptr::copy_nonoverlapping(
3744                (&raw const a.val.0) as *const i16,
3745                dest.as_mut_ptr(),
3746                16usize,
3747            );
3748        }
3749    }
3750    #[inline(always)]
3751    fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
3752        unsafe {
3753            i16x16 {
3754                val: core::mem::transmute(a.val),
3755                simd: self,
3756            }
3757        }
3758    }
3759    #[inline(always)]
3760    fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
3761        unsafe {
3762            u8x32 {
3763                val: core::mem::transmute(a.val),
3764                simd: self,
3765            }
3766        }
3767    }
3768    #[inline(always)]
3769    fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3770        unsafe {
3771            if SHIFT >= 16usize {
3772                return b;
3773            }
3774            let result = cross_block_alignr_128x2(
3775                self.cvt_to_bytes_i16x16(b).val.0,
3776                self.cvt_to_bytes_i16x16(a).val.0,
3777                SHIFT * 2usize,
3778            );
3779            self.cvt_from_bytes_i16x16(u8x32 {
3780                val: crate::support::Aligned256(result),
3781                simd: self,
3782            })
3783        }
3784    }
3785    #[inline(always)]
3786    fn slide_within_blocks_i16x16<const SHIFT: usize>(
3787        self,
3788        a: i16x16<Self>,
3789        b: i16x16<Self>,
3790    ) -> i16x16<Self> {
3791        let (a0, a1) = self.split_i16x16(a);
3792        let (b0, b1) = self.split_i16x16(b);
3793        self.combine_i16x8(
3794            self.slide_within_blocks_i16x8::<SHIFT>(a0, b0),
3795            self.slide_within_blocks_i16x8::<SHIFT>(a1, b1),
3796        )
3797    }
3798    #[inline(always)]
3799    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3800        let (a0, a1) = self.split_i16x16(a);
3801        let (b0, b1) = self.split_i16x16(b);
3802        self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1))
3803    }
3804    #[inline(always)]
3805    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3806        let (a0, a1) = self.split_i16x16(a);
3807        let (b0, b1) = self.split_i16x16(b);
3808        self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1))
3809    }
3810    #[inline(always)]
3811    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3812        let (a0, a1) = self.split_i16x16(a);
3813        let (b0, b1) = self.split_i16x16(b);
3814        self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1))
3815    }
3816    #[inline(always)]
3817    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3818        let (a0, a1) = self.split_i16x16(a);
3819        let (b0, b1) = self.split_i16x16(b);
3820        self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1))
3821    }
3822    #[inline(always)]
3823    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3824        let (a0, a1) = self.split_i16x16(a);
3825        let (b0, b1) = self.split_i16x16(b);
3826        self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1))
3827    }
3828    #[inline(always)]
3829    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3830        let (a0, a1) = self.split_i16x16(a);
3831        let (b0, b1) = self.split_i16x16(b);
3832        self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1))
3833    }
3834    #[inline(always)]
3835    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
3836        let (a0, a1) = self.split_i16x16(a);
3837        self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1))
3838    }
3839    #[inline(always)]
3840    fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
3841        let (a0, a1) = self.split_i16x16(a);
3842        self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift))
3843    }
3844    #[inline(always)]
3845    fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3846        let (a0, a1) = self.split_i16x16(a);
3847        let (b0, b1) = self.split_i16x16(b);
3848        self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1))
3849    }
3850    #[inline(always)]
3851    fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
3852        let (a0, a1) = self.split_i16x16(a);
3853        self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift))
3854    }
3855    #[inline(always)]
3856    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3857        let (a0, a1) = self.split_i16x16(a);
3858        let (b0, b1) = self.split_i16x16(b);
3859        self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1))
3860    }
3861    #[inline(always)]
3862    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3863        let (a0, a1) = self.split_i16x16(a);
3864        let (b0, b1) = self.split_i16x16(b);
3865        self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1))
3866    }
3867    #[inline(always)]
3868    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3869        let (a0, a1) = self.split_i16x16(a);
3870        let (b0, b1) = self.split_i16x16(b);
3871        self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1))
3872    }
3873    #[inline(always)]
3874    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3875        let (a0, a1) = self.split_i16x16(a);
3876        let (b0, b1) = self.split_i16x16(b);
3877        self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1))
3878    }
3879    #[inline(always)]
3880    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3881        let (a0, a1) = self.split_i16x16(a);
3882        let (b0, b1) = self.split_i16x16(b);
3883        self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1))
3884    }
3885    #[inline(always)]
3886    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3887        let (a0, a1) = self.split_i16x16(a);
3888        let (b0, b1) = self.split_i16x16(b);
3889        self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1))
3890    }
3891    #[inline(always)]
3892    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3893        let (a0, _) = self.split_i16x16(a);
3894        let (b0, _) = self.split_i16x16(b);
3895        self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0))
3896    }
3897    #[inline(always)]
3898    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3899        let (_, a1) = self.split_i16x16(a);
3900        let (_, b1) = self.split_i16x16(b);
3901        self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1))
3902    }
3903    #[inline(always)]
3904    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3905        let (a0, a1) = self.split_i16x16(a);
3906        let (b0, b1) = self.split_i16x16(b);
3907        self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1))
3908    }
3909    #[inline(always)]
3910    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3911        let (a0, a1) = self.split_i16x16(a);
3912        let (b0, b1) = self.split_i16x16(b);
3913        self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1))
3914    }
3915    #[inline(always)]
3916    fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
3917        let (a0, a1) = self.split_i16x16(a);
3918        let (b0, b1) = self.split_i16x16(b);
3919        let lo_lo = self.zip_low_i16x8(a0, b0);
3920        let lo_hi = self.zip_high_i16x8(a0, b0);
3921        let hi_lo = self.zip_low_i16x8(a1, b1);
3922        let hi_hi = self.zip_high_i16x8(a1, b1);
3923        (
3924            self.combine_i16x8(lo_lo, lo_hi),
3925            self.combine_i16x8(hi_lo, hi_hi),
3926        )
3927    }
3928    #[inline(always)]
3929    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
3930        let (a0, a1) = self.split_i16x16(a);
3931        let (b0, b1) = self.split_i16x16(b);
3932        let lo_even = self.unzip_low_i16x8(a0, a1);
3933        let lo_odd = self.unzip_high_i16x8(a0, a1);
3934        let hi_even = self.unzip_low_i16x8(b0, b1);
3935        let hi_odd = self.unzip_high_i16x8(b0, b1);
3936        (
3937            self.combine_i16x8(lo_even, hi_even),
3938            self.combine_i16x8(lo_odd, hi_odd),
3939        )
3940    }
3941    #[inline(always)]
3942    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
3943        let (a0, a1) = self.split_mask16x16(a);
3944        let (b0, b1) = self.split_i16x16(b);
3945        let (c0, c1) = self.split_i16x16(c);
3946        self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1))
3947    }
3948    #[inline(always)]
3949    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3950        let (a0, a1) = self.split_i16x16(a);
3951        let (b0, b1) = self.split_i16x16(b);
3952        self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1))
3953    }
3954    #[inline(always)]
3955    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3956        let (a0, a1) = self.split_i16x16(a);
3957        let (b0, b1) = self.split_i16x16(b);
3958        self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1))
3959    }
3960    #[inline(always)]
3961    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
3962        i16x32 {
3963            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
3964            simd: self,
3965        }
3966    }
3967    #[inline(always)]
3968    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
3969        (
3970            i16x8 {
3971                val: crate::support::Aligned128(a.val.0[0]),
3972                simd: self,
3973            },
3974            i16x8 {
3975                val: crate::support::Aligned128(a.val.0[1]),
3976                simd: self,
3977            },
3978        )
3979    }
3980    #[inline(always)]
3981    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
3982        let (a0, a1) = self.split_i16x16(a);
3983        self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1))
3984    }
3985    #[inline(always)]
3986    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
3987        let (a0, a1) = self.split_i16x16(a);
3988        self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1))
3989    }
3990    #[inline(always)]
3991    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
3992        let (a0, a1) = self.split_i16x16(a);
3993        self.combine_u32x4(
3994            self.reinterpret_u32_i16x8(a0),
3995            self.reinterpret_u32_i16x8(a1),
3996        )
3997    }
3998    #[inline(always)]
3999    fn splat_u16x16(self, val: u16) -> u16x16<Self> {
4000        let half = self.splat_u16x8(val);
4001        self.combine_u16x8(half, half)
4002    }
4003    #[inline(always)]
4004    fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
4005        u16x16 {
4006            val: unsafe { core::mem::transmute_copy(&val) },
4007            simd: self,
4008        }
4009    }
4010    #[inline(always)]
4011    fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
4012        u16x16 {
4013            val: unsafe { core::mem::transmute_copy(val) },
4014            simd: self,
4015        }
4016    }
4017    #[inline(always)]
4018    fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
4019        unsafe { core::mem::transmute::<[__m128i; 2usize], [u16; 16usize]>(a.val.0) }
4020    }
4021    #[inline(always)]
4022    fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
4023        unsafe { core::mem::transmute::<&[__m128i; 2usize], &[u16; 16usize]>(&a.val.0) }
4024    }
4025    #[inline(always)]
4026    fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
4027        unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [u16; 16usize]>(&mut a.val.0) }
4028    }
4029    #[inline(always)]
4030    fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
4031        unsafe {
4032            core::ptr::copy_nonoverlapping(
4033                (&raw const a.val.0) as *const u16,
4034                dest.as_mut_ptr(),
4035                16usize,
4036            );
4037        }
4038    }
4039    #[inline(always)]
4040    fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
4041        unsafe {
4042            u16x16 {
4043                val: core::mem::transmute(a.val),
4044                simd: self,
4045            }
4046        }
4047    }
4048    #[inline(always)]
4049    fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
4050        unsafe {
4051            u8x32 {
4052                val: core::mem::transmute(a.val),
4053                simd: self,
4054            }
4055        }
4056    }
4057    #[inline(always)]
4058    fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4059        unsafe {
4060            if SHIFT >= 16usize {
4061                return b;
4062            }
4063            let result = cross_block_alignr_128x2(
4064                self.cvt_to_bytes_u16x16(b).val.0,
4065                self.cvt_to_bytes_u16x16(a).val.0,
4066                SHIFT * 2usize,
4067            );
4068            self.cvt_from_bytes_u16x16(u8x32 {
4069                val: crate::support::Aligned256(result),
4070                simd: self,
4071            })
4072        }
4073    }
4074    #[inline(always)]
4075    fn slide_within_blocks_u16x16<const SHIFT: usize>(
4076        self,
4077        a: u16x16<Self>,
4078        b: u16x16<Self>,
4079    ) -> u16x16<Self> {
4080        let (a0, a1) = self.split_u16x16(a);
4081        let (b0, b1) = self.split_u16x16(b);
4082        self.combine_u16x8(
4083            self.slide_within_blocks_u16x8::<SHIFT>(a0, b0),
4084            self.slide_within_blocks_u16x8::<SHIFT>(a1, b1),
4085        )
4086    }
4087    #[inline(always)]
4088    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4089        let (a0, a1) = self.split_u16x16(a);
4090        let (b0, b1) = self.split_u16x16(b);
4091        self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1))
4092    }
4093    #[inline(always)]
4094    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4095        let (a0, a1) = self.split_u16x16(a);
4096        let (b0, b1) = self.split_u16x16(b);
4097        self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1))
4098    }
4099    #[inline(always)]
4100    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4101        let (a0, a1) = self.split_u16x16(a);
4102        let (b0, b1) = self.split_u16x16(b);
4103        self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1))
4104    }
4105    #[inline(always)]
4106    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4107        let (a0, a1) = self.split_u16x16(a);
4108        let (b0, b1) = self.split_u16x16(b);
4109        self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1))
4110    }
4111    #[inline(always)]
4112    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4113        let (a0, a1) = self.split_u16x16(a);
4114        let (b0, b1) = self.split_u16x16(b);
4115        self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1))
4116    }
4117    #[inline(always)]
4118    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4119        let (a0, a1) = self.split_u16x16(a);
4120        let (b0, b1) = self.split_u16x16(b);
4121        self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1))
4122    }
4123    #[inline(always)]
4124    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
4125        let (a0, a1) = self.split_u16x16(a);
4126        self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1))
4127    }
4128    #[inline(always)]
4129    fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
4130        let (a0, a1) = self.split_u16x16(a);
4131        self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift))
4132    }
4133    #[inline(always)]
4134    fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4135        let (a0, a1) = self.split_u16x16(a);
4136        let (b0, b1) = self.split_u16x16(b);
4137        self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1))
4138    }
4139    #[inline(always)]
4140    fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
4141        let (a0, a1) = self.split_u16x16(a);
4142        self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift))
4143    }
4144    #[inline(always)]
4145    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4146        let (a0, a1) = self.split_u16x16(a);
4147        let (b0, b1) = self.split_u16x16(b);
4148        self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1))
4149    }
4150    #[inline(always)]
4151    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4152        let (a0, a1) = self.split_u16x16(a);
4153        let (b0, b1) = self.split_u16x16(b);
4154        self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1))
4155    }
4156    #[inline(always)]
4157    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4158        let (a0, a1) = self.split_u16x16(a);
4159        let (b0, b1) = self.split_u16x16(b);
4160        self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1))
4161    }
4162    #[inline(always)]
4163    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4164        let (a0, a1) = self.split_u16x16(a);
4165        let (b0, b1) = self.split_u16x16(b);
4166        self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1))
4167    }
4168    #[inline(always)]
4169    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4170        let (a0, a1) = self.split_u16x16(a);
4171        let (b0, b1) = self.split_u16x16(b);
4172        self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1))
4173    }
4174    #[inline(always)]
4175    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4176        let (a0, a1) = self.split_u16x16(a);
4177        let (b0, b1) = self.split_u16x16(b);
4178        self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1))
4179    }
4180    #[inline(always)]
4181    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4182        let (a0, _) = self.split_u16x16(a);
4183        let (b0, _) = self.split_u16x16(b);
4184        self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0))
4185    }
4186    #[inline(always)]
4187    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4188        let (_, a1) = self.split_u16x16(a);
4189        let (_, b1) = self.split_u16x16(b);
4190        self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1))
4191    }
4192    #[inline(always)]
4193    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4194        let (a0, a1) = self.split_u16x16(a);
4195        let (b0, b1) = self.split_u16x16(b);
4196        self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1))
4197    }
4198    #[inline(always)]
4199    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4200        let (a0, a1) = self.split_u16x16(a);
4201        let (b0, b1) = self.split_u16x16(b);
4202        self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1))
4203    }
4204    #[inline(always)]
4205    fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
4206        let (a0, a1) = self.split_u16x16(a);
4207        let (b0, b1) = self.split_u16x16(b);
4208        let lo_lo = self.zip_low_u16x8(a0, b0);
4209        let lo_hi = self.zip_high_u16x8(a0, b0);
4210        let hi_lo = self.zip_low_u16x8(a1, b1);
4211        let hi_hi = self.zip_high_u16x8(a1, b1);
4212        (
4213            self.combine_u16x8(lo_lo, lo_hi),
4214            self.combine_u16x8(hi_lo, hi_hi),
4215        )
4216    }
4217    #[inline(always)]
4218    fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
4219        let (a0, a1) = self.split_u16x16(a);
4220        let (b0, b1) = self.split_u16x16(b);
4221        let lo_even = self.unzip_low_u16x8(a0, a1);
4222        let lo_odd = self.unzip_high_u16x8(a0, a1);
4223        let hi_even = self.unzip_low_u16x8(b0, b1);
4224        let hi_odd = self.unzip_high_u16x8(b0, b1);
4225        (
4226            self.combine_u16x8(lo_even, hi_even),
4227            self.combine_u16x8(lo_odd, hi_odd),
4228        )
4229    }
4230    #[inline(always)]
4231    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
4232        let (a0, a1) = self.split_mask16x16(a);
4233        let (b0, b1) = self.split_u16x16(b);
4234        let (c0, c1) = self.split_u16x16(c);
4235        self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1))
4236    }
4237    #[inline(always)]
4238    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4239        let (a0, a1) = self.split_u16x16(a);
4240        let (b0, b1) = self.split_u16x16(b);
4241        self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1))
4242    }
4243    #[inline(always)]
4244    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4245        let (a0, a1) = self.split_u16x16(a);
4246        let (b0, b1) = self.split_u16x16(b);
4247        self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1))
4248    }
4249    #[inline(always)]
4250    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
4251        u16x32 {
4252            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
4253            simd: self,
4254        }
4255    }
4256    #[inline(always)]
4257    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
4258        (
4259            u16x8 {
4260                val: crate::support::Aligned128(a.val.0[0]),
4261                simd: self,
4262            },
4263            u16x8 {
4264                val: crate::support::Aligned128(a.val.0[1]),
4265                simd: self,
4266            },
4267        )
4268    }
4269    #[inline(always)]
4270    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
4271        let (a, b) = self.split_u16x16(a);
4272        unsafe {
4273            let mask = _mm_set1_epi16(0xFF);
4274            let lo_masked = _mm_and_si128(a.into(), mask);
4275            let hi_masked = _mm_and_si128(b.into(), mask);
4276            let result = _mm_packus_epi16(lo_masked, hi_masked);
4277            result.simd_into(self)
4278        }
4279    }
4280    #[inline(always)]
4281    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
4282        let (a0, a1) = self.split_u16x16(a);
4283        self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1))
4284    }
4285    #[inline(always)]
4286    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
4287        let (a0, a1) = self.split_u16x16(a);
4288        self.combine_u32x4(
4289            self.reinterpret_u32_u16x8(a0),
4290            self.reinterpret_u32_u16x8(a1),
4291        )
4292    }
4293    #[inline(always)]
4294    fn splat_mask16x16(self, val: i16) -> mask16x16<Self> {
4295        let half = self.splat_mask16x8(val);
4296        self.combine_mask16x8(half, half)
4297    }
4298    #[inline(always)]
4299    fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
4300        mask16x16 {
4301            val: unsafe { core::mem::transmute_copy(&val) },
4302            simd: self,
4303        }
4304    }
4305    #[inline(always)]
4306    fn load_array_ref_mask16x16(self, val: &[i16; 16usize]) -> mask16x16<Self> {
4307        mask16x16 {
4308            val: unsafe { core::mem::transmute_copy(val) },
4309            simd: self,
4310        }
4311    }
4312    #[inline(always)]
4313    fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
4314        unsafe { core::mem::transmute::<[__m128i; 2usize], [i16; 16usize]>(a.val.0) }
4315    }
4316    #[inline(always)]
4317    fn as_array_ref_mask16x16(self, a: &mask16x16<Self>) -> &[i16; 16usize] {
4318        unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i16; 16usize]>(&a.val.0) }
4319    }
4320    #[inline(always)]
4321    fn as_array_mut_mask16x16(self, a: &mut mask16x16<Self>) -> &mut [i16; 16usize] {
4322        unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i16; 16usize]>(&mut a.val.0) }
4323    }
4324    #[inline(always)]
4325    fn store_array_mask16x16(self, a: mask16x16<Self>, dest: &mut [i16; 16usize]) -> () {
4326        unsafe {
4327            core::ptr::copy_nonoverlapping(
4328                (&raw const a.val.0) as *const i16,
4329                dest.as_mut_ptr(),
4330                16usize,
4331            );
4332        }
4333    }
4334    #[inline(always)]
4335    fn cvt_from_bytes_mask16x16(self, a: u8x32<Self>) -> mask16x16<Self> {
4336        unsafe {
4337            mask16x16 {
4338                val: core::mem::transmute(a.val),
4339                simd: self,
4340            }
4341        }
4342    }
4343    #[inline(always)]
4344    fn cvt_to_bytes_mask16x16(self, a: mask16x16<Self>) -> u8x32<Self> {
4345        unsafe {
4346            u8x32 {
4347                val: core::mem::transmute(a.val),
4348                simd: self,
4349            }
4350        }
4351    }
4352    #[inline(always)]
4353    fn slide_mask16x16<const SHIFT: usize>(
4354        self,
4355        a: mask16x16<Self>,
4356        b: mask16x16<Self>,
4357    ) -> mask16x16<Self> {
4358        unsafe {
4359            if SHIFT >= 16usize {
4360                return b;
4361            }
4362            let result = cross_block_alignr_128x2(
4363                self.cvt_to_bytes_mask16x16(b).val.0,
4364                self.cvt_to_bytes_mask16x16(a).val.0,
4365                SHIFT * 2usize,
4366            );
4367            self.cvt_from_bytes_mask16x16(u8x32 {
4368                val: crate::support::Aligned256(result),
4369                simd: self,
4370            })
4371        }
4372    }
4373    #[inline(always)]
4374    fn slide_within_blocks_mask16x16<const SHIFT: usize>(
4375        self,
4376        a: mask16x16<Self>,
4377        b: mask16x16<Self>,
4378    ) -> mask16x16<Self> {
4379        let (a0, a1) = self.split_mask16x16(a);
4380        let (b0, b1) = self.split_mask16x16(b);
4381        self.combine_mask16x8(
4382            self.slide_within_blocks_mask16x8::<SHIFT>(a0, b0),
4383            self.slide_within_blocks_mask16x8::<SHIFT>(a1, b1),
4384        )
4385    }
4386    #[inline(always)]
4387    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4388        let (a0, a1) = self.split_mask16x16(a);
4389        let (b0, b1) = self.split_mask16x16(b);
4390        self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1))
4391    }
4392    #[inline(always)]
4393    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4394        let (a0, a1) = self.split_mask16x16(a);
4395        let (b0, b1) = self.split_mask16x16(b);
4396        self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1))
4397    }
4398    #[inline(always)]
4399    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4400        let (a0, a1) = self.split_mask16x16(a);
4401        let (b0, b1) = self.split_mask16x16(b);
4402        self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1))
4403    }
4404    #[inline(always)]
4405    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
4406        let (a0, a1) = self.split_mask16x16(a);
4407        self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1))
4408    }
4409    #[inline(always)]
4410    fn select_mask16x16(
4411        self,
4412        a: mask16x16<Self>,
4413        b: mask16x16<Self>,
4414        c: mask16x16<Self>,
4415    ) -> mask16x16<Self> {
4416        let (a0, a1) = self.split_mask16x16(a);
4417        let (b0, b1) = self.split_mask16x16(b);
4418        let (c0, c1) = self.split_mask16x16(c);
4419        self.combine_mask16x8(
4420            self.select_mask16x8(a0, b0, c0),
4421            self.select_mask16x8(a1, b1, c1),
4422        )
4423    }
4424    #[inline(always)]
4425    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4426        let (a0, a1) = self.split_mask16x16(a);
4427        let (b0, b1) = self.split_mask16x16(b);
4428        self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1))
4429    }
4430    #[inline(always)]
4431    fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
4432        let (a0, a1) = self.split_mask16x16(a);
4433        self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1)
4434    }
4435    #[inline(always)]
4436    fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
4437        let (a0, a1) = self.split_mask16x16(a);
4438        self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1)
4439    }
4440    #[inline(always)]
4441    fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
4442        let (a0, a1) = self.split_mask16x16(a);
4443        self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1)
4444    }
4445    #[inline(always)]
4446    fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
4447        let (a0, a1) = self.split_mask16x16(a);
4448        self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1)
4449    }
4450    #[inline(always)]
4451    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
4452        mask16x32 {
4453            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
4454            simd: self,
4455        }
4456    }
4457    #[inline(always)]
4458    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
4459        (
4460            mask16x8 {
4461                val: crate::support::Aligned128(a.val.0[0]),
4462                simd: self,
4463            },
4464            mask16x8 {
4465                val: crate::support::Aligned128(a.val.0[1]),
4466                simd: self,
4467            },
4468        )
4469    }
4470    #[inline(always)]
4471    fn splat_i32x8(self, val: i32) -> i32x8<Self> {
4472        let half = self.splat_i32x4(val);
4473        self.combine_i32x4(half, half)
4474    }
4475    #[inline(always)]
4476    fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
4477        i32x8 {
4478            val: unsafe { core::mem::transmute_copy(&val) },
4479            simd: self,
4480        }
4481    }
4482    #[inline(always)]
4483    fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
4484        i32x8 {
4485            val: unsafe { core::mem::transmute_copy(val) },
4486            simd: self,
4487        }
4488    }
4489    #[inline(always)]
4490    fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
4491        unsafe { core::mem::transmute::<[__m128i; 2usize], [i32; 8usize]>(a.val.0) }
4492    }
4493    #[inline(always)]
4494    fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
4495        unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i32; 8usize]>(&a.val.0) }
4496    }
4497    #[inline(always)]
4498    fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
4499        unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i32; 8usize]>(&mut a.val.0) }
4500    }
4501    #[inline(always)]
4502    fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
4503        unsafe {
4504            core::ptr::copy_nonoverlapping(
4505                (&raw const a.val.0) as *const i32,
4506                dest.as_mut_ptr(),
4507                8usize,
4508            );
4509        }
4510    }
4511    #[inline(always)]
4512    fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
4513        unsafe {
4514            i32x8 {
4515                val: core::mem::transmute(a.val),
4516                simd: self,
4517            }
4518        }
4519    }
4520    #[inline(always)]
4521    fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
4522        unsafe {
4523            u8x32 {
4524                val: core::mem::transmute(a.val),
4525                simd: self,
4526            }
4527        }
4528    }
4529    #[inline(always)]
4530    fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4531        unsafe {
4532            if SHIFT >= 8usize {
4533                return b;
4534            }
4535            let result = cross_block_alignr_128x2(
4536                self.cvt_to_bytes_i32x8(b).val.0,
4537                self.cvt_to_bytes_i32x8(a).val.0,
4538                SHIFT * 4usize,
4539            );
4540            self.cvt_from_bytes_i32x8(u8x32 {
4541                val: crate::support::Aligned256(result),
4542                simd: self,
4543            })
4544        }
4545    }
4546    #[inline(always)]
4547    fn slide_within_blocks_i32x8<const SHIFT: usize>(
4548        self,
4549        a: i32x8<Self>,
4550        b: i32x8<Self>,
4551    ) -> i32x8<Self> {
4552        let (a0, a1) = self.split_i32x8(a);
4553        let (b0, b1) = self.split_i32x8(b);
4554        self.combine_i32x4(
4555            self.slide_within_blocks_i32x4::<SHIFT>(a0, b0),
4556            self.slide_within_blocks_i32x4::<SHIFT>(a1, b1),
4557        )
4558    }
4559    #[inline(always)]
4560    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4561        let (a0, a1) = self.split_i32x8(a);
4562        let (b0, b1) = self.split_i32x8(b);
4563        self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1))
4564    }
4565    #[inline(always)]
4566    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4567        let (a0, a1) = self.split_i32x8(a);
4568        let (b0, b1) = self.split_i32x8(b);
4569        self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1))
4570    }
4571    #[inline(always)]
4572    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4573        let (a0, a1) = self.split_i32x8(a);
4574        let (b0, b1) = self.split_i32x8(b);
4575        self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1))
4576    }
4577    #[inline(always)]
4578    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4579        let (a0, a1) = self.split_i32x8(a);
4580        let (b0, b1) = self.split_i32x8(b);
4581        self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1))
4582    }
4583    #[inline(always)]
4584    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4585        let (a0, a1) = self.split_i32x8(a);
4586        let (b0, b1) = self.split_i32x8(b);
4587        self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1))
4588    }
4589    #[inline(always)]
4590    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4591        let (a0, a1) = self.split_i32x8(a);
4592        let (b0, b1) = self.split_i32x8(b);
4593        self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1))
4594    }
4595    #[inline(always)]
4596    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
4597        let (a0, a1) = self.split_i32x8(a);
4598        self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1))
4599    }
4600    #[inline(always)]
4601    fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
4602        let (a0, a1) = self.split_i32x8(a);
4603        self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift))
4604    }
4605    #[inline(always)]
4606    fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4607        let (a0, a1) = self.split_i32x8(a);
4608        let (b0, b1) = self.split_i32x8(b);
4609        self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1))
4610    }
4611    #[inline(always)]
4612    fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
4613        let (a0, a1) = self.split_i32x8(a);
4614        self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift))
4615    }
4616    #[inline(always)]
4617    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4618        let (a0, a1) = self.split_i32x8(a);
4619        let (b0, b1) = self.split_i32x8(b);
4620        self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1))
4621    }
4622    #[inline(always)]
4623    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4624        let (a0, a1) = self.split_i32x8(a);
4625        let (b0, b1) = self.split_i32x8(b);
4626        self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1))
4627    }
4628    #[inline(always)]
4629    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4630        let (a0, a1) = self.split_i32x8(a);
4631        let (b0, b1) = self.split_i32x8(b);
4632        self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1))
4633    }
4634    #[inline(always)]
4635    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4636        let (a0, a1) = self.split_i32x8(a);
4637        let (b0, b1) = self.split_i32x8(b);
4638        self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1))
4639    }
4640    #[inline(always)]
4641    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4642        let (a0, a1) = self.split_i32x8(a);
4643        let (b0, b1) = self.split_i32x8(b);
4644        self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1))
4645    }
4646    #[inline(always)]
4647    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4648        let (a0, a1) = self.split_i32x8(a);
4649        let (b0, b1) = self.split_i32x8(b);
4650        self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1))
4651    }
4652    #[inline(always)]
4653    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4654        let (a0, _) = self.split_i32x8(a);
4655        let (b0, _) = self.split_i32x8(b);
4656        self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0))
4657    }
4658    #[inline(always)]
4659    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4660        let (_, a1) = self.split_i32x8(a);
4661        let (_, b1) = self.split_i32x8(b);
4662        self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1))
4663    }
4664    #[inline(always)]
4665    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4666        let (a0, a1) = self.split_i32x8(a);
4667        let (b0, b1) = self.split_i32x8(b);
4668        self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1))
4669    }
4670    #[inline(always)]
4671    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4672        let (a0, a1) = self.split_i32x8(a);
4673        let (b0, b1) = self.split_i32x8(b);
4674        self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1))
4675    }
4676    #[inline(always)]
4677    fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
4678        let (a0, a1) = self.split_i32x8(a);
4679        let (b0, b1) = self.split_i32x8(b);
4680        let lo_lo = self.zip_low_i32x4(a0, b0);
4681        let lo_hi = self.zip_high_i32x4(a0, b0);
4682        let hi_lo = self.zip_low_i32x4(a1, b1);
4683        let hi_hi = self.zip_high_i32x4(a1, b1);
4684        (
4685            self.combine_i32x4(lo_lo, lo_hi),
4686            self.combine_i32x4(hi_lo, hi_hi),
4687        )
4688    }
4689    #[inline(always)]
4690    fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
4691        let (a0, a1) = self.split_i32x8(a);
4692        let (b0, b1) = self.split_i32x8(b);
4693        let lo_even = self.unzip_low_i32x4(a0, a1);
4694        let lo_odd = self.unzip_high_i32x4(a0, a1);
4695        let hi_even = self.unzip_low_i32x4(b0, b1);
4696        let hi_odd = self.unzip_high_i32x4(b0, b1);
4697        (
4698            self.combine_i32x4(lo_even, hi_even),
4699            self.combine_i32x4(lo_odd, hi_odd),
4700        )
4701    }
4702    #[inline(always)]
4703    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
4704        let (a0, a1) = self.split_mask32x8(a);
4705        let (b0, b1) = self.split_i32x8(b);
4706        let (c0, c1) = self.split_i32x8(c);
4707        self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1))
4708    }
4709    #[inline(always)]
4710    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4711        let (a0, a1) = self.split_i32x8(a);
4712        let (b0, b1) = self.split_i32x8(b);
4713        self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1))
4714    }
4715    #[inline(always)]
4716    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4717        let (a0, a1) = self.split_i32x8(a);
4718        let (b0, b1) = self.split_i32x8(b);
4719        self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1))
4720    }
4721    #[inline(always)]
4722    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
4723        i32x16 {
4724            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
4725            simd: self,
4726        }
4727    }
4728    #[inline(always)]
4729    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
4730        (
4731            i32x4 {
4732                val: crate::support::Aligned128(a.val.0[0]),
4733                simd: self,
4734            },
4735            i32x4 {
4736                val: crate::support::Aligned128(a.val.0[1]),
4737                simd: self,
4738            },
4739        )
4740    }
4741    #[inline(always)]
4742    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
4743        let (a0, a1) = self.split_i32x8(a);
4744        self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1))
4745    }
4746    #[inline(always)]
4747    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
4748        let (a0, a1) = self.split_i32x8(a);
4749        self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1))
4750    }
4751    #[inline(always)]
4752    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
4753        let (a0, a1) = self.split_i32x8(a);
4754        self.combine_u32x4(
4755            self.reinterpret_u32_i32x4(a0),
4756            self.reinterpret_u32_i32x4(a1),
4757        )
4758    }
4759    #[inline(always)]
4760    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
4761        let (a0, a1) = self.split_i32x8(a);
4762        self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1))
4763    }
4764    #[inline(always)]
4765    fn splat_u32x8(self, val: u32) -> u32x8<Self> {
4766        let half = self.splat_u32x4(val);
4767        self.combine_u32x4(half, half)
4768    }
4769    #[inline(always)]
4770    fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
4771        u32x8 {
4772            val: unsafe { core::mem::transmute_copy(&val) },
4773            simd: self,
4774        }
4775    }
4776    #[inline(always)]
4777    fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
4778        u32x8 {
4779            val: unsafe { core::mem::transmute_copy(val) },
4780            simd: self,
4781        }
4782    }
4783    #[inline(always)]
4784    fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
4785        unsafe { core::mem::transmute::<[__m128i; 2usize], [u32; 8usize]>(a.val.0) }
4786    }
4787    #[inline(always)]
4788    fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
4789        unsafe { core::mem::transmute::<&[__m128i; 2usize], &[u32; 8usize]>(&a.val.0) }
4790    }
4791    #[inline(always)]
4792    fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
4793        unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [u32; 8usize]>(&mut a.val.0) }
4794    }
4795    #[inline(always)]
4796    fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
4797        unsafe {
4798            core::ptr::copy_nonoverlapping(
4799                (&raw const a.val.0) as *const u32,
4800                dest.as_mut_ptr(),
4801                8usize,
4802            );
4803        }
4804    }
4805    #[inline(always)]
4806    fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
4807        unsafe {
4808            u32x8 {
4809                val: core::mem::transmute(a.val),
4810                simd: self,
4811            }
4812        }
4813    }
4814    #[inline(always)]
4815    fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
4816        unsafe {
4817            u8x32 {
4818                val: core::mem::transmute(a.val),
4819                simd: self,
4820            }
4821        }
4822    }
4823    #[inline(always)]
4824    fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4825        unsafe {
4826            if SHIFT >= 8usize {
4827                return b;
4828            }
4829            let result = cross_block_alignr_128x2(
4830                self.cvt_to_bytes_u32x8(b).val.0,
4831                self.cvt_to_bytes_u32x8(a).val.0,
4832                SHIFT * 4usize,
4833            );
4834            self.cvt_from_bytes_u32x8(u8x32 {
4835                val: crate::support::Aligned256(result),
4836                simd: self,
4837            })
4838        }
4839    }
4840    #[inline(always)]
4841    fn slide_within_blocks_u32x8<const SHIFT: usize>(
4842        self,
4843        a: u32x8<Self>,
4844        b: u32x8<Self>,
4845    ) -> u32x8<Self> {
4846        let (a0, a1) = self.split_u32x8(a);
4847        let (b0, b1) = self.split_u32x8(b);
4848        self.combine_u32x4(
4849            self.slide_within_blocks_u32x4::<SHIFT>(a0, b0),
4850            self.slide_within_blocks_u32x4::<SHIFT>(a1, b1),
4851        )
4852    }
4853    #[inline(always)]
4854    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4855        let (a0, a1) = self.split_u32x8(a);
4856        let (b0, b1) = self.split_u32x8(b);
4857        self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1))
4858    }
4859    #[inline(always)]
4860    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4861        let (a0, a1) = self.split_u32x8(a);
4862        let (b0, b1) = self.split_u32x8(b);
4863        self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1))
4864    }
4865    #[inline(always)]
4866    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4867        let (a0, a1) = self.split_u32x8(a);
4868        let (b0, b1) = self.split_u32x8(b);
4869        self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1))
4870    }
4871    #[inline(always)]
4872    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4873        let (a0, a1) = self.split_u32x8(a);
4874        let (b0, b1) = self.split_u32x8(b);
4875        self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1))
4876    }
4877    #[inline(always)]
4878    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4879        let (a0, a1) = self.split_u32x8(a);
4880        let (b0, b1) = self.split_u32x8(b);
4881        self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1))
4882    }
4883    #[inline(always)]
4884    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4885        let (a0, a1) = self.split_u32x8(a);
4886        let (b0, b1) = self.split_u32x8(b);
4887        self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1))
4888    }
4889    #[inline(always)]
4890    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
4891        let (a0, a1) = self.split_u32x8(a);
4892        self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1))
4893    }
4894    #[inline(always)]
4895    fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
4896        let (a0, a1) = self.split_u32x8(a);
4897        self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift))
4898    }
4899    #[inline(always)]
4900    fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4901        let (a0, a1) = self.split_u32x8(a);
4902        let (b0, b1) = self.split_u32x8(b);
4903        self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1))
4904    }
4905    #[inline(always)]
4906    fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
4907        let (a0, a1) = self.split_u32x8(a);
4908        self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift))
4909    }
4910    #[inline(always)]
4911    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4912        let (a0, a1) = self.split_u32x8(a);
4913        let (b0, b1) = self.split_u32x8(b);
4914        self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1))
4915    }
4916    #[inline(always)]
4917    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4918        let (a0, a1) = self.split_u32x8(a);
4919        let (b0, b1) = self.split_u32x8(b);
4920        self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1))
4921    }
4922    #[inline(always)]
4923    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4924        let (a0, a1) = self.split_u32x8(a);
4925        let (b0, b1) = self.split_u32x8(b);
4926        self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1))
4927    }
4928    #[inline(always)]
4929    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4930        let (a0, a1) = self.split_u32x8(a);
4931        let (b0, b1) = self.split_u32x8(b);
4932        self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1))
4933    }
4934    #[inline(always)]
4935    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4936        let (a0, a1) = self.split_u32x8(a);
4937        let (b0, b1) = self.split_u32x8(b);
4938        self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1))
4939    }
4940    #[inline(always)]
4941    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4942        let (a0, a1) = self.split_u32x8(a);
4943        let (b0, b1) = self.split_u32x8(b);
4944        self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1))
4945    }
4946    #[inline(always)]
4947    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4948        let (a0, _) = self.split_u32x8(a);
4949        let (b0, _) = self.split_u32x8(b);
4950        self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0))
4951    }
4952    #[inline(always)]
4953    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4954        let (_, a1) = self.split_u32x8(a);
4955        let (_, b1) = self.split_u32x8(b);
4956        self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1))
4957    }
4958    #[inline(always)]
4959    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4960        let (a0, a1) = self.split_u32x8(a);
4961        let (b0, b1) = self.split_u32x8(b);
4962        self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1))
4963    }
4964    #[inline(always)]
4965    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4966        let (a0, a1) = self.split_u32x8(a);
4967        let (b0, b1) = self.split_u32x8(b);
4968        self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1))
4969    }
4970    #[inline(always)]
4971    fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
4972        let (a0, a1) = self.split_u32x8(a);
4973        let (b0, b1) = self.split_u32x8(b);
4974        let lo_lo = self.zip_low_u32x4(a0, b0);
4975        let lo_hi = self.zip_high_u32x4(a0, b0);
4976        let hi_lo = self.zip_low_u32x4(a1, b1);
4977        let hi_hi = self.zip_high_u32x4(a1, b1);
4978        (
4979            self.combine_u32x4(lo_lo, lo_hi),
4980            self.combine_u32x4(hi_lo, hi_hi),
4981        )
4982    }
4983    #[inline(always)]
4984    fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
4985        let (a0, a1) = self.split_u32x8(a);
4986        let (b0, b1) = self.split_u32x8(b);
4987        let lo_even = self.unzip_low_u32x4(a0, a1);
4988        let lo_odd = self.unzip_high_u32x4(a0, a1);
4989        let hi_even = self.unzip_low_u32x4(b0, b1);
4990        let hi_odd = self.unzip_high_u32x4(b0, b1);
4991        (
4992            self.combine_u32x4(lo_even, hi_even),
4993            self.combine_u32x4(lo_odd, hi_odd),
4994        )
4995    }
4996    #[inline(always)]
4997    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
4998        let (a0, a1) = self.split_mask32x8(a);
4999        let (b0, b1) = self.split_u32x8(b);
5000        let (c0, c1) = self.split_u32x8(c);
5001        self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1))
5002    }
5003    #[inline(always)]
5004    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
5005        let (a0, a1) = self.split_u32x8(a);
5006        let (b0, b1) = self.split_u32x8(b);
5007        self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1))
5008    }
5009    #[inline(always)]
5010    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
5011        let (a0, a1) = self.split_u32x8(a);
5012        let (b0, b1) = self.split_u32x8(b);
5013        self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1))
5014    }
5015    #[inline(always)]
5016    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
5017        u32x16 {
5018            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
5019            simd: self,
5020        }
5021    }
5022    #[inline(always)]
5023    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
5024        (
5025            u32x4 {
5026                val: crate::support::Aligned128(a.val.0[0]),
5027                simd: self,
5028            },
5029            u32x4 {
5030                val: crate::support::Aligned128(a.val.0[1]),
5031                simd: self,
5032            },
5033        )
5034    }
5035    #[inline(always)]
5036    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
5037        let (a0, a1) = self.split_u32x8(a);
5038        self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1))
5039    }
5040    #[inline(always)]
5041    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
5042        let (a0, a1) = self.split_u32x8(a);
5043        self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1))
5044    }
5045    #[inline(always)]
5046    fn splat_mask32x8(self, val: i32) -> mask32x8<Self> {
5047        let half = self.splat_mask32x4(val);
5048        self.combine_mask32x4(half, half)
5049    }
5050    #[inline(always)]
5051    fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
5052        mask32x8 {
5053            val: unsafe { core::mem::transmute_copy(&val) },
5054            simd: self,
5055        }
5056    }
5057    #[inline(always)]
5058    fn load_array_ref_mask32x8(self, val: &[i32; 8usize]) -> mask32x8<Self> {
5059        mask32x8 {
5060            val: unsafe { core::mem::transmute_copy(val) },
5061            simd: self,
5062        }
5063    }
5064    #[inline(always)]
5065    fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
5066        unsafe { core::mem::transmute::<[__m128i; 2usize], [i32; 8usize]>(a.val.0) }
5067    }
5068    #[inline(always)]
5069    fn as_array_ref_mask32x8(self, a: &mask32x8<Self>) -> &[i32; 8usize] {
5070        unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i32; 8usize]>(&a.val.0) }
5071    }
5072    #[inline(always)]
5073    fn as_array_mut_mask32x8(self, a: &mut mask32x8<Self>) -> &mut [i32; 8usize] {
5074        unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i32; 8usize]>(&mut a.val.0) }
5075    }
5076    #[inline(always)]
5077    fn store_array_mask32x8(self, a: mask32x8<Self>, dest: &mut [i32; 8usize]) -> () {
5078        unsafe {
5079            core::ptr::copy_nonoverlapping(
5080                (&raw const a.val.0) as *const i32,
5081                dest.as_mut_ptr(),
5082                8usize,
5083            );
5084        }
5085    }
5086    #[inline(always)]
5087    fn cvt_from_bytes_mask32x8(self, a: u8x32<Self>) -> mask32x8<Self> {
5088        unsafe {
5089            mask32x8 {
5090                val: core::mem::transmute(a.val),
5091                simd: self,
5092            }
5093        }
5094    }
5095    #[inline(always)]
5096    fn cvt_to_bytes_mask32x8(self, a: mask32x8<Self>) -> u8x32<Self> {
5097        unsafe {
5098            u8x32 {
5099                val: core::mem::transmute(a.val),
5100                simd: self,
5101            }
5102        }
5103    }
5104    #[inline(always)]
5105    fn slide_mask32x8<const SHIFT: usize>(
5106        self,
5107        a: mask32x8<Self>,
5108        b: mask32x8<Self>,
5109    ) -> mask32x8<Self> {
5110        unsafe {
5111            if SHIFT >= 8usize {
5112                return b;
5113            }
5114            let result = cross_block_alignr_128x2(
5115                self.cvt_to_bytes_mask32x8(b).val.0,
5116                self.cvt_to_bytes_mask32x8(a).val.0,
5117                SHIFT * 4usize,
5118            );
5119            self.cvt_from_bytes_mask32x8(u8x32 {
5120                val: crate::support::Aligned256(result),
5121                simd: self,
5122            })
5123        }
5124    }
5125    #[inline(always)]
5126    fn slide_within_blocks_mask32x8<const SHIFT: usize>(
5127        self,
5128        a: mask32x8<Self>,
5129        b: mask32x8<Self>,
5130    ) -> mask32x8<Self> {
5131        let (a0, a1) = self.split_mask32x8(a);
5132        let (b0, b1) = self.split_mask32x8(b);
5133        self.combine_mask32x4(
5134            self.slide_within_blocks_mask32x4::<SHIFT>(a0, b0),
5135            self.slide_within_blocks_mask32x4::<SHIFT>(a1, b1),
5136        )
5137    }
5138    #[inline(always)]
5139    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5140        let (a0, a1) = self.split_mask32x8(a);
5141        let (b0, b1) = self.split_mask32x8(b);
5142        self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1))
5143    }
5144    #[inline(always)]
5145    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5146        let (a0, a1) = self.split_mask32x8(a);
5147        let (b0, b1) = self.split_mask32x8(b);
5148        self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1))
5149    }
5150    #[inline(always)]
5151    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5152        let (a0, a1) = self.split_mask32x8(a);
5153        let (b0, b1) = self.split_mask32x8(b);
5154        self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1))
5155    }
5156    #[inline(always)]
5157    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
5158        let (a0, a1) = self.split_mask32x8(a);
5159        self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1))
5160    }
5161    #[inline(always)]
5162    fn select_mask32x8(
5163        self,
5164        a: mask32x8<Self>,
5165        b: mask32x8<Self>,
5166        c: mask32x8<Self>,
5167    ) -> mask32x8<Self> {
5168        let (a0, a1) = self.split_mask32x8(a);
5169        let (b0, b1) = self.split_mask32x8(b);
5170        let (c0, c1) = self.split_mask32x8(c);
5171        self.combine_mask32x4(
5172            self.select_mask32x4(a0, b0, c0),
5173            self.select_mask32x4(a1, b1, c1),
5174        )
5175    }
5176    #[inline(always)]
5177    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5178        let (a0, a1) = self.split_mask32x8(a);
5179        let (b0, b1) = self.split_mask32x8(b);
5180        self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1))
5181    }
5182    #[inline(always)]
5183    fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
5184        let (a0, a1) = self.split_mask32x8(a);
5185        self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1)
5186    }
5187    #[inline(always)]
5188    fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
5189        let (a0, a1) = self.split_mask32x8(a);
5190        self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1)
5191    }
5192    #[inline(always)]
5193    fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
5194        let (a0, a1) = self.split_mask32x8(a);
5195        self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1)
5196    }
5197    #[inline(always)]
5198    fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
5199        let (a0, a1) = self.split_mask32x8(a);
5200        self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1)
5201    }
5202    #[inline(always)]
5203    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
5204        mask32x16 {
5205            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
5206            simd: self,
5207        }
5208    }
5209    #[inline(always)]
5210    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
5211        (
5212            mask32x4 {
5213                val: crate::support::Aligned128(a.val.0[0]),
5214                simd: self,
5215            },
5216            mask32x4 {
5217                val: crate::support::Aligned128(a.val.0[1]),
5218                simd: self,
5219            },
5220        )
5221    }
5222    #[inline(always)]
5223    fn splat_f64x4(self, val: f64) -> f64x4<Self> {
5224        let half = self.splat_f64x2(val);
5225        self.combine_f64x2(half, half)
5226    }
5227    #[inline(always)]
5228    fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
5229        f64x4 {
5230            val: unsafe { core::mem::transmute_copy(&val) },
5231            simd: self,
5232        }
5233    }
5234    #[inline(always)]
5235    fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
5236        f64x4 {
5237            val: unsafe { core::mem::transmute_copy(val) },
5238            simd: self,
5239        }
5240    }
5241    #[inline(always)]
5242    fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
5243        unsafe { core::mem::transmute::<[__m128d; 2usize], [f64; 4usize]>(a.val.0) }
5244    }
5245    #[inline(always)]
5246    fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
5247        unsafe { core::mem::transmute::<&[__m128d; 2usize], &[f64; 4usize]>(&a.val.0) }
5248    }
5249    #[inline(always)]
5250    fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
5251        unsafe { core::mem::transmute::<&mut [__m128d; 2usize], &mut [f64; 4usize]>(&mut a.val.0) }
5252    }
5253    #[inline(always)]
5254    fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
5255        unsafe {
5256            core::ptr::copy_nonoverlapping(
5257                (&raw const a.val.0) as *const f64,
5258                dest.as_mut_ptr(),
5259                4usize,
5260            );
5261        }
5262    }
5263    #[inline(always)]
5264    fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
5265        unsafe {
5266            f64x4 {
5267                val: core::mem::transmute(a.val),
5268                simd: self,
5269            }
5270        }
5271    }
5272    #[inline(always)]
5273    fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
5274        unsafe {
5275            u8x32 {
5276                val: core::mem::transmute(a.val),
5277                simd: self,
5278            }
5279        }
5280    }
5281    #[inline(always)]
5282    fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5283        unsafe {
5284            if SHIFT >= 4usize {
5285                return b;
5286            }
5287            let result = cross_block_alignr_128x2(
5288                self.cvt_to_bytes_f64x4(b).val.0,
5289                self.cvt_to_bytes_f64x4(a).val.0,
5290                SHIFT * 8usize,
5291            );
5292            self.cvt_from_bytes_f64x4(u8x32 {
5293                val: crate::support::Aligned256(result),
5294                simd: self,
5295            })
5296        }
5297    }
5298    #[inline(always)]
5299    fn slide_within_blocks_f64x4<const SHIFT: usize>(
5300        self,
5301        a: f64x4<Self>,
5302        b: f64x4<Self>,
5303    ) -> f64x4<Self> {
5304        let (a0, a1) = self.split_f64x4(a);
5305        let (b0, b1) = self.split_f64x4(b);
5306        self.combine_f64x2(
5307            self.slide_within_blocks_f64x2::<SHIFT>(a0, b0),
5308            self.slide_within_blocks_f64x2::<SHIFT>(a1, b1),
5309        )
5310    }
5311    #[inline(always)]
5312    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5313        let (a0, a1) = self.split_f64x4(a);
5314        self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1))
5315    }
5316    #[inline(always)]
5317    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5318        let (a0, a1) = self.split_f64x4(a);
5319        self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1))
5320    }
5321    #[inline(always)]
5322    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5323        let (a0, a1) = self.split_f64x4(a);
5324        self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
5325    }
5326    #[inline(always)]
5327    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5328        let (a0, a1) = self.split_f64x4(a);
5329        let (b0, b1) = self.split_f64x4(b);
5330        self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1))
5331    }
5332    #[inline(always)]
5333    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5334        let (a0, a1) = self.split_f64x4(a);
5335        let (b0, b1) = self.split_f64x4(b);
5336        self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1))
5337    }
5338    #[inline(always)]
5339    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5340        let (a0, a1) = self.split_f64x4(a);
5341        let (b0, b1) = self.split_f64x4(b);
5342        self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1))
5343    }
5344    #[inline(always)]
5345    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5346        let (a0, a1) = self.split_f64x4(a);
5347        let (b0, b1) = self.split_f64x4(b);
5348        self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1))
5349    }
5350    #[inline(always)]
5351    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5352        let (a0, a1) = self.split_f64x4(a);
5353        let (b0, b1) = self.split_f64x4(b);
5354        self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1))
5355    }
5356    #[inline(always)]
5357    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5358        let (a0, a1) = self.split_f64x4(a);
5359        let (b0, b1) = self.split_f64x4(b);
5360        self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1))
5361    }
5362    #[inline(always)]
5363    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5364        let (a0, a1) = self.split_f64x4(a);
5365        let (b0, b1) = self.split_f64x4(b);
5366        self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1))
5367    }
5368    #[inline(always)]
5369    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5370        let (a0, a1) = self.split_f64x4(a);
5371        let (b0, b1) = self.split_f64x4(b);
5372        self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1))
5373    }
5374    #[inline(always)]
5375    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5376        let (a0, a1) = self.split_f64x4(a);
5377        let (b0, b1) = self.split_f64x4(b);
5378        self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1))
5379    }
5380    #[inline(always)]
5381    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5382        let (a0, a1) = self.split_f64x4(a);
5383        let (b0, b1) = self.split_f64x4(b);
5384        self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1))
5385    }
5386    #[inline(always)]
5387    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5388        let (a0, _) = self.split_f64x4(a);
5389        let (b0, _) = self.split_f64x4(b);
5390        self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0))
5391    }
5392    #[inline(always)]
5393    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5394        let (_, a1) = self.split_f64x4(a);
5395        let (_, b1) = self.split_f64x4(b);
5396        self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1))
5397    }
5398    #[inline(always)]
5399    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5400        let (a0, a1) = self.split_f64x4(a);
5401        let (b0, b1) = self.split_f64x4(b);
5402        self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1))
5403    }
5404    #[inline(always)]
5405    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5406        let (a0, a1) = self.split_f64x4(a);
5407        let (b0, b1) = self.split_f64x4(b);
5408        self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1))
5409    }
5410    #[inline(always)]
5411    fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
5412        let (a0, a1) = self.split_f64x4(a);
5413        let (b0, b1) = self.split_f64x4(b);
5414        let lo_lo = self.zip_low_f64x2(a0, b0);
5415        let lo_hi = self.zip_high_f64x2(a0, b0);
5416        let hi_lo = self.zip_low_f64x2(a1, b1);
5417        let hi_hi = self.zip_high_f64x2(a1, b1);
5418        (
5419            self.combine_f64x2(lo_lo, lo_hi),
5420            self.combine_f64x2(hi_lo, hi_hi),
5421        )
5422    }
5423    #[inline(always)]
5424    fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
5425        let (a0, a1) = self.split_f64x4(a);
5426        let (b0, b1) = self.split_f64x4(b);
5427        let lo_even = self.unzip_low_f64x2(a0, a1);
5428        let lo_odd = self.unzip_high_f64x2(a0, a1);
5429        let hi_even = self.unzip_low_f64x2(b0, b1);
5430        let hi_odd = self.unzip_high_f64x2(b0, b1);
5431        (
5432            self.combine_f64x2(lo_even, hi_even),
5433            self.combine_f64x2(lo_odd, hi_odd),
5434        )
5435    }
5436    #[inline(always)]
5437    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5438        let (a0, a1) = self.split_f64x4(a);
5439        let (b0, b1) = self.split_f64x4(b);
5440        self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1))
5441    }
5442    #[inline(always)]
5443    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5444        let (a0, a1) = self.split_f64x4(a);
5445        let (b0, b1) = self.split_f64x4(b);
5446        self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1))
5447    }
5448    #[inline(always)]
5449    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5450        let (a0, a1) = self.split_f64x4(a);
5451        let (b0, b1) = self.split_f64x4(b);
5452        self.combine_f64x2(
5453            self.max_precise_f64x2(a0, b0),
5454            self.max_precise_f64x2(a1, b1),
5455        )
5456    }
5457    #[inline(always)]
5458    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5459        let (a0, a1) = self.split_f64x4(a);
5460        let (b0, b1) = self.split_f64x4(b);
5461        self.combine_f64x2(
5462            self.min_precise_f64x2(a0, b0),
5463            self.min_precise_f64x2(a1, b1),
5464        )
5465    }
5466    #[inline(always)]
5467    fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
5468        let (a0, a1) = self.split_f64x4(a);
5469        let (b0, b1) = self.split_f64x4(b);
5470        let (c0, c1) = self.split_f64x4(c);
5471        self.combine_f64x2(
5472            self.mul_add_f64x2(a0, b0, c0),
5473            self.mul_add_f64x2(a1, b1, c1),
5474        )
5475    }
5476    #[inline(always)]
5477    fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
5478        let (a0, a1) = self.split_f64x4(a);
5479        let (b0, b1) = self.split_f64x4(b);
5480        let (c0, c1) = self.split_f64x4(c);
5481        self.combine_f64x2(
5482            self.mul_sub_f64x2(a0, b0, c0),
5483            self.mul_sub_f64x2(a1, b1, c1),
5484        )
5485    }
5486    #[inline(always)]
5487    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5488        let (a0, a1) = self.split_f64x4(a);
5489        self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
5490    }
5491    #[inline(always)]
5492    fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5493        let (a0, a1) = self.split_f64x4(a);
5494        self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1))
5495    }
5496    #[inline(always)]
5497    fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5498        let (a0, a1) = self.split_f64x4(a);
5499        self.combine_f64x2(
5500            self.round_ties_even_f64x2(a0),
5501            self.round_ties_even_f64x2(a1),
5502        )
5503    }
5504    #[inline(always)]
5505    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5506        let (a0, a1) = self.split_f64x4(a);
5507        self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1))
5508    }
5509    #[inline(always)]
5510    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5511        let (a0, a1) = self.split_f64x4(a);
5512        self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1))
5513    }
5514    #[inline(always)]
5515    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
5516        let (a0, a1) = self.split_mask64x4(a);
5517        let (b0, b1) = self.split_f64x4(b);
5518        let (c0, c1) = self.split_f64x4(c);
5519        self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1))
5520    }
5521    #[inline(always)]
5522    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
5523        f64x8 {
5524            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
5525            simd: self,
5526        }
5527    }
5528    #[inline(always)]
5529    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
5530        (
5531            f64x2 {
5532                val: crate::support::Aligned128(a.val.0[0]),
5533                simd: self,
5534            },
5535            f64x2 {
5536                val: crate::support::Aligned128(a.val.0[1]),
5537                simd: self,
5538            },
5539        )
5540    }
5541    #[inline(always)]
5542    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
5543        let (a0, a1) = self.split_f64x4(a);
5544        self.combine_f32x4(
5545            self.reinterpret_f32_f64x2(a0),
5546            self.reinterpret_f32_f64x2(a1),
5547        )
5548    }
5549    #[inline(always)]
5550    fn splat_mask64x4(self, val: i64) -> mask64x4<Self> {
5551        let half = self.splat_mask64x2(val);
5552        self.combine_mask64x2(half, half)
5553    }
5554    #[inline(always)]
5555    fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
5556        mask64x4 {
5557            val: unsafe { core::mem::transmute_copy(&val) },
5558            simd: self,
5559        }
5560    }
5561    #[inline(always)]
5562    fn load_array_ref_mask64x4(self, val: &[i64; 4usize]) -> mask64x4<Self> {
5563        mask64x4 {
5564            val: unsafe { core::mem::transmute_copy(val) },
5565            simd: self,
5566        }
5567    }
5568    #[inline(always)]
5569    fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
5570        unsafe { core::mem::transmute::<[__m128i; 2usize], [i64; 4usize]>(a.val.0) }
5571    }
5572    #[inline(always)]
5573    fn as_array_ref_mask64x4(self, a: &mask64x4<Self>) -> &[i64; 4usize] {
5574        unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i64; 4usize]>(&a.val.0) }
5575    }
5576    #[inline(always)]
5577    fn as_array_mut_mask64x4(self, a: &mut mask64x4<Self>) -> &mut [i64; 4usize] {
5578        unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i64; 4usize]>(&mut a.val.0) }
5579    }
5580    #[inline(always)]
5581    fn store_array_mask64x4(self, a: mask64x4<Self>, dest: &mut [i64; 4usize]) -> () {
5582        unsafe {
5583            core::ptr::copy_nonoverlapping(
5584                (&raw const a.val.0) as *const i64,
5585                dest.as_mut_ptr(),
5586                4usize,
5587            );
5588        }
5589    }
5590    #[inline(always)]
5591    fn cvt_from_bytes_mask64x4(self, a: u8x32<Self>) -> mask64x4<Self> {
5592        unsafe {
5593            mask64x4 {
5594                val: core::mem::transmute(a.val),
5595                simd: self,
5596            }
5597        }
5598    }
5599    #[inline(always)]
5600    fn cvt_to_bytes_mask64x4(self, a: mask64x4<Self>) -> u8x32<Self> {
5601        unsafe {
5602            u8x32 {
5603                val: core::mem::transmute(a.val),
5604                simd: self,
5605            }
5606        }
5607    }
5608    #[inline(always)]
5609    fn slide_mask64x4<const SHIFT: usize>(
5610        self,
5611        a: mask64x4<Self>,
5612        b: mask64x4<Self>,
5613    ) -> mask64x4<Self> {
5614        unsafe {
5615            if SHIFT >= 4usize {
5616                return b;
5617            }
5618            let result = cross_block_alignr_128x2(
5619                self.cvt_to_bytes_mask64x4(b).val.0,
5620                self.cvt_to_bytes_mask64x4(a).val.0,
5621                SHIFT * 8usize,
5622            );
5623            self.cvt_from_bytes_mask64x4(u8x32 {
5624                val: crate::support::Aligned256(result),
5625                simd: self,
5626            })
5627        }
5628    }
5629    #[inline(always)]
5630    fn slide_within_blocks_mask64x4<const SHIFT: usize>(
5631        self,
5632        a: mask64x4<Self>,
5633        b: mask64x4<Self>,
5634    ) -> mask64x4<Self> {
5635        let (a0, a1) = self.split_mask64x4(a);
5636        let (b0, b1) = self.split_mask64x4(b);
5637        self.combine_mask64x2(
5638            self.slide_within_blocks_mask64x2::<SHIFT>(a0, b0),
5639            self.slide_within_blocks_mask64x2::<SHIFT>(a1, b1),
5640        )
5641    }
5642    #[inline(always)]
5643    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5644        let (a0, a1) = self.split_mask64x4(a);
5645        let (b0, b1) = self.split_mask64x4(b);
5646        self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1))
5647    }
5648    #[inline(always)]
5649    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5650        let (a0, a1) = self.split_mask64x4(a);
5651        let (b0, b1) = self.split_mask64x4(b);
5652        self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1))
5653    }
5654    #[inline(always)]
5655    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5656        let (a0, a1) = self.split_mask64x4(a);
5657        let (b0, b1) = self.split_mask64x4(b);
5658        self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1))
5659    }
5660    #[inline(always)]
5661    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
5662        let (a0, a1) = self.split_mask64x4(a);
5663        self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1))
5664    }
5665    #[inline(always)]
5666    fn select_mask64x4(
5667        self,
5668        a: mask64x4<Self>,
5669        b: mask64x4<Self>,
5670        c: mask64x4<Self>,
5671    ) -> mask64x4<Self> {
5672        let (a0, a1) = self.split_mask64x4(a);
5673        let (b0, b1) = self.split_mask64x4(b);
5674        let (c0, c1) = self.split_mask64x4(c);
5675        self.combine_mask64x2(
5676            self.select_mask64x2(a0, b0, c0),
5677            self.select_mask64x2(a1, b1, c1),
5678        )
5679    }
5680    #[inline(always)]
5681    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5682        let (a0, a1) = self.split_mask64x4(a);
5683        let (b0, b1) = self.split_mask64x4(b);
5684        self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1))
5685    }
5686    #[inline(always)]
5687    fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
5688        let (a0, a1) = self.split_mask64x4(a);
5689        self.any_true_mask64x2(a0) || self.any_true_mask64x2(a1)
5690    }
5691    #[inline(always)]
5692    fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
5693        let (a0, a1) = self.split_mask64x4(a);
5694        self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1)
5695    }
5696    #[inline(always)]
5697    fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
5698        let (a0, a1) = self.split_mask64x4(a);
5699        self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1)
5700    }
5701    #[inline(always)]
5702    fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
5703        let (a0, a1) = self.split_mask64x4(a);
5704        self.all_false_mask64x2(a0) && self.all_false_mask64x2(a1)
5705    }
5706    #[inline(always)]
5707    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
5708        mask64x8 {
5709            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
5710            simd: self,
5711        }
5712    }
5713    #[inline(always)]
5714    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
5715        (
5716            mask64x2 {
5717                val: crate::support::Aligned128(a.val.0[0]),
5718                simd: self,
5719            },
5720            mask64x2 {
5721                val: crate::support::Aligned128(a.val.0[1]),
5722                simd: self,
5723            },
5724        )
5725    }
5726    #[inline(always)]
5727    fn splat_f32x16(self, val: f32) -> f32x16<Self> {
5728        let half = self.splat_f32x8(val);
5729        self.combine_f32x8(half, half)
5730    }
5731    #[inline(always)]
5732    fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
5733        f32x16 {
5734            val: unsafe { core::mem::transmute_copy(&val) },
5735            simd: self,
5736        }
5737    }
5738    #[inline(always)]
5739    fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
5740        f32x16 {
5741            val: unsafe { core::mem::transmute_copy(val) },
5742            simd: self,
5743        }
5744    }
5745    #[inline(always)]
5746    fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
5747        unsafe { core::mem::transmute::<[__m128; 4usize], [f32; 16usize]>(a.val.0) }
5748    }
5749    #[inline(always)]
5750    fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
5751        unsafe { core::mem::transmute::<&[__m128; 4usize], &[f32; 16usize]>(&a.val.0) }
5752    }
5753    #[inline(always)]
5754    fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
5755        unsafe { core::mem::transmute::<&mut [__m128; 4usize], &mut [f32; 16usize]>(&mut a.val.0) }
5756    }
5757    #[inline(always)]
5758    fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
5759        unsafe {
5760            core::ptr::copy_nonoverlapping(
5761                (&raw const a.val.0) as *const f32,
5762                dest.as_mut_ptr(),
5763                16usize,
5764            );
5765        }
5766    }
5767    #[inline(always)]
5768    fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
5769        unsafe {
5770            f32x16 {
5771                val: core::mem::transmute(a.val),
5772                simd: self,
5773            }
5774        }
5775    }
5776    #[inline(always)]
5777    fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
5778        unsafe {
5779            u8x64 {
5780                val: core::mem::transmute(a.val),
5781                simd: self,
5782            }
5783        }
5784    }
5785    #[inline(always)]
5786    fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5787        unsafe {
5788            if SHIFT >= 16usize {
5789                return b;
5790            }
5791            let result = cross_block_alignr_128x4(
5792                self.cvt_to_bytes_f32x16(b).val.0,
5793                self.cvt_to_bytes_f32x16(a).val.0,
5794                SHIFT * 4usize,
5795            );
5796            self.cvt_from_bytes_f32x16(u8x64 {
5797                val: crate::support::Aligned512(result),
5798                simd: self,
5799            })
5800        }
5801    }
5802    #[inline(always)]
5803    fn slide_within_blocks_f32x16<const SHIFT: usize>(
5804        self,
5805        a: f32x16<Self>,
5806        b: f32x16<Self>,
5807    ) -> f32x16<Self> {
5808        let (a0, a1) = self.split_f32x16(a);
5809        let (b0, b1) = self.split_f32x16(b);
5810        self.combine_f32x8(
5811            self.slide_within_blocks_f32x8::<SHIFT>(a0, b0),
5812            self.slide_within_blocks_f32x8::<SHIFT>(a1, b1),
5813        )
5814    }
5815    #[inline(always)]
5816    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5817        let (a0, a1) = self.split_f32x16(a);
5818        self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
5819    }
5820    #[inline(always)]
5821    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5822        let (a0, a1) = self.split_f32x16(a);
5823        self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
5824    }
5825    #[inline(always)]
5826    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5827        let (a0, a1) = self.split_f32x16(a);
5828        self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
5829    }
5830    #[inline(always)]
5831    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5832        let (a0, a1) = self.split_f32x16(a);
5833        let (b0, b1) = self.split_f32x16(b);
5834        self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
5835    }
5836    #[inline(always)]
5837    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5838        let (a0, a1) = self.split_f32x16(a);
5839        let (b0, b1) = self.split_f32x16(b);
5840        self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
5841    }
5842    #[inline(always)]
5843    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5844        let (a0, a1) = self.split_f32x16(a);
5845        let (b0, b1) = self.split_f32x16(b);
5846        self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
5847    }
5848    #[inline(always)]
5849    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5850        let (a0, a1) = self.split_f32x16(a);
5851        let (b0, b1) = self.split_f32x16(b);
5852        self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
5853    }
5854    #[inline(always)]
5855    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5856        let (a0, a1) = self.split_f32x16(a);
5857        let (b0, b1) = self.split_f32x16(b);
5858        self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
5859    }
5860    #[inline(always)]
5861    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5862        let (a0, a1) = self.split_f32x16(a);
5863        let (b0, b1) = self.split_f32x16(b);
5864        self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
5865    }
5866    #[inline(always)]
5867    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5868        let (a0, a1) = self.split_f32x16(a);
5869        let (b0, b1) = self.split_f32x16(b);
5870        self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
5871    }
5872    #[inline(always)]
5873    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5874        let (a0, a1) = self.split_f32x16(a);
5875        let (b0, b1) = self.split_f32x16(b);
5876        self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
5877    }
5878    #[inline(always)]
5879    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5880        let (a0, a1) = self.split_f32x16(a);
5881        let (b0, b1) = self.split_f32x16(b);
5882        self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
5883    }
5884    #[inline(always)]
5885    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5886        let (a0, a1) = self.split_f32x16(a);
5887        let (b0, b1) = self.split_f32x16(b);
5888        self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
5889    }
5890    #[inline(always)]
5891    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5892        let (a0, _) = self.split_f32x16(a);
5893        let (b0, _) = self.split_f32x16(b);
5894        self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
5895    }
5896    #[inline(always)]
5897    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5898        let (_, a1) = self.split_f32x16(a);
5899        let (_, b1) = self.split_f32x16(b);
5900        self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
5901    }
5902    #[inline(always)]
5903    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5904        let (a0, a1) = self.split_f32x16(a);
5905        let (b0, b1) = self.split_f32x16(b);
5906        self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
5907    }
5908    #[inline(always)]
5909    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5910        let (a0, a1) = self.split_f32x16(a);
5911        let (b0, b1) = self.split_f32x16(b);
5912        self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
5913    }
5914    #[inline(always)]
5915    fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
5916        let (a0, a1) = self.split_f32x16(a);
5917        let (b0, b1) = self.split_f32x16(b);
5918        let lo_lo = self.zip_low_f32x8(a0, b0);
5919        let lo_hi = self.zip_high_f32x8(a0, b0);
5920        let hi_lo = self.zip_low_f32x8(a1, b1);
5921        let hi_hi = self.zip_high_f32x8(a1, b1);
5922        (
5923            self.combine_f32x8(lo_lo, lo_hi),
5924            self.combine_f32x8(hi_lo, hi_hi),
5925        )
5926    }
5927    #[inline(always)]
5928    fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
5929        let (a0, a1) = self.split_f32x16(a);
5930        let (b0, b1) = self.split_f32x16(b);
5931        let lo_even = self.unzip_low_f32x8(a0, a1);
5932        let lo_odd = self.unzip_high_f32x8(a0, a1);
5933        let hi_even = self.unzip_low_f32x8(b0, b1);
5934        let hi_odd = self.unzip_high_f32x8(b0, b1);
5935        (
5936            self.combine_f32x8(lo_even, hi_even),
5937            self.combine_f32x8(lo_odd, hi_odd),
5938        )
5939    }
5940    #[inline(always)]
5941    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5942        let (a0, a1) = self.split_f32x16(a);
5943        let (b0, b1) = self.split_f32x16(b);
5944        self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
5945    }
5946    #[inline(always)]
5947    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5948        let (a0, a1) = self.split_f32x16(a);
5949        let (b0, b1) = self.split_f32x16(b);
5950        self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
5951    }
5952    #[inline(always)]
5953    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5954        let (a0, a1) = self.split_f32x16(a);
5955        let (b0, b1) = self.split_f32x16(b);
5956        self.combine_f32x8(
5957            self.max_precise_f32x8(a0, b0),
5958            self.max_precise_f32x8(a1, b1),
5959        )
5960    }
5961    #[inline(always)]
5962    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5963        let (a0, a1) = self.split_f32x16(a);
5964        let (b0, b1) = self.split_f32x16(b);
5965        self.combine_f32x8(
5966            self.min_precise_f32x8(a0, b0),
5967            self.min_precise_f32x8(a1, b1),
5968        )
5969    }
5970    #[inline(always)]
5971    fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
5972        let (a0, a1) = self.split_f32x16(a);
5973        let (b0, b1) = self.split_f32x16(b);
5974        let (c0, c1) = self.split_f32x16(c);
5975        self.combine_f32x8(
5976            self.mul_add_f32x8(a0, b0, c0),
5977            self.mul_add_f32x8(a1, b1, c1),
5978        )
5979    }
5980    #[inline(always)]
5981    fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
5982        let (a0, a1) = self.split_f32x16(a);
5983        let (b0, b1) = self.split_f32x16(b);
5984        let (c0, c1) = self.split_f32x16(c);
5985        self.combine_f32x8(
5986            self.mul_sub_f32x8(a0, b0, c0),
5987            self.mul_sub_f32x8(a1, b1, c1),
5988        )
5989    }
5990    #[inline(always)]
5991    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5992        let (a0, a1) = self.split_f32x16(a);
5993        self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
5994    }
5995    #[inline(always)]
5996    fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5997        let (a0, a1) = self.split_f32x16(a);
5998        self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1))
5999    }
6000    #[inline(always)]
6001    fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
6002        let (a0, a1) = self.split_f32x16(a);
6003        self.combine_f32x8(
6004            self.round_ties_even_f32x8(a0),
6005            self.round_ties_even_f32x8(a1),
6006        )
6007    }
6008    #[inline(always)]
6009    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
6010        let (a0, a1) = self.split_f32x16(a);
6011        self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
6012    }
6013    #[inline(always)]
6014    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
6015        let (a0, a1) = self.split_f32x16(a);
6016        self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
6017    }
6018    #[inline(always)]
6019    fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
6020        let (a0, a1) = self.split_mask32x16(a);
6021        let (b0, b1) = self.split_f32x16(b);
6022        let (c0, c1) = self.split_f32x16(c);
6023        self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
6024    }
6025    #[inline(always)]
6026    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
6027        (
6028            f32x8 {
6029                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
6030                simd: self,
6031            },
6032            f32x8 {
6033                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
6034                simd: self,
6035            },
6036        )
6037    }
6038    #[inline(always)]
6039    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
6040        let (a0, a1) = self.split_f32x16(a);
6041        self.combine_f64x4(
6042            self.reinterpret_f64_f32x8(a0),
6043            self.reinterpret_f64_f32x8(a1),
6044        )
6045    }
6046    #[inline(always)]
6047    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
6048        let (a0, a1) = self.split_f32x16(a);
6049        self.combine_i32x8(
6050            self.reinterpret_i32_f32x8(a0),
6051            self.reinterpret_i32_f32x8(a1),
6052        )
6053    }
6054    #[inline(always)]
6055    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
6056        unsafe {
6057            let v0 = _mm_loadu_ps(src.as_ptr() as *const _);
6058            let v1 = _mm_loadu_ps(src.as_ptr().add(4usize) as *const _);
6059            let v2 = _mm_loadu_ps(src.as_ptr().add(2 * 4usize) as *const _);
6060            let v3 = _mm_loadu_ps(src.as_ptr().add(3 * 4usize) as *const _);
6061            let tmp0 = _mm_unpacklo_ps(v0, v1);
6062            let tmp1 = _mm_unpackhi_ps(v0, v1);
6063            let tmp2 = _mm_unpacklo_ps(v2, v3);
6064            let tmp3 = _mm_unpackhi_ps(v2, v3);
6065            let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
6066            let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
6067            let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
6068            let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
6069            self.combine_f32x8(
6070                self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)),
6071                self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)),
6072            )
6073        }
6074    }
6075    #[inline(always)]
6076    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
6077        let (v01, v23) = self.split_f32x16(a);
6078        let (v0, v1) = self.split_f32x8(v01);
6079        let (v2, v3) = self.split_f32x8(v23);
6080        let v0 = v0.into();
6081        let v1 = v1.into();
6082        let v2 = v2.into();
6083        let v3 = v3.into();
6084        unsafe {
6085            let tmp0 = _mm_unpacklo_ps(v0, v1);
6086            let tmp1 = _mm_unpackhi_ps(v0, v1);
6087            let tmp2 = _mm_unpacklo_ps(v2, v3);
6088            let tmp3 = _mm_unpackhi_ps(v2, v3);
6089            let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
6090            let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
6091            let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
6092            let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
6093            _mm_storeu_ps(dest.as_mut_ptr() as *mut _, out0);
6094            _mm_storeu_ps(dest.as_mut_ptr().add(4usize) as *mut _, out1);
6095            _mm_storeu_ps(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
6096            _mm_storeu_ps(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
6097        }
6098    }
6099    #[inline(always)]
6100    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
6101        let (a0, a1) = self.split_f32x16(a);
6102        self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
6103    }
6104    #[inline(always)]
6105    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
6106        let (a0, a1) = self.split_f32x16(a);
6107        self.combine_u32x8(
6108            self.reinterpret_u32_f32x8(a0),
6109            self.reinterpret_u32_f32x8(a1),
6110        )
6111    }
6112    #[inline(always)]
6113    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
6114        let (a0, a1) = self.split_f32x16(a);
6115        self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
6116    }
6117    #[inline(always)]
6118    fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
6119        let (a0, a1) = self.split_f32x16(a);
6120        self.combine_u32x8(
6121            self.cvt_u32_precise_f32x8(a0),
6122            self.cvt_u32_precise_f32x8(a1),
6123        )
6124    }
6125    #[inline(always)]
6126    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
6127        let (a0, a1) = self.split_f32x16(a);
6128        self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
6129    }
6130    #[inline(always)]
6131    fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
6132        let (a0, a1) = self.split_f32x16(a);
6133        self.combine_i32x8(
6134            self.cvt_i32_precise_f32x8(a0),
6135            self.cvt_i32_precise_f32x8(a1),
6136        )
6137    }
6138    #[inline(always)]
6139    fn splat_i8x64(self, val: i8) -> i8x64<Self> {
6140        let half = self.splat_i8x32(val);
6141        self.combine_i8x32(half, half)
6142    }
6143    #[inline(always)]
6144    fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
6145        i8x64 {
6146            val: unsafe { core::mem::transmute_copy(&val) },
6147            simd: self,
6148        }
6149    }
6150    #[inline(always)]
6151    fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
6152        i8x64 {
6153            val: unsafe { core::mem::transmute_copy(val) },
6154            simd: self,
6155        }
6156    }
6157    #[inline(always)]
6158    fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
6159        unsafe { core::mem::transmute::<[__m128i; 4usize], [i8; 64usize]>(a.val.0) }
6160    }
6161    #[inline(always)]
6162    fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
6163        unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i8; 64usize]>(&a.val.0) }
6164    }
6165    #[inline(always)]
6166    fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
6167        unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i8; 64usize]>(&mut a.val.0) }
6168    }
6169    #[inline(always)]
6170    fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
6171        unsafe {
6172            core::ptr::copy_nonoverlapping(
6173                (&raw const a.val.0) as *const i8,
6174                dest.as_mut_ptr(),
6175                64usize,
6176            );
6177        }
6178    }
6179    #[inline(always)]
6180    fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
6181        unsafe {
6182            i8x64 {
6183                val: core::mem::transmute(a.val),
6184                simd: self,
6185            }
6186        }
6187    }
6188    #[inline(always)]
6189    fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
6190        unsafe {
6191            u8x64 {
6192                val: core::mem::transmute(a.val),
6193                simd: self,
6194            }
6195        }
6196    }
6197    #[inline(always)]
6198    fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6199        unsafe {
6200            if SHIFT >= 64usize {
6201                return b;
6202            }
6203            let result = cross_block_alignr_128x4(
6204                self.cvt_to_bytes_i8x64(b).val.0,
6205                self.cvt_to_bytes_i8x64(a).val.0,
6206                SHIFT,
6207            );
6208            self.cvt_from_bytes_i8x64(u8x64 {
6209                val: crate::support::Aligned512(result),
6210                simd: self,
6211            })
6212        }
6213    }
6214    #[inline(always)]
6215    fn slide_within_blocks_i8x64<const SHIFT: usize>(
6216        self,
6217        a: i8x64<Self>,
6218        b: i8x64<Self>,
6219    ) -> i8x64<Self> {
6220        let (a0, a1) = self.split_i8x64(a);
6221        let (b0, b1) = self.split_i8x64(b);
6222        self.combine_i8x32(
6223            self.slide_within_blocks_i8x32::<SHIFT>(a0, b0),
6224            self.slide_within_blocks_i8x32::<SHIFT>(a1, b1),
6225        )
6226    }
6227    #[inline(always)]
6228    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6229        let (a0, a1) = self.split_i8x64(a);
6230        let (b0, b1) = self.split_i8x64(b);
6231        self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
6232    }
6233    #[inline(always)]
6234    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6235        let (a0, a1) = self.split_i8x64(a);
6236        let (b0, b1) = self.split_i8x64(b);
6237        self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
6238    }
6239    #[inline(always)]
6240    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6241        let (a0, a1) = self.split_i8x64(a);
6242        let (b0, b1) = self.split_i8x64(b);
6243        self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
6244    }
6245    #[inline(always)]
6246    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6247        let (a0, a1) = self.split_i8x64(a);
6248        let (b0, b1) = self.split_i8x64(b);
6249        self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
6250    }
6251    #[inline(always)]
6252    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6253        let (a0, a1) = self.split_i8x64(a);
6254        let (b0, b1) = self.split_i8x64(b);
6255        self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
6256    }
6257    #[inline(always)]
6258    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6259        let (a0, a1) = self.split_i8x64(a);
6260        let (b0, b1) = self.split_i8x64(b);
6261        self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
6262    }
6263    #[inline(always)]
6264    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
6265        let (a0, a1) = self.split_i8x64(a);
6266        self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
6267    }
6268    #[inline(always)]
6269    fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
6270        let (a0, a1) = self.split_i8x64(a);
6271        self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift))
6272    }
6273    #[inline(always)]
6274    fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6275        let (a0, a1) = self.split_i8x64(a);
6276        let (b0, b1) = self.split_i8x64(b);
6277        self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1))
6278    }
6279    #[inline(always)]
6280    fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
6281        let (a0, a1) = self.split_i8x64(a);
6282        self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift))
6283    }
6284    #[inline(always)]
6285    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6286        let (a0, a1) = self.split_i8x64(a);
6287        let (b0, b1) = self.split_i8x64(b);
6288        self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
6289    }
6290    #[inline(always)]
6291    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6292        let (a0, a1) = self.split_i8x64(a);
6293        let (b0, b1) = self.split_i8x64(b);
6294        self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
6295    }
6296    #[inline(always)]
6297    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6298        let (a0, a1) = self.split_i8x64(a);
6299        let (b0, b1) = self.split_i8x64(b);
6300        self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
6301    }
6302    #[inline(always)]
6303    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6304        let (a0, a1) = self.split_i8x64(a);
6305        let (b0, b1) = self.split_i8x64(b);
6306        self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
6307    }
6308    #[inline(always)]
6309    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6310        let (a0, a1) = self.split_i8x64(a);
6311        let (b0, b1) = self.split_i8x64(b);
6312        self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
6313    }
6314    #[inline(always)]
6315    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6316        let (a0, a1) = self.split_i8x64(a);
6317        let (b0, b1) = self.split_i8x64(b);
6318        self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
6319    }
6320    #[inline(always)]
6321    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6322        let (a0, _) = self.split_i8x64(a);
6323        let (b0, _) = self.split_i8x64(b);
6324        self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
6325    }
6326    #[inline(always)]
6327    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6328        let (_, a1) = self.split_i8x64(a);
6329        let (_, b1) = self.split_i8x64(b);
6330        self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
6331    }
6332    #[inline(always)]
6333    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6334        let (a0, a1) = self.split_i8x64(a);
6335        let (b0, b1) = self.split_i8x64(b);
6336        self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
6337    }
6338    #[inline(always)]
6339    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6340        let (a0, a1) = self.split_i8x64(a);
6341        let (b0, b1) = self.split_i8x64(b);
6342        self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
6343    }
6344    #[inline(always)]
6345    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
6346        let (a0, a1) = self.split_i8x64(a);
6347        let (b0, b1) = self.split_i8x64(b);
6348        let lo_lo = self.zip_low_i8x32(a0, b0);
6349        let lo_hi = self.zip_high_i8x32(a0, b0);
6350        let hi_lo = self.zip_low_i8x32(a1, b1);
6351        let hi_hi = self.zip_high_i8x32(a1, b1);
6352        (
6353            self.combine_i8x32(lo_lo, lo_hi),
6354            self.combine_i8x32(hi_lo, hi_hi),
6355        )
6356    }
6357    #[inline(always)]
6358    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
6359        let (a0, a1) = self.split_i8x64(a);
6360        let (b0, b1) = self.split_i8x64(b);
6361        let lo_even = self.unzip_low_i8x32(a0, a1);
6362        let lo_odd = self.unzip_high_i8x32(a0, a1);
6363        let hi_even = self.unzip_low_i8x32(b0, b1);
6364        let hi_odd = self.unzip_high_i8x32(b0, b1);
6365        (
6366            self.combine_i8x32(lo_even, hi_even),
6367            self.combine_i8x32(lo_odd, hi_odd),
6368        )
6369    }
6370    #[inline(always)]
6371    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
6372        let (a0, a1) = self.split_mask8x64(a);
6373        let (b0, b1) = self.split_i8x64(b);
6374        let (c0, c1) = self.split_i8x64(c);
6375        self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
6376    }
6377    #[inline(always)]
6378    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6379        let (a0, a1) = self.split_i8x64(a);
6380        let (b0, b1) = self.split_i8x64(b);
6381        self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
6382    }
6383    #[inline(always)]
6384    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6385        let (a0, a1) = self.split_i8x64(a);
6386        let (b0, b1) = self.split_i8x64(b);
6387        self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
6388    }
6389    #[inline(always)]
6390    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
6391        (
6392            i8x32 {
6393                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
6394                simd: self,
6395            },
6396            i8x32 {
6397                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
6398                simd: self,
6399            },
6400        )
6401    }
6402    #[inline(always)]
6403    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
6404        let (a0, a1) = self.split_i8x64(a);
6405        self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
6406    }
6407    #[inline(always)]
6408    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
6409        let (a0, a1) = self.split_i8x64(a);
6410        self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
6411    }
6412    #[inline(always)]
6413    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
6414        let (a0, a1) = self.split_i8x64(a);
6415        self.combine_u32x8(
6416            self.reinterpret_u32_i8x32(a0),
6417            self.reinterpret_u32_i8x32(a1),
6418        )
6419    }
6420    #[inline(always)]
6421    fn splat_u8x64(self, val: u8) -> u8x64<Self> {
6422        let half = self.splat_u8x32(val);
6423        self.combine_u8x32(half, half)
6424    }
6425    #[inline(always)]
6426    fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
6427        u8x64 {
6428            val: unsafe { core::mem::transmute_copy(&val) },
6429            simd: self,
6430        }
6431    }
6432    #[inline(always)]
6433    fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
6434        u8x64 {
6435            val: unsafe { core::mem::transmute_copy(val) },
6436            simd: self,
6437        }
6438    }
6439    #[inline(always)]
6440    fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
6441        unsafe { core::mem::transmute::<[__m128i; 4usize], [u8; 64usize]>(a.val.0) }
6442    }
6443    #[inline(always)]
6444    fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
6445        unsafe { core::mem::transmute::<&[__m128i; 4usize], &[u8; 64usize]>(&a.val.0) }
6446    }
6447    #[inline(always)]
6448    fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
6449        unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [u8; 64usize]>(&mut a.val.0) }
6450    }
6451    #[inline(always)]
6452    fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
6453        unsafe {
6454            core::ptr::copy_nonoverlapping(
6455                (&raw const a.val.0) as *const u8,
6456                dest.as_mut_ptr(),
6457                64usize,
6458            );
6459        }
6460    }
6461    #[inline(always)]
6462    fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
6463        unsafe {
6464            u8x64 {
6465                val: core::mem::transmute(a.val),
6466                simd: self,
6467            }
6468        }
6469    }
6470    #[inline(always)]
6471    fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
6472        unsafe {
6473            u8x64 {
6474                val: core::mem::transmute(a.val),
6475                simd: self,
6476            }
6477        }
6478    }
6479    #[inline(always)]
6480    fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6481        unsafe {
6482            if SHIFT >= 64usize {
6483                return b;
6484            }
6485            let result = cross_block_alignr_128x4(
6486                self.cvt_to_bytes_u8x64(b).val.0,
6487                self.cvt_to_bytes_u8x64(a).val.0,
6488                SHIFT,
6489            );
6490            self.cvt_from_bytes_u8x64(u8x64 {
6491                val: crate::support::Aligned512(result),
6492                simd: self,
6493            })
6494        }
6495    }
6496    #[inline(always)]
6497    fn slide_within_blocks_u8x64<const SHIFT: usize>(
6498        self,
6499        a: u8x64<Self>,
6500        b: u8x64<Self>,
6501    ) -> u8x64<Self> {
6502        let (a0, a1) = self.split_u8x64(a);
6503        let (b0, b1) = self.split_u8x64(b);
6504        self.combine_u8x32(
6505            self.slide_within_blocks_u8x32::<SHIFT>(a0, b0),
6506            self.slide_within_blocks_u8x32::<SHIFT>(a1, b1),
6507        )
6508    }
6509    #[inline(always)]
6510    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6511        let (a0, a1) = self.split_u8x64(a);
6512        let (b0, b1) = self.split_u8x64(b);
6513        self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
6514    }
6515    #[inline(always)]
6516    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6517        let (a0, a1) = self.split_u8x64(a);
6518        let (b0, b1) = self.split_u8x64(b);
6519        self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
6520    }
6521    #[inline(always)]
6522    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6523        let (a0, a1) = self.split_u8x64(a);
6524        let (b0, b1) = self.split_u8x64(b);
6525        self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
6526    }
6527    #[inline(always)]
6528    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6529        let (a0, a1) = self.split_u8x64(a);
6530        let (b0, b1) = self.split_u8x64(b);
6531        self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
6532    }
6533    #[inline(always)]
6534    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6535        let (a0, a1) = self.split_u8x64(a);
6536        let (b0, b1) = self.split_u8x64(b);
6537        self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
6538    }
6539    #[inline(always)]
6540    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6541        let (a0, a1) = self.split_u8x64(a);
6542        let (b0, b1) = self.split_u8x64(b);
6543        self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
6544    }
6545    #[inline(always)]
6546    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
6547        let (a0, a1) = self.split_u8x64(a);
6548        self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
6549    }
6550    #[inline(always)]
6551    fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
6552        let (a0, a1) = self.split_u8x64(a);
6553        self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift))
6554    }
6555    #[inline(always)]
6556    fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6557        let (a0, a1) = self.split_u8x64(a);
6558        let (b0, b1) = self.split_u8x64(b);
6559        self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1))
6560    }
6561    #[inline(always)]
6562    fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
6563        let (a0, a1) = self.split_u8x64(a);
6564        self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift))
6565    }
6566    #[inline(always)]
6567    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6568        let (a0, a1) = self.split_u8x64(a);
6569        let (b0, b1) = self.split_u8x64(b);
6570        self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
6571    }
6572    #[inline(always)]
6573    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6574        let (a0, a1) = self.split_u8x64(a);
6575        let (b0, b1) = self.split_u8x64(b);
6576        self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
6577    }
6578    #[inline(always)]
6579    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6580        let (a0, a1) = self.split_u8x64(a);
6581        let (b0, b1) = self.split_u8x64(b);
6582        self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
6583    }
6584    #[inline(always)]
6585    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6586        let (a0, a1) = self.split_u8x64(a);
6587        let (b0, b1) = self.split_u8x64(b);
6588        self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
6589    }
6590    #[inline(always)]
6591    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6592        let (a0, a1) = self.split_u8x64(a);
6593        let (b0, b1) = self.split_u8x64(b);
6594        self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
6595    }
6596    #[inline(always)]
6597    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6598        let (a0, a1) = self.split_u8x64(a);
6599        let (b0, b1) = self.split_u8x64(b);
6600        self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
6601    }
6602    #[inline(always)]
6603    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6604        let (a0, _) = self.split_u8x64(a);
6605        let (b0, _) = self.split_u8x64(b);
6606        self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
6607    }
6608    #[inline(always)]
6609    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6610        let (_, a1) = self.split_u8x64(a);
6611        let (_, b1) = self.split_u8x64(b);
6612        self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
6613    }
6614    #[inline(always)]
6615    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6616        let (a0, a1) = self.split_u8x64(a);
6617        let (b0, b1) = self.split_u8x64(b);
6618        self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
6619    }
6620    #[inline(always)]
6621    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6622        let (a0, a1) = self.split_u8x64(a);
6623        let (b0, b1) = self.split_u8x64(b);
6624        self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
6625    }
6626    #[inline(always)]
6627    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
6628        let (a0, a1) = self.split_u8x64(a);
6629        let (b0, b1) = self.split_u8x64(b);
6630        let lo_lo = self.zip_low_u8x32(a0, b0);
6631        let lo_hi = self.zip_high_u8x32(a0, b0);
6632        let hi_lo = self.zip_low_u8x32(a1, b1);
6633        let hi_hi = self.zip_high_u8x32(a1, b1);
6634        (
6635            self.combine_u8x32(lo_lo, lo_hi),
6636            self.combine_u8x32(hi_lo, hi_hi),
6637        )
6638    }
6639    #[inline(always)]
6640    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
6641        let (a0, a1) = self.split_u8x64(a);
6642        let (b0, b1) = self.split_u8x64(b);
6643        let lo_even = self.unzip_low_u8x32(a0, a1);
6644        let lo_odd = self.unzip_high_u8x32(a0, a1);
6645        let hi_even = self.unzip_low_u8x32(b0, b1);
6646        let hi_odd = self.unzip_high_u8x32(b0, b1);
6647        (
6648            self.combine_u8x32(lo_even, hi_even),
6649            self.combine_u8x32(lo_odd, hi_odd),
6650        )
6651    }
6652    #[inline(always)]
6653    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
6654        let (a0, a1) = self.split_mask8x64(a);
6655        let (b0, b1) = self.split_u8x64(b);
6656        let (c0, c1) = self.split_u8x64(c);
6657        self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
6658    }
6659    #[inline(always)]
6660    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6661        let (a0, a1) = self.split_u8x64(a);
6662        let (b0, b1) = self.split_u8x64(b);
6663        self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
6664    }
6665    #[inline(always)]
6666    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6667        let (a0, a1) = self.split_u8x64(a);
6668        let (b0, b1) = self.split_u8x64(b);
6669        self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
6670    }
6671    #[inline(always)]
6672    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
6673        (
6674            u8x32 {
6675                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
6676                simd: self,
6677            },
6678            u8x32 {
6679                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
6680                simd: self,
6681            },
6682        )
6683    }
6684    #[inline(always)]
6685    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
6686        unsafe {
6687            let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
6688            let v1 = _mm_loadu_si128(src.as_ptr().add(16usize) as *const _);
6689            let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 16usize) as *const _);
6690            let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 16usize) as *const _);
6691            let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
6692            let v0 = _mm_shuffle_epi8(v0, mask);
6693            let v1 = _mm_shuffle_epi8(v1, mask);
6694            let v2 = _mm_shuffle_epi8(v2, mask);
6695            let v3 = _mm_shuffle_epi8(v3, mask);
6696            let tmp0 = _mm_unpacklo_epi32(v0, v1);
6697            let tmp1 = _mm_unpackhi_epi32(v0, v1);
6698            let tmp2 = _mm_unpacklo_epi32(v2, v3);
6699            let tmp3 = _mm_unpackhi_epi32(v2, v3);
6700            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
6701            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
6702            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
6703            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
6704            self.combine_u8x32(
6705                self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)),
6706                self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)),
6707            )
6708        }
6709    }
6710    #[inline(always)]
6711    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
6712        let (v01, v23) = self.split_u8x64(a);
6713        let (v0, v1) = self.split_u8x32(v01);
6714        let (v2, v3) = self.split_u8x32(v23);
6715        let v0 = v0.into();
6716        let v1 = v1.into();
6717        let v2 = v2.into();
6718        let v3 = v3.into();
6719        unsafe {
6720            let tmp0 = _mm_unpacklo_epi32(v0, v1);
6721            let tmp1 = _mm_unpackhi_epi32(v0, v1);
6722            let tmp2 = _mm_unpacklo_epi32(v2, v3);
6723            let tmp3 = _mm_unpackhi_epi32(v2, v3);
6724            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
6725            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
6726            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
6727            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
6728            let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
6729            let out0 = _mm_shuffle_epi8(out0, mask);
6730            let out1 = _mm_shuffle_epi8(out1, mask);
6731            let out2 = _mm_shuffle_epi8(out2, mask);
6732            let out3 = _mm_shuffle_epi8(out3, mask);
6733            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
6734            _mm_storeu_si128(dest.as_mut_ptr().add(16usize) as *mut _, out1);
6735            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 16usize) as *mut _, out2);
6736            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 16usize) as *mut _, out3);
6737        }
6738    }
6739    #[inline(always)]
6740    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
6741        let (a0, a1) = self.split_u8x64(a);
6742        self.combine_u32x8(
6743            self.reinterpret_u32_u8x32(a0),
6744            self.reinterpret_u32_u8x32(a1),
6745        )
6746    }
6747    #[inline(always)]
6748    fn splat_mask8x64(self, val: i8) -> mask8x64<Self> {
6749        let half = self.splat_mask8x32(val);
6750        self.combine_mask8x32(half, half)
6751    }
6752    #[inline(always)]
6753    fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
6754        mask8x64 {
6755            val: unsafe { core::mem::transmute_copy(&val) },
6756            simd: self,
6757        }
6758    }
6759    #[inline(always)]
6760    fn load_array_ref_mask8x64(self, val: &[i8; 64usize]) -> mask8x64<Self> {
6761        mask8x64 {
6762            val: unsafe { core::mem::transmute_copy(val) },
6763            simd: self,
6764        }
6765    }
6766    #[inline(always)]
6767    fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
6768        unsafe { core::mem::transmute::<[__m128i; 4usize], [i8; 64usize]>(a.val.0) }
6769    }
6770    #[inline(always)]
6771    fn as_array_ref_mask8x64(self, a: &mask8x64<Self>) -> &[i8; 64usize] {
6772        unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i8; 64usize]>(&a.val.0) }
6773    }
6774    #[inline(always)]
6775    fn as_array_mut_mask8x64(self, a: &mut mask8x64<Self>) -> &mut [i8; 64usize] {
6776        unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i8; 64usize]>(&mut a.val.0) }
6777    }
6778    #[inline(always)]
6779    fn store_array_mask8x64(self, a: mask8x64<Self>, dest: &mut [i8; 64usize]) -> () {
6780        unsafe {
6781            core::ptr::copy_nonoverlapping(
6782                (&raw const a.val.0) as *const i8,
6783                dest.as_mut_ptr(),
6784                64usize,
6785            );
6786        }
6787    }
6788    #[inline(always)]
6789    fn cvt_from_bytes_mask8x64(self, a: u8x64<Self>) -> mask8x64<Self> {
6790        unsafe {
6791            mask8x64 {
6792                val: core::mem::transmute(a.val),
6793                simd: self,
6794            }
6795        }
6796    }
6797    #[inline(always)]
6798    fn cvt_to_bytes_mask8x64(self, a: mask8x64<Self>) -> u8x64<Self> {
6799        unsafe {
6800            u8x64 {
6801                val: core::mem::transmute(a.val),
6802                simd: self,
6803            }
6804        }
6805    }
6806    #[inline(always)]
6807    fn slide_mask8x64<const SHIFT: usize>(
6808        self,
6809        a: mask8x64<Self>,
6810        b: mask8x64<Self>,
6811    ) -> mask8x64<Self> {
6812        unsafe {
6813            if SHIFT >= 64usize {
6814                return b;
6815            }
6816            let result = cross_block_alignr_128x4(
6817                self.cvt_to_bytes_mask8x64(b).val.0,
6818                self.cvt_to_bytes_mask8x64(a).val.0,
6819                SHIFT,
6820            );
6821            self.cvt_from_bytes_mask8x64(u8x64 {
6822                val: crate::support::Aligned512(result),
6823                simd: self,
6824            })
6825        }
6826    }
6827    #[inline(always)]
6828    fn slide_within_blocks_mask8x64<const SHIFT: usize>(
6829        self,
6830        a: mask8x64<Self>,
6831        b: mask8x64<Self>,
6832    ) -> mask8x64<Self> {
6833        let (a0, a1) = self.split_mask8x64(a);
6834        let (b0, b1) = self.split_mask8x64(b);
6835        self.combine_mask8x32(
6836            self.slide_within_blocks_mask8x32::<SHIFT>(a0, b0),
6837            self.slide_within_blocks_mask8x32::<SHIFT>(a1, b1),
6838        )
6839    }
6840    #[inline(always)]
6841    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6842        let (a0, a1) = self.split_mask8x64(a);
6843        let (b0, b1) = self.split_mask8x64(b);
6844        self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
6845    }
6846    #[inline(always)]
6847    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6848        let (a0, a1) = self.split_mask8x64(a);
6849        let (b0, b1) = self.split_mask8x64(b);
6850        self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
6851    }
6852    #[inline(always)]
6853    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6854        let (a0, a1) = self.split_mask8x64(a);
6855        let (b0, b1) = self.split_mask8x64(b);
6856        self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
6857    }
6858    #[inline(always)]
6859    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
6860        let (a0, a1) = self.split_mask8x64(a);
6861        self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
6862    }
6863    #[inline(always)]
6864    fn select_mask8x64(
6865        self,
6866        a: mask8x64<Self>,
6867        b: mask8x64<Self>,
6868        c: mask8x64<Self>,
6869    ) -> mask8x64<Self> {
6870        let (a0, a1) = self.split_mask8x64(a);
6871        let (b0, b1) = self.split_mask8x64(b);
6872        let (c0, c1) = self.split_mask8x64(c);
6873        self.combine_mask8x32(
6874            self.select_mask8x32(a0, b0, c0),
6875            self.select_mask8x32(a1, b1, c1),
6876        )
6877    }
6878    #[inline(always)]
6879    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6880        let (a0, a1) = self.split_mask8x64(a);
6881        let (b0, b1) = self.split_mask8x64(b);
6882        self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
6883    }
6884    #[inline(always)]
6885    fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
6886        let (a0, a1) = self.split_mask8x64(a);
6887        self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1)
6888    }
6889    #[inline(always)]
6890    fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
6891        let (a0, a1) = self.split_mask8x64(a);
6892        self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1)
6893    }
6894    #[inline(always)]
6895    fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
6896        let (a0, a1) = self.split_mask8x64(a);
6897        self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1)
6898    }
6899    #[inline(always)]
6900    fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
6901        let (a0, a1) = self.split_mask8x64(a);
6902        self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1)
6903    }
6904    #[inline(always)]
6905    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
6906        (
6907            mask8x32 {
6908                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
6909                simd: self,
6910            },
6911            mask8x32 {
6912                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
6913                simd: self,
6914            },
6915        )
6916    }
6917    #[inline(always)]
6918    fn splat_i16x32(self, val: i16) -> i16x32<Self> {
6919        let half = self.splat_i16x16(val);
6920        self.combine_i16x16(half, half)
6921    }
6922    #[inline(always)]
6923    fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
6924        i16x32 {
6925            val: unsafe { core::mem::transmute_copy(&val) },
6926            simd: self,
6927        }
6928    }
6929    #[inline(always)]
6930    fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
6931        i16x32 {
6932            val: unsafe { core::mem::transmute_copy(val) },
6933            simd: self,
6934        }
6935    }
6936    #[inline(always)]
6937    fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
6938        unsafe { core::mem::transmute::<[__m128i; 4usize], [i16; 32usize]>(a.val.0) }
6939    }
6940    #[inline(always)]
6941    fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
6942        unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i16; 32usize]>(&a.val.0) }
6943    }
6944    #[inline(always)]
6945    fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
6946        unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i16; 32usize]>(&mut a.val.0) }
6947    }
6948    #[inline(always)]
6949    fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
6950        unsafe {
6951            core::ptr::copy_nonoverlapping(
6952                (&raw const a.val.0) as *const i16,
6953                dest.as_mut_ptr(),
6954                32usize,
6955            );
6956        }
6957    }
6958    #[inline(always)]
6959    fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
6960        unsafe {
6961            i16x32 {
6962                val: core::mem::transmute(a.val),
6963                simd: self,
6964            }
6965        }
6966    }
6967    #[inline(always)]
6968    fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
6969        unsafe {
6970            u8x64 {
6971                val: core::mem::transmute(a.val),
6972                simd: self,
6973            }
6974        }
6975    }
6976    #[inline(always)]
6977    fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6978        unsafe {
6979            if SHIFT >= 32usize {
6980                return b;
6981            }
6982            let result = cross_block_alignr_128x4(
6983                self.cvt_to_bytes_i16x32(b).val.0,
6984                self.cvt_to_bytes_i16x32(a).val.0,
6985                SHIFT * 2usize,
6986            );
6987            self.cvt_from_bytes_i16x32(u8x64 {
6988                val: crate::support::Aligned512(result),
6989                simd: self,
6990            })
6991        }
6992    }
6993    #[inline(always)]
6994    fn slide_within_blocks_i16x32<const SHIFT: usize>(
6995        self,
6996        a: i16x32<Self>,
6997        b: i16x32<Self>,
6998    ) -> i16x32<Self> {
6999        let (a0, a1) = self.split_i16x32(a);
7000        let (b0, b1) = self.split_i16x32(b);
7001        self.combine_i16x16(
7002            self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
7003            self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
7004        )
7005    }
7006    #[inline(always)]
7007    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7008        let (a0, a1) = self.split_i16x32(a);
7009        let (b0, b1) = self.split_i16x32(b);
7010        self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
7011    }
7012    #[inline(always)]
7013    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7014        let (a0, a1) = self.split_i16x32(a);
7015        let (b0, b1) = self.split_i16x32(b);
7016        self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
7017    }
7018    #[inline(always)]
7019    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7020        let (a0, a1) = self.split_i16x32(a);
7021        let (b0, b1) = self.split_i16x32(b);
7022        self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
7023    }
7024    #[inline(always)]
7025    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7026        let (a0, a1) = self.split_i16x32(a);
7027        let (b0, b1) = self.split_i16x32(b);
7028        self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
7029    }
7030    #[inline(always)]
7031    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7032        let (a0, a1) = self.split_i16x32(a);
7033        let (b0, b1) = self.split_i16x32(b);
7034        self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
7035    }
7036    #[inline(always)]
7037    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7038        let (a0, a1) = self.split_i16x32(a);
7039        let (b0, b1) = self.split_i16x32(b);
7040        self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
7041    }
7042    #[inline(always)]
7043    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
7044        let (a0, a1) = self.split_i16x32(a);
7045        self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
7046    }
7047    #[inline(always)]
7048    fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
7049        let (a0, a1) = self.split_i16x32(a);
7050        self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift))
7051    }
7052    #[inline(always)]
7053    fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7054        let (a0, a1) = self.split_i16x32(a);
7055        let (b0, b1) = self.split_i16x32(b);
7056        self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1))
7057    }
7058    #[inline(always)]
7059    fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
7060        let (a0, a1) = self.split_i16x32(a);
7061        self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift))
7062    }
7063    #[inline(always)]
7064    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7065        let (a0, a1) = self.split_i16x32(a);
7066        let (b0, b1) = self.split_i16x32(b);
7067        self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
7068    }
7069    #[inline(always)]
7070    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7071        let (a0, a1) = self.split_i16x32(a);
7072        let (b0, b1) = self.split_i16x32(b);
7073        self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
7074    }
7075    #[inline(always)]
7076    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7077        let (a0, a1) = self.split_i16x32(a);
7078        let (b0, b1) = self.split_i16x32(b);
7079        self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
7080    }
7081    #[inline(always)]
7082    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7083        let (a0, a1) = self.split_i16x32(a);
7084        let (b0, b1) = self.split_i16x32(b);
7085        self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
7086    }
7087    #[inline(always)]
7088    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7089        let (a0, a1) = self.split_i16x32(a);
7090        let (b0, b1) = self.split_i16x32(b);
7091        self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
7092    }
7093    #[inline(always)]
7094    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7095        let (a0, a1) = self.split_i16x32(a);
7096        let (b0, b1) = self.split_i16x32(b);
7097        self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
7098    }
7099    #[inline(always)]
7100    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7101        let (a0, _) = self.split_i16x32(a);
7102        let (b0, _) = self.split_i16x32(b);
7103        self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
7104    }
7105    #[inline(always)]
7106    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7107        let (_, a1) = self.split_i16x32(a);
7108        let (_, b1) = self.split_i16x32(b);
7109        self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
7110    }
7111    #[inline(always)]
7112    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7113        let (a0, a1) = self.split_i16x32(a);
7114        let (b0, b1) = self.split_i16x32(b);
7115        self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
7116    }
7117    #[inline(always)]
7118    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7119        let (a0, a1) = self.split_i16x32(a);
7120        let (b0, b1) = self.split_i16x32(b);
7121        self.combine_i16x16(
7122            self.unzip_high_i16x16(a0, a1),
7123            self.unzip_high_i16x16(b0, b1),
7124        )
7125    }
7126    #[inline(always)]
7127    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
7128        let (a0, a1) = self.split_i16x32(a);
7129        let (b0, b1) = self.split_i16x32(b);
7130        let lo_lo = self.zip_low_i16x16(a0, b0);
7131        let lo_hi = self.zip_high_i16x16(a0, b0);
7132        let hi_lo = self.zip_low_i16x16(a1, b1);
7133        let hi_hi = self.zip_high_i16x16(a1, b1);
7134        (
7135            self.combine_i16x16(lo_lo, lo_hi),
7136            self.combine_i16x16(hi_lo, hi_hi),
7137        )
7138    }
7139    #[inline(always)]
7140    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
7141        let (a0, a1) = self.split_i16x32(a);
7142        let (b0, b1) = self.split_i16x32(b);
7143        let lo_even = self.unzip_low_i16x16(a0, a1);
7144        let lo_odd = self.unzip_high_i16x16(a0, a1);
7145        let hi_even = self.unzip_low_i16x16(b0, b1);
7146        let hi_odd = self.unzip_high_i16x16(b0, b1);
7147        (
7148            self.combine_i16x16(lo_even, hi_even),
7149            self.combine_i16x16(lo_odd, hi_odd),
7150        )
7151    }
7152    #[inline(always)]
7153    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
7154        let (a0, a1) = self.split_mask16x32(a);
7155        let (b0, b1) = self.split_i16x32(b);
7156        let (c0, c1) = self.split_i16x32(c);
7157        self.combine_i16x16(
7158            self.select_i16x16(a0, b0, c0),
7159            self.select_i16x16(a1, b1, c1),
7160        )
7161    }
7162    #[inline(always)]
7163    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7164        let (a0, a1) = self.split_i16x32(a);
7165        let (b0, b1) = self.split_i16x32(b);
7166        self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
7167    }
7168    #[inline(always)]
7169    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7170        let (a0, a1) = self.split_i16x32(a);
7171        let (b0, b1) = self.split_i16x32(b);
7172        self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
7173    }
7174    #[inline(always)]
7175    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
7176        (
7177            i16x16 {
7178                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
7179                simd: self,
7180            },
7181            i16x16 {
7182                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
7183                simd: self,
7184            },
7185        )
7186    }
7187    #[inline(always)]
7188    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
7189        let (a0, a1) = self.split_i16x32(a);
7190        self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
7191    }
7192    #[inline(always)]
7193    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
7194        let (a0, a1) = self.split_i16x32(a);
7195        self.combine_u8x32(
7196            self.reinterpret_u8_i16x16(a0),
7197            self.reinterpret_u8_i16x16(a1),
7198        )
7199    }
7200    #[inline(always)]
7201    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
7202        let (a0, a1) = self.split_i16x32(a);
7203        self.combine_u32x8(
7204            self.reinterpret_u32_i16x16(a0),
7205            self.reinterpret_u32_i16x16(a1),
7206        )
7207    }
7208    #[inline(always)]
7209    fn splat_u16x32(self, val: u16) -> u16x32<Self> {
7210        let half = self.splat_u16x16(val);
7211        self.combine_u16x16(half, half)
7212    }
7213    #[inline(always)]
7214    fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
7215        u16x32 {
7216            val: unsafe { core::mem::transmute_copy(&val) },
7217            simd: self,
7218        }
7219    }
7220    #[inline(always)]
7221    fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
7222        u16x32 {
7223            val: unsafe { core::mem::transmute_copy(val) },
7224            simd: self,
7225        }
7226    }
7227    #[inline(always)]
7228    fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
7229        unsafe { core::mem::transmute::<[__m128i; 4usize], [u16; 32usize]>(a.val.0) }
7230    }
7231    #[inline(always)]
7232    fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
7233        unsafe { core::mem::transmute::<&[__m128i; 4usize], &[u16; 32usize]>(&a.val.0) }
7234    }
7235    #[inline(always)]
7236    fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
7237        unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [u16; 32usize]>(&mut a.val.0) }
7238    }
7239    #[inline(always)]
7240    fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
7241        unsafe {
7242            core::ptr::copy_nonoverlapping(
7243                (&raw const a.val.0) as *const u16,
7244                dest.as_mut_ptr(),
7245                32usize,
7246            );
7247        }
7248    }
7249    #[inline(always)]
7250    fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
7251        unsafe {
7252            u16x32 {
7253                val: core::mem::transmute(a.val),
7254                simd: self,
7255            }
7256        }
7257    }
7258    #[inline(always)]
7259    fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
7260        unsafe {
7261            u8x64 {
7262                val: core::mem::transmute(a.val),
7263                simd: self,
7264            }
7265        }
7266    }
7267    #[inline(always)]
7268    fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7269        unsafe {
7270            if SHIFT >= 32usize {
7271                return b;
7272            }
7273            let result = cross_block_alignr_128x4(
7274                self.cvt_to_bytes_u16x32(b).val.0,
7275                self.cvt_to_bytes_u16x32(a).val.0,
7276                SHIFT * 2usize,
7277            );
7278            self.cvt_from_bytes_u16x32(u8x64 {
7279                val: crate::support::Aligned512(result),
7280                simd: self,
7281            })
7282        }
7283    }
7284    #[inline(always)]
7285    fn slide_within_blocks_u16x32<const SHIFT: usize>(
7286        self,
7287        a: u16x32<Self>,
7288        b: u16x32<Self>,
7289    ) -> u16x32<Self> {
7290        let (a0, a1) = self.split_u16x32(a);
7291        let (b0, b1) = self.split_u16x32(b);
7292        self.combine_u16x16(
7293            self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
7294            self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
7295        )
7296    }
7297    #[inline(always)]
7298    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7299        let (a0, a1) = self.split_u16x32(a);
7300        let (b0, b1) = self.split_u16x32(b);
7301        self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
7302    }
7303    #[inline(always)]
7304    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7305        let (a0, a1) = self.split_u16x32(a);
7306        let (b0, b1) = self.split_u16x32(b);
7307        self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
7308    }
7309    #[inline(always)]
7310    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7311        let (a0, a1) = self.split_u16x32(a);
7312        let (b0, b1) = self.split_u16x32(b);
7313        self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
7314    }
7315    #[inline(always)]
7316    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7317        let (a0, a1) = self.split_u16x32(a);
7318        let (b0, b1) = self.split_u16x32(b);
7319        self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
7320    }
7321    #[inline(always)]
7322    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7323        let (a0, a1) = self.split_u16x32(a);
7324        let (b0, b1) = self.split_u16x32(b);
7325        self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
7326    }
7327    #[inline(always)]
7328    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7329        let (a0, a1) = self.split_u16x32(a);
7330        let (b0, b1) = self.split_u16x32(b);
7331        self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
7332    }
7333    #[inline(always)]
7334    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
7335        let (a0, a1) = self.split_u16x32(a);
7336        self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
7337    }
7338    #[inline(always)]
7339    fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
7340        let (a0, a1) = self.split_u16x32(a);
7341        self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift))
7342    }
7343    #[inline(always)]
7344    fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7345        let (a0, a1) = self.split_u16x32(a);
7346        let (b0, b1) = self.split_u16x32(b);
7347        self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1))
7348    }
7349    #[inline(always)]
7350    fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
7351        let (a0, a1) = self.split_u16x32(a);
7352        self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift))
7353    }
7354    #[inline(always)]
7355    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7356        let (a0, a1) = self.split_u16x32(a);
7357        let (b0, b1) = self.split_u16x32(b);
7358        self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
7359    }
7360    #[inline(always)]
7361    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7362        let (a0, a1) = self.split_u16x32(a);
7363        let (b0, b1) = self.split_u16x32(b);
7364        self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
7365    }
7366    #[inline(always)]
7367    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7368        let (a0, a1) = self.split_u16x32(a);
7369        let (b0, b1) = self.split_u16x32(b);
7370        self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
7371    }
7372    #[inline(always)]
7373    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7374        let (a0, a1) = self.split_u16x32(a);
7375        let (b0, b1) = self.split_u16x32(b);
7376        self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
7377    }
7378    #[inline(always)]
7379    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7380        let (a0, a1) = self.split_u16x32(a);
7381        let (b0, b1) = self.split_u16x32(b);
7382        self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
7383    }
7384    #[inline(always)]
7385    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7386        let (a0, a1) = self.split_u16x32(a);
7387        let (b0, b1) = self.split_u16x32(b);
7388        self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
7389    }
7390    #[inline(always)]
7391    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7392        let (a0, _) = self.split_u16x32(a);
7393        let (b0, _) = self.split_u16x32(b);
7394        self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
7395    }
7396    #[inline(always)]
7397    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7398        let (_, a1) = self.split_u16x32(a);
7399        let (_, b1) = self.split_u16x32(b);
7400        self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
7401    }
7402    #[inline(always)]
7403    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7404        let (a0, a1) = self.split_u16x32(a);
7405        let (b0, b1) = self.split_u16x32(b);
7406        self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
7407    }
7408    #[inline(always)]
7409    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7410        let (a0, a1) = self.split_u16x32(a);
7411        let (b0, b1) = self.split_u16x32(b);
7412        self.combine_u16x16(
7413            self.unzip_high_u16x16(a0, a1),
7414            self.unzip_high_u16x16(b0, b1),
7415        )
7416    }
7417    #[inline(always)]
7418    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
7419        let (a0, a1) = self.split_u16x32(a);
7420        let (b0, b1) = self.split_u16x32(b);
7421        let lo_lo = self.zip_low_u16x16(a0, b0);
7422        let lo_hi = self.zip_high_u16x16(a0, b0);
7423        let hi_lo = self.zip_low_u16x16(a1, b1);
7424        let hi_hi = self.zip_high_u16x16(a1, b1);
7425        (
7426            self.combine_u16x16(lo_lo, lo_hi),
7427            self.combine_u16x16(hi_lo, hi_hi),
7428        )
7429    }
7430    #[inline(always)]
7431    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
7432        let (a0, a1) = self.split_u16x32(a);
7433        let (b0, b1) = self.split_u16x32(b);
7434        let lo_even = self.unzip_low_u16x16(a0, a1);
7435        let lo_odd = self.unzip_high_u16x16(a0, a1);
7436        let hi_even = self.unzip_low_u16x16(b0, b1);
7437        let hi_odd = self.unzip_high_u16x16(b0, b1);
7438        (
7439            self.combine_u16x16(lo_even, hi_even),
7440            self.combine_u16x16(lo_odd, hi_odd),
7441        )
7442    }
7443    #[inline(always)]
7444    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
7445        let (a0, a1) = self.split_mask16x32(a);
7446        let (b0, b1) = self.split_u16x32(b);
7447        let (c0, c1) = self.split_u16x32(c);
7448        self.combine_u16x16(
7449            self.select_u16x16(a0, b0, c0),
7450            self.select_u16x16(a1, b1, c1),
7451        )
7452    }
7453    #[inline(always)]
7454    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7455        let (a0, a1) = self.split_u16x32(a);
7456        let (b0, b1) = self.split_u16x32(b);
7457        self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
7458    }
7459    #[inline(always)]
7460    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7461        let (a0, a1) = self.split_u16x32(a);
7462        let (b0, b1) = self.split_u16x32(b);
7463        self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
7464    }
7465    #[inline(always)]
7466    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
7467        (
7468            u16x16 {
7469                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
7470                simd: self,
7471            },
7472            u16x16 {
7473                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
7474                simd: self,
7475            },
7476        )
7477    }
    /// Loads 32 `u16` values from `src` as four unaligned 128-bit vectors and rearranges
    /// them with a byte shuffle followed by 32-bit and 64-bit unpacks.
    ///
    /// Viewing `src` as eight consecutive groups of four elements `(c0, c1, c2, c3)`, the
    /// four 128-bit outputs `out0..out3` hold all `c0`s, all `c1`s, all `c2`s and all `c3`s
    /// respectively, each in group order.
    ///
    /// NOTE(review): the final lane order of the returned vector assumes the `combine_*`
    /// helpers concatenate their first argument before their second — confirm.
    #[inline(always)]
    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
        // SAFETY: the four unaligned loads read elements 0..8, 8..16, 16..24 and 24..32,
        // all inside the 32-element `src`.
        // NOTE(review): the intrinsics also need the CPU features promised by this token.
        unsafe {
            let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
            let v1 = _mm_loadu_si128(src.as_ptr().add(8usize) as *const _);
            let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 8usize) as *const _);
            let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 8usize) as *const _);
            // Byte shuffle that reorders the eight 16-bit elements of a vector to
            // 0, 4, 1, 5, 2, 6, 3, 7, so each 32-bit lane holds the two values of one channel.
            let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
            let v0 = _mm_shuffle_epi8(v0, mask);
            let v1 = _mm_shuffle_epi8(v1, mask);
            let v2 = _mm_shuffle_epi8(v2, mask);
            let v3 = _mm_shuffle_epi8(v3, mask);
            // 32-bit unpacks pair up the per-channel lanes of neighbouring vectors:
            // the `lo` results carry channels 0 and 1, the `hi` results channels 2 and 3.
            let tmp0 = _mm_unpacklo_epi32(v0, v1);
            let tmp1 = _mm_unpackhi_epi32(v0, v1);
            let tmp2 = _mm_unpacklo_epi32(v2, v3);
            let tmp3 = _mm_unpackhi_epi32(v2, v3);
            // 64-bit unpacks gather all eight values of one channel into a single vector.
            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
            self.combine_u16x16(
                self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)),
                self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)),
            )
        }
    }
    /// Splits `a` into four 128-bit vectors, rearranges them with 32-bit and 64-bit unpacks
    /// followed by a byte shuffle, and writes the result to `dest` with four unaligned stores.
    ///
    /// Treating the four 128-bit blocks of `a` as channels `c0..c3` of eight `u16` each,
    /// `dest` receives eight consecutive groups `(c0[i], c1[i], c2[i], c3[i])` for `i` in `0..8`.
    ///
    /// NOTE(review): the block-to-channel mapping assumes the `split_*` helpers return the
    /// leading half first, as `split_u16x32` does — confirm for `split_u16x16`.
    #[inline(always)]
    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
        let (v01, v23) = self.split_u16x32(a);
        let (v0, v1) = self.split_u16x16(v01);
        let (v2, v3) = self.split_u16x16(v23);
        // Convert each `u16x8` into the raw vector type expected by the intrinsics below
        // (presumably `__m128i`, inferred from their signatures).
        let v0 = v0.into();
        let v1 = v1.into();
        let v2 = v2.into();
        let v3 = v3.into();
        // SAFETY: the four unaligned stores write elements 0..8, 8..16, 16..24 and 24..32,
        // all inside the 32-element `dest`.
        // NOTE(review): the intrinsics also need the CPU features promised by this token.
        unsafe {
            // 32-bit unpacks interleave pairs of values from channels 0/1 and from 2/3.
            let tmp0 = _mm_unpacklo_epi32(v0, v1);
            let tmp1 = _mm_unpackhi_epi32(v0, v1);
            let tmp2 = _mm_unpacklo_epi32(v2, v3);
            let tmp3 = _mm_unpackhi_epi32(v2, v3);
            // 64-bit unpacks bring one value pair from every channel into each vector.
            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
            // Byte shuffle that reorders the eight 16-bit elements of a vector to
            // 0, 2, 4, 6, 1, 3, 5, 7, turning per-channel pairs into two four-channel groups.
            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
            let out0 = _mm_shuffle_epi8(out0, mask);
            let out1 = _mm_shuffle_epi8(out1, mask);
            let out2 = _mm_shuffle_epi8(out2, mask);
            let out3 = _mm_shuffle_epi8(out3, mask);
            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
            _mm_storeu_si128(dest.as_mut_ptr().add(8usize) as *mut _, out1);
            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 8usize) as *mut _, out2);
            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 8usize) as *mut _, out3);
        }
    }
7533    #[inline(always)]
7534    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
7535        let (a0, a1) = self.split_u16x32(a);
7536        self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1))
7537    }
7538    #[inline(always)]
7539    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
7540        let (a0, a1) = self.split_u16x32(a);
7541        self.combine_u8x32(
7542            self.reinterpret_u8_u16x16(a0),
7543            self.reinterpret_u8_u16x16(a1),
7544        )
7545    }
7546    #[inline(always)]
7547    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
7548        let (a0, a1) = self.split_u16x32(a);
7549        self.combine_u32x8(
7550            self.reinterpret_u32_u16x16(a0),
7551            self.reinterpret_u32_u16x16(a1),
7552        )
7553    }
7554    #[inline(always)]
7555    fn splat_mask16x32(self, val: i16) -> mask16x32<Self> {
7556        let half = self.splat_mask16x16(val);
7557        self.combine_mask16x16(half, half)
7558    }
7559    #[inline(always)]
7560    fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
7561        mask16x32 {
7562            val: unsafe { core::mem::transmute_copy(&val) },
7563            simd: self,
7564        }
7565    }
7566    #[inline(always)]
7567    fn load_array_ref_mask16x32(self, val: &[i16; 32usize]) -> mask16x32<Self> {
7568        mask16x32 {
7569            val: unsafe { core::mem::transmute_copy(val) },
7570            simd: self,
7571        }
7572    }
7573    #[inline(always)]
7574    fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
7575        unsafe { core::mem::transmute::<[__m128i; 4usize], [i16; 32usize]>(a.val.0) }
7576    }
7577    #[inline(always)]
7578    fn as_array_ref_mask16x32(self, a: &mask16x32<Self>) -> &[i16; 32usize] {
7579        unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i16; 32usize]>(&a.val.0) }
7580    }
7581    #[inline(always)]
7582    fn as_array_mut_mask16x32(self, a: &mut mask16x32<Self>) -> &mut [i16; 32usize] {
7583        unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i16; 32usize]>(&mut a.val.0) }
7584    }
7585    #[inline(always)]
7586    fn store_array_mask16x32(self, a: mask16x32<Self>, dest: &mut [i16; 32usize]) -> () {
7587        unsafe {
7588            core::ptr::copy_nonoverlapping(
7589                (&raw const a.val.0) as *const i16,
7590                dest.as_mut_ptr(),
7591                32usize,
7592            );
7593        }
7594    }
7595    #[inline(always)]
7596    fn cvt_from_bytes_mask16x32(self, a: u8x64<Self>) -> mask16x32<Self> {
7597        unsafe {
7598            mask16x32 {
7599                val: core::mem::transmute(a.val),
7600                simd: self,
7601            }
7602        }
7603    }
7604    #[inline(always)]
7605    fn cvt_to_bytes_mask16x32(self, a: mask16x32<Self>) -> u8x64<Self> {
7606        unsafe {
7607            u8x64 {
7608                val: core::mem::transmute(a.val),
7609                simd: self,
7610            }
7611        }
7612    }
7613    #[inline(always)]
7614    fn slide_mask16x32<const SHIFT: usize>(
7615        self,
7616        a: mask16x32<Self>,
7617        b: mask16x32<Self>,
7618    ) -> mask16x32<Self> {
7619        unsafe {
7620            if SHIFT >= 32usize {
7621                return b;
7622            }
7623            let result = cross_block_alignr_128x4(
7624                self.cvt_to_bytes_mask16x32(b).val.0,
7625                self.cvt_to_bytes_mask16x32(a).val.0,
7626                SHIFT * 2usize,
7627            );
7628            self.cvt_from_bytes_mask16x32(u8x64 {
7629                val: crate::support::Aligned512(result),
7630                simd: self,
7631            })
7632        }
7633    }
7634    #[inline(always)]
7635    fn slide_within_blocks_mask16x32<const SHIFT: usize>(
7636        self,
7637        a: mask16x32<Self>,
7638        b: mask16x32<Self>,
7639    ) -> mask16x32<Self> {
7640        let (a0, a1) = self.split_mask16x32(a);
7641        let (b0, b1) = self.split_mask16x32(b);
7642        self.combine_mask16x16(
7643            self.slide_within_blocks_mask16x16::<SHIFT>(a0, b0),
7644            self.slide_within_blocks_mask16x16::<SHIFT>(a1, b1),
7645        )
7646    }
7647    #[inline(always)]
7648    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7649        let (a0, a1) = self.split_mask16x32(a);
7650        let (b0, b1) = self.split_mask16x32(b);
7651        self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
7652    }
7653    #[inline(always)]
7654    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7655        let (a0, a1) = self.split_mask16x32(a);
7656        let (b0, b1) = self.split_mask16x32(b);
7657        self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
7658    }
7659    #[inline(always)]
7660    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7661        let (a0, a1) = self.split_mask16x32(a);
7662        let (b0, b1) = self.split_mask16x32(b);
7663        self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
7664    }
7665    #[inline(always)]
7666    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
7667        let (a0, a1) = self.split_mask16x32(a);
7668        self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
7669    }
7670    #[inline(always)]
7671    fn select_mask16x32(
7672        self,
7673        a: mask16x32<Self>,
7674        b: mask16x32<Self>,
7675        c: mask16x32<Self>,
7676    ) -> mask16x32<Self> {
7677        let (a0, a1) = self.split_mask16x32(a);
7678        let (b0, b1) = self.split_mask16x32(b);
7679        let (c0, c1) = self.split_mask16x32(c);
7680        self.combine_mask16x16(
7681            self.select_mask16x16(a0, b0, c0),
7682            self.select_mask16x16(a1, b1, c1),
7683        )
7684    }
7685    #[inline(always)]
7686    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7687        let (a0, a1) = self.split_mask16x32(a);
7688        let (b0, b1) = self.split_mask16x32(b);
7689        self.combine_mask16x16(
7690            self.simd_eq_mask16x16(a0, b0),
7691            self.simd_eq_mask16x16(a1, b1),
7692        )
7693    }
7694    #[inline(always)]
7695    fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
7696        let (a0, a1) = self.split_mask16x32(a);
7697        self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1)
7698    }
7699    #[inline(always)]
7700    fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
7701        let (a0, a1) = self.split_mask16x32(a);
7702        self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1)
7703    }
7704    #[inline(always)]
7705    fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
7706        let (a0, a1) = self.split_mask16x32(a);
7707        self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1)
7708    }
7709    #[inline(always)]
7710    fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
7711        let (a0, a1) = self.split_mask16x32(a);
7712        self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1)
7713    }
7714    #[inline(always)]
7715    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
7716        (
7717            mask16x16 {
7718                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
7719                simd: self,
7720            },
7721            mask16x16 {
7722                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
7723                simd: self,
7724            },
7725        )
7726    }
7727    #[inline(always)]
7728    fn splat_i32x16(self, val: i32) -> i32x16<Self> {
7729        let half = self.splat_i32x8(val);
7730        self.combine_i32x8(half, half)
7731    }
7732    #[inline(always)]
7733    fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
7734        i32x16 {
7735            val: unsafe { core::mem::transmute_copy(&val) },
7736            simd: self,
7737        }
7738    }
7739    #[inline(always)]
7740    fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
7741        i32x16 {
7742            val: unsafe { core::mem::transmute_copy(val) },
7743            simd: self,
7744        }
7745    }
7746    #[inline(always)]
7747    fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
7748        unsafe { core::mem::transmute::<[__m128i; 4usize], [i32; 16usize]>(a.val.0) }
7749    }
7750    #[inline(always)]
7751    fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
7752        unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i32; 16usize]>(&a.val.0) }
7753    }
7754    #[inline(always)]
7755    fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
7756        unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i32; 16usize]>(&mut a.val.0) }
7757    }
7758    #[inline(always)]
7759    fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
7760        unsafe {
7761            core::ptr::copy_nonoverlapping(
7762                (&raw const a.val.0) as *const i32,
7763                dest.as_mut_ptr(),
7764                16usize,
7765            );
7766        }
7767    }
7768    #[inline(always)]
7769    fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
7770        unsafe {
7771            i32x16 {
7772                val: core::mem::transmute(a.val),
7773                simd: self,
7774            }
7775        }
7776    }
7777    #[inline(always)]
7778    fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
7779        unsafe {
7780            u8x64 {
7781                val: core::mem::transmute(a.val),
7782                simd: self,
7783            }
7784        }
7785    }
7786    #[inline(always)]
7787    fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7788        unsafe {
7789            if SHIFT >= 16usize {
7790                return b;
7791            }
7792            let result = cross_block_alignr_128x4(
7793                self.cvt_to_bytes_i32x16(b).val.0,
7794                self.cvt_to_bytes_i32x16(a).val.0,
7795                SHIFT * 4usize,
7796            );
7797            self.cvt_from_bytes_i32x16(u8x64 {
7798                val: crate::support::Aligned512(result),
7799                simd: self,
7800            })
7801        }
7802    }
7803    #[inline(always)]
7804    fn slide_within_blocks_i32x16<const SHIFT: usize>(
7805        self,
7806        a: i32x16<Self>,
7807        b: i32x16<Self>,
7808    ) -> i32x16<Self> {
7809        let (a0, a1) = self.split_i32x16(a);
7810        let (b0, b1) = self.split_i32x16(b);
7811        self.combine_i32x8(
7812            self.slide_within_blocks_i32x8::<SHIFT>(a0, b0),
7813            self.slide_within_blocks_i32x8::<SHIFT>(a1, b1),
7814        )
7815    }
7816    #[inline(always)]
7817    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7818        let (a0, a1) = self.split_i32x16(a);
7819        let (b0, b1) = self.split_i32x16(b);
7820        self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
7821    }
7822    #[inline(always)]
7823    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7824        let (a0, a1) = self.split_i32x16(a);
7825        let (b0, b1) = self.split_i32x16(b);
7826        self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
7827    }
7828    #[inline(always)]
7829    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7830        let (a0, a1) = self.split_i32x16(a);
7831        let (b0, b1) = self.split_i32x16(b);
7832        self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
7833    }
7834    #[inline(always)]
7835    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7836        let (a0, a1) = self.split_i32x16(a);
7837        let (b0, b1) = self.split_i32x16(b);
7838        self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
7839    }
7840    #[inline(always)]
7841    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7842        let (a0, a1) = self.split_i32x16(a);
7843        let (b0, b1) = self.split_i32x16(b);
7844        self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
7845    }
7846    #[inline(always)]
7847    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7848        let (a0, a1) = self.split_i32x16(a);
7849        let (b0, b1) = self.split_i32x16(b);
7850        self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
7851    }
7852    #[inline(always)]
7853    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
7854        let (a0, a1) = self.split_i32x16(a);
7855        self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
7856    }
7857    #[inline(always)]
7858    fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
7859        let (a0, a1) = self.split_i32x16(a);
7860        self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift))
7861    }
7862    #[inline(always)]
7863    fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7864        let (a0, a1) = self.split_i32x16(a);
7865        let (b0, b1) = self.split_i32x16(b);
7866        self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1))
7867    }
7868    #[inline(always)]
7869    fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
7870        let (a0, a1) = self.split_i32x16(a);
7871        self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift))
7872    }
7873    #[inline(always)]
7874    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7875        let (a0, a1) = self.split_i32x16(a);
7876        let (b0, b1) = self.split_i32x16(b);
7877        self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1))
7878    }
7879    #[inline(always)]
7880    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7881        let (a0, a1) = self.split_i32x16(a);
7882        let (b0, b1) = self.split_i32x16(b);
7883        self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
7884    }
7885    #[inline(always)]
7886    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7887        let (a0, a1) = self.split_i32x16(a);
7888        let (b0, b1) = self.split_i32x16(b);
7889        self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
7890    }
7891    #[inline(always)]
7892    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7893        let (a0, a1) = self.split_i32x16(a);
7894        let (b0, b1) = self.split_i32x16(b);
7895        self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
7896    }
7897    #[inline(always)]
7898    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7899        let (a0, a1) = self.split_i32x16(a);
7900        let (b0, b1) = self.split_i32x16(b);
7901        self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
7902    }
7903    #[inline(always)]
7904    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7905        let (a0, a1) = self.split_i32x16(a);
7906        let (b0, b1) = self.split_i32x16(b);
7907        self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
7908    }
7909    #[inline(always)]
7910    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7911        let (a0, _) = self.split_i32x16(a);
7912        let (b0, _) = self.split_i32x16(b);
7913        self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
7914    }
7915    #[inline(always)]
7916    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7917        let (_, a1) = self.split_i32x16(a);
7918        let (_, b1) = self.split_i32x16(b);
7919        self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
7920    }
7921    #[inline(always)]
7922    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7923        let (a0, a1) = self.split_i32x16(a);
7924        let (b0, b1) = self.split_i32x16(b);
7925        self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
7926    }
7927    #[inline(always)]
7928    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7929        let (a0, a1) = self.split_i32x16(a);
7930        let (b0, b1) = self.split_i32x16(b);
7931        self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
7932    }
7933    #[inline(always)]
7934    fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
7935        let (a0, a1) = self.split_i32x16(a);
7936        let (b0, b1) = self.split_i32x16(b);
7937        let lo_lo = self.zip_low_i32x8(a0, b0);
7938        let lo_hi = self.zip_high_i32x8(a0, b0);
7939        let hi_lo = self.zip_low_i32x8(a1, b1);
7940        let hi_hi = self.zip_high_i32x8(a1, b1);
7941        (
7942            self.combine_i32x8(lo_lo, lo_hi),
7943            self.combine_i32x8(hi_lo, hi_hi),
7944        )
7945    }
7946    #[inline(always)]
7947    fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
7948        let (a0, a1) = self.split_i32x16(a);
7949        let (b0, b1) = self.split_i32x16(b);
7950        let lo_even = self.unzip_low_i32x8(a0, a1);
7951        let lo_odd = self.unzip_high_i32x8(a0, a1);
7952        let hi_even = self.unzip_low_i32x8(b0, b1);
7953        let hi_odd = self.unzip_high_i32x8(b0, b1);
7954        (
7955            self.combine_i32x8(lo_even, hi_even),
7956            self.combine_i32x8(lo_odd, hi_odd),
7957        )
7958    }
7959    #[inline(always)]
7960    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
7961        let (a0, a1) = self.split_mask32x16(a);
7962        let (b0, b1) = self.split_i32x16(b);
7963        let (c0, c1) = self.split_i32x16(c);
7964        self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
7965    }
7966    #[inline(always)]
7967    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7968        let (a0, a1) = self.split_i32x16(a);
7969        let (b0, b1) = self.split_i32x16(b);
7970        self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
7971    }
7972    #[inline(always)]
7973    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7974        let (a0, a1) = self.split_i32x16(a);
7975        let (b0, b1) = self.split_i32x16(b);
7976        self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
7977    }
7978    #[inline(always)]
7979    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
7980        (
7981            i32x8 {
7982                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
7983                simd: self,
7984            },
7985            i32x8 {
7986                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
7987                simd: self,
7988            },
7989        )
7990    }
7991    #[inline(always)]
7992    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
7993        let (a0, a1) = self.split_i32x16(a);
7994        self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1))
7995    }
7996    #[inline(always)]
7997    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
7998        let (a0, a1) = self.split_i32x16(a);
7999        self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
8000    }
8001    #[inline(always)]
8002    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
8003        let (a0, a1) = self.split_i32x16(a);
8004        self.combine_u32x8(
8005            self.reinterpret_u32_i32x8(a0),
8006            self.reinterpret_u32_i32x8(a1),
8007        )
8008    }
8009    #[inline(always)]
8010    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
8011        let (a0, a1) = self.split_i32x16(a);
8012        self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
8013    }
8014    #[inline(always)]
8015    fn splat_u32x16(self, val: u32) -> u32x16<Self> {
8016        let half = self.splat_u32x8(val);
8017        self.combine_u32x8(half, half)
8018    }
8019    #[inline(always)]
8020    fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
8021        u32x16 {
8022            val: unsafe { core::mem::transmute_copy(&val) },
8023            simd: self,
8024        }
8025    }
8026    #[inline(always)]
8027    fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
8028        u32x16 {
8029            val: unsafe { core::mem::transmute_copy(val) },
8030            simd: self,
8031        }
8032    }
8033    #[inline(always)]
8034    fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
8035        unsafe { core::mem::transmute::<[__m128i; 4usize], [u32; 16usize]>(a.val.0) }
8036    }
8037    #[inline(always)]
8038    fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
8039        unsafe { core::mem::transmute::<&[__m128i; 4usize], &[u32; 16usize]>(&a.val.0) }
8040    }
8041    #[inline(always)]
8042    fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
8043        unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [u32; 16usize]>(&mut a.val.0) }
8044    }
8045    #[inline(always)]
8046    fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
8047        unsafe {
8048            core::ptr::copy_nonoverlapping(
8049                (&raw const a.val.0) as *const u32,
8050                dest.as_mut_ptr(),
8051                16usize,
8052            );
8053        }
8054    }
8055    #[inline(always)]
8056    fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
8057        unsafe {
8058            u32x16 {
8059                val: core::mem::transmute(a.val),
8060                simd: self,
8061            }
8062        }
8063    }
8064    #[inline(always)]
8065    fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
8066        unsafe {
8067            u8x64 {
8068                val: core::mem::transmute(a.val),
8069                simd: self,
8070            }
8071        }
8072    }
8073    #[inline(always)]
8074    fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8075        unsafe {
8076            if SHIFT >= 16usize {
8077                return b;
8078            }
8079            let result = cross_block_alignr_128x4(
8080                self.cvt_to_bytes_u32x16(b).val.0,
8081                self.cvt_to_bytes_u32x16(a).val.0,
8082                SHIFT * 4usize,
8083            );
8084            self.cvt_from_bytes_u32x16(u8x64 {
8085                val: crate::support::Aligned512(result),
8086                simd: self,
8087            })
8088        }
8089    }
8090    #[inline(always)]
8091    fn slide_within_blocks_u32x16<const SHIFT: usize>(
8092        self,
8093        a: u32x16<Self>,
8094        b: u32x16<Self>,
8095    ) -> u32x16<Self> {
8096        let (a0, a1) = self.split_u32x16(a);
8097        let (b0, b1) = self.split_u32x16(b);
8098        self.combine_u32x8(
8099            self.slide_within_blocks_u32x8::<SHIFT>(a0, b0),
8100            self.slide_within_blocks_u32x8::<SHIFT>(a1, b1),
8101        )
8102    }
8103    #[inline(always)]
8104    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8105        let (a0, a1) = self.split_u32x16(a);
8106        let (b0, b1) = self.split_u32x16(b);
8107        self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
8108    }
8109    #[inline(always)]
8110    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8111        let (a0, a1) = self.split_u32x16(a);
8112        let (b0, b1) = self.split_u32x16(b);
8113        self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
8114    }
8115    #[inline(always)]
8116    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8117        let (a0, a1) = self.split_u32x16(a);
8118        let (b0, b1) = self.split_u32x16(b);
8119        self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
8120    }
8121    #[inline(always)]
8122    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8123        let (a0, a1) = self.split_u32x16(a);
8124        let (b0, b1) = self.split_u32x16(b);
8125        self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
8126    }
8127    #[inline(always)]
8128    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8129        let (a0, a1) = self.split_u32x16(a);
8130        let (b0, b1) = self.split_u32x16(b);
8131        self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
8132    }
8133    #[inline(always)]
8134    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8135        let (a0, a1) = self.split_u32x16(a);
8136        let (b0, b1) = self.split_u32x16(b);
8137        self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
8138    }
8139    #[inline(always)]
8140    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
8141        let (a0, a1) = self.split_u32x16(a);
8142        self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
8143    }
8144    #[inline(always)]
8145    fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
8146        let (a0, a1) = self.split_u32x16(a);
8147        self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift))
8148    }
8149    #[inline(always)]
8150    fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8151        let (a0, a1) = self.split_u32x16(a);
8152        let (b0, b1) = self.split_u32x16(b);
8153        self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1))
8154    }
8155    #[inline(always)]
8156    fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
8157        let (a0, a1) = self.split_u32x16(a);
8158        self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift))
8159    }
8160    #[inline(always)]
8161    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8162        let (a0, a1) = self.split_u32x16(a);
8163        let (b0, b1) = self.split_u32x16(b);
8164        self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
8165    }
8166    #[inline(always)]
8167    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8168        let (a0, a1) = self.split_u32x16(a);
8169        let (b0, b1) = self.split_u32x16(b);
8170        self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
8171    }
8172    #[inline(always)]
8173    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8174        let (a0, a1) = self.split_u32x16(a);
8175        let (b0, b1) = self.split_u32x16(b);
8176        self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
8177    }
8178    #[inline(always)]
8179    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8180        let (a0, a1) = self.split_u32x16(a);
8181        let (b0, b1) = self.split_u32x16(b);
8182        self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
8183    }
8184    #[inline(always)]
8185    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8186        let (a0, a1) = self.split_u32x16(a);
8187        let (b0, b1) = self.split_u32x16(b);
8188        self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
8189    }
8190    #[inline(always)]
8191    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8192        let (a0, a1) = self.split_u32x16(a);
8193        let (b0, b1) = self.split_u32x16(b);
8194        self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
8195    }
8196    #[inline(always)]
8197    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8198        let (a0, _) = self.split_u32x16(a);
8199        let (b0, _) = self.split_u32x16(b);
8200        self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
8201    }
8202    #[inline(always)]
8203    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8204        let (_, a1) = self.split_u32x16(a);
8205        let (_, b1) = self.split_u32x16(b);
8206        self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
8207    }
8208    #[inline(always)]
8209    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8210        let (a0, a1) = self.split_u32x16(a);
8211        let (b0, b1) = self.split_u32x16(b);
8212        self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
8213    }
8214    #[inline(always)]
8215    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8216        let (a0, a1) = self.split_u32x16(a);
8217        let (b0, b1) = self.split_u32x16(b);
8218        self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
8219    }
8220    #[inline(always)]
8221    fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
8222        let (a0, a1) = self.split_u32x16(a);
8223        let (b0, b1) = self.split_u32x16(b);
8224        let lo_lo = self.zip_low_u32x8(a0, b0);
8225        let lo_hi = self.zip_high_u32x8(a0, b0);
8226        let hi_lo = self.zip_low_u32x8(a1, b1);
8227        let hi_hi = self.zip_high_u32x8(a1, b1);
8228        (
8229            self.combine_u32x8(lo_lo, lo_hi),
8230            self.combine_u32x8(hi_lo, hi_hi),
8231        )
8232    }
8233    #[inline(always)]
8234    fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
8235        let (a0, a1) = self.split_u32x16(a);
8236        let (b0, b1) = self.split_u32x16(b);
8237        let lo_even = self.unzip_low_u32x8(a0, a1);
8238        let lo_odd = self.unzip_high_u32x8(a0, a1);
8239        let hi_even = self.unzip_low_u32x8(b0, b1);
8240        let hi_odd = self.unzip_high_u32x8(b0, b1);
8241        (
8242            self.combine_u32x8(lo_even, hi_even),
8243            self.combine_u32x8(lo_odd, hi_odd),
8244        )
8245    }
8246    #[inline(always)]
8247    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
8248        let (a0, a1) = self.split_mask32x16(a);
8249        let (b0, b1) = self.split_u32x16(b);
8250        let (c0, c1) = self.split_u32x16(c);
8251        self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
8252    }
8253    #[inline(always)]
8254    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8255        let (a0, a1) = self.split_u32x16(a);
8256        let (b0, b1) = self.split_u32x16(b);
8257        self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
8258    }
8259    #[inline(always)]
8260    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8261        let (a0, a1) = self.split_u32x16(a);
8262        let (b0, b1) = self.split_u32x16(b);
8263        self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
8264    }
8265    #[inline(always)]
8266    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
8267        (
8268            u32x8 {
8269                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
8270                simd: self,
8271            },
8272            u32x8 {
8273                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
8274                simd: self,
8275            },
8276        )
8277    }
8278    #[inline(always)]
8279    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
8280        unsafe {
8281            let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
8282            let v1 = _mm_loadu_si128(src.as_ptr().add(4usize) as *const _);
8283            let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 4usize) as *const _);
8284            let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 4usize) as *const _);
8285            let tmp0 = _mm_unpacklo_epi32(v0, v1);
8286            let tmp1 = _mm_unpackhi_epi32(v0, v1);
8287            let tmp2 = _mm_unpacklo_epi32(v2, v3);
8288            let tmp3 = _mm_unpackhi_epi32(v2, v3);
8289            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
8290            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
8291            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
8292            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
8293            self.combine_u32x8(
8294                self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)),
8295                self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)),
8296            )
8297        }
8298    }
8299    #[inline(always)]
8300    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
8301        let (v01, v23) = self.split_u32x16(a);
8302        let (v0, v1) = self.split_u32x8(v01);
8303        let (v2, v3) = self.split_u32x8(v23);
8304        let v0 = v0.into();
8305        let v1 = v1.into();
8306        let v2 = v2.into();
8307        let v3 = v3.into();
8308        unsafe {
8309            let tmp0 = _mm_unpacklo_epi32(v0, v1);
8310            let tmp1 = _mm_unpackhi_epi32(v0, v1);
8311            let tmp2 = _mm_unpacklo_epi32(v2, v3);
8312            let tmp3 = _mm_unpackhi_epi32(v2, v3);
8313            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
8314            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
8315            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
8316            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
8317            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
8318            _mm_storeu_si128(dest.as_mut_ptr().add(4usize) as *mut _, out1);
8319            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
8320            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
8321        }
8322    }
8323    #[inline(always)]
8324    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
8325        let (a0, a1) = self.split_u32x16(a);
8326        self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
8327    }
8328    #[inline(always)]
8329    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
8330        let (a0, a1) = self.split_u32x16(a);
8331        self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
8332    }
8333    #[inline(always)]
8334    fn splat_mask32x16(self, val: i32) -> mask32x16<Self> {
8335        let half = self.splat_mask32x8(val);
8336        self.combine_mask32x8(half, half)
8337    }
8338    #[inline(always)]
8339    fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
8340        mask32x16 {
8341            val: unsafe { core::mem::transmute_copy(&val) },
8342            simd: self,
8343        }
8344    }
8345    #[inline(always)]
8346    fn load_array_ref_mask32x16(self, val: &[i32; 16usize]) -> mask32x16<Self> {
8347        mask32x16 {
8348            val: unsafe { core::mem::transmute_copy(val) },
8349            simd: self,
8350        }
8351    }
8352    #[inline(always)]
8353    fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
8354        unsafe { core::mem::transmute::<[__m128i; 4usize], [i32; 16usize]>(a.val.0) }
8355    }
8356    #[inline(always)]
8357    fn as_array_ref_mask32x16(self, a: &mask32x16<Self>) -> &[i32; 16usize] {
8358        unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i32; 16usize]>(&a.val.0) }
8359    }
8360    #[inline(always)]
8361    fn as_array_mut_mask32x16(self, a: &mut mask32x16<Self>) -> &mut [i32; 16usize] {
8362        unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i32; 16usize]>(&mut a.val.0) }
8363    }
8364    #[inline(always)]
8365    fn store_array_mask32x16(self, a: mask32x16<Self>, dest: &mut [i32; 16usize]) -> () {
8366        unsafe {
8367            core::ptr::copy_nonoverlapping(
8368                (&raw const a.val.0) as *const i32,
8369                dest.as_mut_ptr(),
8370                16usize,
8371            );
8372        }
8373    }
8374    #[inline(always)]
8375    fn cvt_from_bytes_mask32x16(self, a: u8x64<Self>) -> mask32x16<Self> {
8376        unsafe {
8377            mask32x16 {
8378                val: core::mem::transmute(a.val),
8379                simd: self,
8380            }
8381        }
8382    }
8383    #[inline(always)]
8384    fn cvt_to_bytes_mask32x16(self, a: mask32x16<Self>) -> u8x64<Self> {
8385        unsafe {
8386            u8x64 {
8387                val: core::mem::transmute(a.val),
8388                simd: self,
8389            }
8390        }
8391    }
8392    #[inline(always)]
8393    fn slide_mask32x16<const SHIFT: usize>(
8394        self,
8395        a: mask32x16<Self>,
8396        b: mask32x16<Self>,
8397    ) -> mask32x16<Self> {
8398        unsafe {
8399            if SHIFT >= 16usize {
8400                return b;
8401            }
8402            let result = cross_block_alignr_128x4(
8403                self.cvt_to_bytes_mask32x16(b).val.0,
8404                self.cvt_to_bytes_mask32x16(a).val.0,
8405                SHIFT * 4usize,
8406            );
8407            self.cvt_from_bytes_mask32x16(u8x64 {
8408                val: crate::support::Aligned512(result),
8409                simd: self,
8410            })
8411        }
8412    }
8413    #[inline(always)]
8414    fn slide_within_blocks_mask32x16<const SHIFT: usize>(
8415        self,
8416        a: mask32x16<Self>,
8417        b: mask32x16<Self>,
8418    ) -> mask32x16<Self> {
8419        let (a0, a1) = self.split_mask32x16(a);
8420        let (b0, b1) = self.split_mask32x16(b);
8421        self.combine_mask32x8(
8422            self.slide_within_blocks_mask32x8::<SHIFT>(a0, b0),
8423            self.slide_within_blocks_mask32x8::<SHIFT>(a1, b1),
8424        )
8425    }
8426    #[inline(always)]
8427    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8428        let (a0, a1) = self.split_mask32x16(a);
8429        let (b0, b1) = self.split_mask32x16(b);
8430        self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
8431    }
8432    #[inline(always)]
8433    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8434        let (a0, a1) = self.split_mask32x16(a);
8435        let (b0, b1) = self.split_mask32x16(b);
8436        self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
8437    }
8438    #[inline(always)]
8439    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8440        let (a0, a1) = self.split_mask32x16(a);
8441        let (b0, b1) = self.split_mask32x16(b);
8442        self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
8443    }
8444    #[inline(always)]
8445    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
8446        let (a0, a1) = self.split_mask32x16(a);
8447        self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
8448    }
8449    #[inline(always)]
8450    fn select_mask32x16(
8451        self,
8452        a: mask32x16<Self>,
8453        b: mask32x16<Self>,
8454        c: mask32x16<Self>,
8455    ) -> mask32x16<Self> {
8456        let (a0, a1) = self.split_mask32x16(a);
8457        let (b0, b1) = self.split_mask32x16(b);
8458        let (c0, c1) = self.split_mask32x16(c);
8459        self.combine_mask32x8(
8460            self.select_mask32x8(a0, b0, c0),
8461            self.select_mask32x8(a1, b1, c1),
8462        )
8463    }
8464    #[inline(always)]
8465    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8466        let (a0, a1) = self.split_mask32x16(a);
8467        let (b0, b1) = self.split_mask32x16(b);
8468        self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
8469    }
8470    #[inline(always)]
8471    fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
8472        let (a0, a1) = self.split_mask32x16(a);
8473        self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1)
8474    }
8475    #[inline(always)]
8476    fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
8477        let (a0, a1) = self.split_mask32x16(a);
8478        self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1)
8479    }
8480    #[inline(always)]
8481    fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
8482        let (a0, a1) = self.split_mask32x16(a);
8483        self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1)
8484    }
8485    #[inline(always)]
8486    fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
8487        let (a0, a1) = self.split_mask32x16(a);
8488        self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1)
8489    }
8490    #[inline(always)]
8491    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
8492        (
8493            mask32x8 {
8494                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
8495                simd: self,
8496            },
8497            mask32x8 {
8498                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
8499                simd: self,
8500            },
8501        )
8502    }
8503    #[inline(always)]
8504    fn splat_f64x8(self, val: f64) -> f64x8<Self> {
8505        let half = self.splat_f64x4(val);
8506        self.combine_f64x4(half, half)
8507    }
8508    #[inline(always)]
8509    fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
8510        f64x8 {
8511            val: unsafe { core::mem::transmute_copy(&val) },
8512            simd: self,
8513        }
8514    }
8515    #[inline(always)]
8516    fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
8517        f64x8 {
8518            val: unsafe { core::mem::transmute_copy(val) },
8519            simd: self,
8520        }
8521    }
8522    #[inline(always)]
8523    fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
8524        unsafe { core::mem::transmute::<[__m128d; 4usize], [f64; 8usize]>(a.val.0) }
8525    }
8526    #[inline(always)]
8527    fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
8528        unsafe { core::mem::transmute::<&[__m128d; 4usize], &[f64; 8usize]>(&a.val.0) }
8529    }
8530    #[inline(always)]
8531    fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
8532        unsafe { core::mem::transmute::<&mut [__m128d; 4usize], &mut [f64; 8usize]>(&mut a.val.0) }
8533    }
8534    #[inline(always)]
8535    fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
8536        unsafe {
8537            core::ptr::copy_nonoverlapping(
8538                (&raw const a.val.0) as *const f64,
8539                dest.as_mut_ptr(),
8540                8usize,
8541            );
8542        }
8543    }
8544    #[inline(always)]
8545    fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
8546        unsafe {
8547            f64x8 {
8548                val: core::mem::transmute(a.val),
8549                simd: self,
8550            }
8551        }
8552    }
8553    #[inline(always)]
8554    fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
8555        unsafe {
8556            u8x64 {
8557                val: core::mem::transmute(a.val),
8558                simd: self,
8559            }
8560        }
8561    }
8562    #[inline(always)]
8563    fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8564        unsafe {
8565            if SHIFT >= 8usize {
8566                return b;
8567            }
8568            let result = cross_block_alignr_128x4(
8569                self.cvt_to_bytes_f64x8(b).val.0,
8570                self.cvt_to_bytes_f64x8(a).val.0,
8571                SHIFT * 8usize,
8572            );
8573            self.cvt_from_bytes_f64x8(u8x64 {
8574                val: crate::support::Aligned512(result),
8575                simd: self,
8576            })
8577        }
8578    }
8579    #[inline(always)]
8580    fn slide_within_blocks_f64x8<const SHIFT: usize>(
8581        self,
8582        a: f64x8<Self>,
8583        b: f64x8<Self>,
8584    ) -> f64x8<Self> {
8585        let (a0, a1) = self.split_f64x8(a);
8586        let (b0, b1) = self.split_f64x8(b);
8587        self.combine_f64x4(
8588            self.slide_within_blocks_f64x4::<SHIFT>(a0, b0),
8589            self.slide_within_blocks_f64x4::<SHIFT>(a1, b1),
8590        )
8591    }
8592    #[inline(always)]
8593    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8594        let (a0, a1) = self.split_f64x8(a);
8595        self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
8596    }
8597    #[inline(always)]
8598    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8599        let (a0, a1) = self.split_f64x8(a);
8600        self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
8601    }
8602    #[inline(always)]
8603    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8604        let (a0, a1) = self.split_f64x8(a);
8605        self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
8606    }
8607    #[inline(always)]
8608    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8609        let (a0, a1) = self.split_f64x8(a);
8610        let (b0, b1) = self.split_f64x8(b);
8611        self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
8612    }
8613    #[inline(always)]
8614    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8615        let (a0, a1) = self.split_f64x8(a);
8616        let (b0, b1) = self.split_f64x8(b);
8617        self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
8618    }
8619    #[inline(always)]
8620    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8621        let (a0, a1) = self.split_f64x8(a);
8622        let (b0, b1) = self.split_f64x8(b);
8623        self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
8624    }
8625    #[inline(always)]
8626    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8627        let (a0, a1) = self.split_f64x8(a);
8628        let (b0, b1) = self.split_f64x8(b);
8629        self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
8630    }
8631    #[inline(always)]
8632    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8633        let (a0, a1) = self.split_f64x8(a);
8634        let (b0, b1) = self.split_f64x8(b);
8635        self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
8636    }
8637    #[inline(always)]
8638    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8639        let (a0, a1) = self.split_f64x8(a);
8640        let (b0, b1) = self.split_f64x8(b);
8641        self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
8642    }
8643    #[inline(always)]
8644    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8645        let (a0, a1) = self.split_f64x8(a);
8646        let (b0, b1) = self.split_f64x8(b);
8647        self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
8648    }
8649    #[inline(always)]
8650    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8651        let (a0, a1) = self.split_f64x8(a);
8652        let (b0, b1) = self.split_f64x8(b);
8653        self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
8654    }
8655    #[inline(always)]
8656    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8657        let (a0, a1) = self.split_f64x8(a);
8658        let (b0, b1) = self.split_f64x8(b);
8659        self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
8660    }
8661    #[inline(always)]
8662    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8663        let (a0, a1) = self.split_f64x8(a);
8664        let (b0, b1) = self.split_f64x8(b);
8665        self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
8666    }
8667    #[inline(always)]
8668    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8669        let (a0, _) = self.split_f64x8(a);
8670        let (b0, _) = self.split_f64x8(b);
8671        self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
8672    }
8673    #[inline(always)]
8674    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8675        let (_, a1) = self.split_f64x8(a);
8676        let (_, b1) = self.split_f64x8(b);
8677        self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
8678    }
8679    #[inline(always)]
8680    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8681        let (a0, a1) = self.split_f64x8(a);
8682        let (b0, b1) = self.split_f64x8(b);
8683        self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
8684    }
8685    #[inline(always)]
8686    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8687        let (a0, a1) = self.split_f64x8(a);
8688        let (b0, b1) = self.split_f64x8(b);
8689        self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
8690    }
8691    #[inline(always)]
8692    fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
8693        let (a0, a1) = self.split_f64x8(a);
8694        let (b0, b1) = self.split_f64x8(b);
8695        let lo_lo = self.zip_low_f64x4(a0, b0);
8696        let lo_hi = self.zip_high_f64x4(a0, b0);
8697        let hi_lo = self.zip_low_f64x4(a1, b1);
8698        let hi_hi = self.zip_high_f64x4(a1, b1);
8699        (
8700            self.combine_f64x4(lo_lo, lo_hi),
8701            self.combine_f64x4(hi_lo, hi_hi),
8702        )
8703    }
8704    #[inline(always)]
8705    fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
8706        let (a0, a1) = self.split_f64x8(a);
8707        let (b0, b1) = self.split_f64x8(b);
8708        let lo_even = self.unzip_low_f64x4(a0, a1);
8709        let lo_odd = self.unzip_high_f64x4(a0, a1);
8710        let hi_even = self.unzip_low_f64x4(b0, b1);
8711        let hi_odd = self.unzip_high_f64x4(b0, b1);
8712        (
8713            self.combine_f64x4(lo_even, hi_even),
8714            self.combine_f64x4(lo_odd, hi_odd),
8715        )
8716    }
8717    #[inline(always)]
8718    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8719        let (a0, a1) = self.split_f64x8(a);
8720        let (b0, b1) = self.split_f64x8(b);
8721        self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
8722    }
8723    #[inline(always)]
8724    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8725        let (a0, a1) = self.split_f64x8(a);
8726        let (b0, b1) = self.split_f64x8(b);
8727        self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
8728    }
8729    #[inline(always)]
8730    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8731        let (a0, a1) = self.split_f64x8(a);
8732        let (b0, b1) = self.split_f64x8(b);
8733        self.combine_f64x4(
8734            self.max_precise_f64x4(a0, b0),
8735            self.max_precise_f64x4(a1, b1),
8736        )
8737    }
8738    #[inline(always)]
8739    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8740        let (a0, a1) = self.split_f64x8(a);
8741        let (b0, b1) = self.split_f64x8(b);
8742        self.combine_f64x4(
8743            self.min_precise_f64x4(a0, b0),
8744            self.min_precise_f64x4(a1, b1),
8745        )
8746    }
8747    #[inline(always)]
8748    fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
8749        let (a0, a1) = self.split_f64x8(a);
8750        let (b0, b1) = self.split_f64x8(b);
8751        let (c0, c1) = self.split_f64x8(c);
8752        self.combine_f64x4(
8753            self.mul_add_f64x4(a0, b0, c0),
8754            self.mul_add_f64x4(a1, b1, c1),
8755        )
8756    }
8757    #[inline(always)]
8758    fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
8759        let (a0, a1) = self.split_f64x8(a);
8760        let (b0, b1) = self.split_f64x8(b);
8761        let (c0, c1) = self.split_f64x8(c);
8762        self.combine_f64x4(
8763            self.mul_sub_f64x4(a0, b0, c0),
8764            self.mul_sub_f64x4(a1, b1, c1),
8765        )
8766    }
8767    #[inline(always)]
8768    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8769        let (a0, a1) = self.split_f64x8(a);
8770        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
8771    }
8772    #[inline(always)]
8773    fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8774        let (a0, a1) = self.split_f64x8(a);
8775        self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1))
8776    }
8777    #[inline(always)]
8778    fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8779        let (a0, a1) = self.split_f64x8(a);
8780        self.combine_f64x4(
8781            self.round_ties_even_f64x4(a0),
8782            self.round_ties_even_f64x4(a1),
8783        )
8784    }
8785    #[inline(always)]
8786    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8787        let (a0, a1) = self.split_f64x8(a);
8788        self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
8789    }
8790    #[inline(always)]
8791    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8792        let (a0, a1) = self.split_f64x8(a);
8793        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
8794    }
8795    #[inline(always)]
8796    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
8797        let (a0, a1) = self.split_mask64x8(a);
8798        let (b0, b1) = self.split_f64x8(b);
8799        let (c0, c1) = self.split_f64x8(c);
8800        self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
8801    }
8802    #[inline(always)]
8803    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
8804        (
8805            f64x4 {
8806                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
8807                simd: self,
8808            },
8809            f64x4 {
8810                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
8811                simd: self,
8812            },
8813        )
8814    }
8815    #[inline(always)]
8816    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
8817        let (a0, a1) = self.split_f64x8(a);
8818        self.combine_f32x8(
8819            self.reinterpret_f32_f64x4(a0),
8820            self.reinterpret_f32_f64x4(a1),
8821        )
8822    }
8823    #[inline(always)]
8824    fn splat_mask64x8(self, val: i64) -> mask64x8<Self> {
8825        let half = self.splat_mask64x4(val);
8826        self.combine_mask64x4(half, half)
8827    }
8828    #[inline(always)]
8829    fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
8830        mask64x8 {
8831            val: unsafe { core::mem::transmute_copy(&val) },
8832            simd: self,
8833        }
8834    }
8835    #[inline(always)]
8836    fn load_array_ref_mask64x8(self, val: &[i64; 8usize]) -> mask64x8<Self> {
8837        mask64x8 {
8838            val: unsafe { core::mem::transmute_copy(val) },
8839            simd: self,
8840        }
8841    }
8842    #[inline(always)]
8843    fn as_array_mask64x8(self, a: mask64x8<Self>) -> [i64; 8usize] {
8844        unsafe { core::mem::transmute::<[__m128i; 4usize], [i64; 8usize]>(a.val.0) }
8845    }
8846    #[inline(always)]
8847    fn as_array_ref_mask64x8(self, a: &mask64x8<Self>) -> &[i64; 8usize] {
8848        unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i64; 8usize]>(&a.val.0) }
8849    }
8850    #[inline(always)]
8851    fn as_array_mut_mask64x8(self, a: &mut mask64x8<Self>) -> &mut [i64; 8usize] {
8852        unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i64; 8usize]>(&mut a.val.0) }
8853    }
8854    #[inline(always)]
8855    fn store_array_mask64x8(self, a: mask64x8<Self>, dest: &mut [i64; 8usize]) -> () {
8856        unsafe {
8857            core::ptr::copy_nonoverlapping(
8858                (&raw const a.val.0) as *const i64,
8859                dest.as_mut_ptr(),
8860                8usize,
8861            );
8862        }
8863    }
8864    #[inline(always)]
8865    fn cvt_from_bytes_mask64x8(self, a: u8x64<Self>) -> mask64x8<Self> {
8866        unsafe {
8867            mask64x8 {
8868                val: core::mem::transmute(a.val),
8869                simd: self,
8870            }
8871        }
8872    }
8873    #[inline(always)]
8874    fn cvt_to_bytes_mask64x8(self, a: mask64x8<Self>) -> u8x64<Self> {
8875        unsafe {
8876            u8x64 {
8877                val: core::mem::transmute(a.val),
8878                simd: self,
8879            }
8880        }
8881    }
8882    #[inline(always)]
8883    fn slide_mask64x8<const SHIFT: usize>(
8884        self,
8885        a: mask64x8<Self>,
8886        b: mask64x8<Self>,
8887    ) -> mask64x8<Self> {
8888        unsafe {
8889            if SHIFT >= 8usize {
8890                return b;
8891            }
8892            let result = cross_block_alignr_128x4(
8893                self.cvt_to_bytes_mask64x8(b).val.0,
8894                self.cvt_to_bytes_mask64x8(a).val.0,
8895                SHIFT * 8usize,
8896            );
8897            self.cvt_from_bytes_mask64x8(u8x64 {
8898                val: crate::support::Aligned512(result),
8899                simd: self,
8900            })
8901        }
8902    }
8903    #[inline(always)]
8904    fn slide_within_blocks_mask64x8<const SHIFT: usize>(
8905        self,
8906        a: mask64x8<Self>,
8907        b: mask64x8<Self>,
8908    ) -> mask64x8<Self> {
8909        let (a0, a1) = self.split_mask64x8(a);
8910        let (b0, b1) = self.split_mask64x8(b);
8911        self.combine_mask64x4(
8912            self.slide_within_blocks_mask64x4::<SHIFT>(a0, b0),
8913            self.slide_within_blocks_mask64x4::<SHIFT>(a1, b1),
8914        )
8915    }
8916    #[inline(always)]
8917    fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8918        let (a0, a1) = self.split_mask64x8(a);
8919        let (b0, b1) = self.split_mask64x8(b);
8920        self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1))
8921    }
8922    #[inline(always)]
8923    fn or_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8924        let (a0, a1) = self.split_mask64x8(a);
8925        let (b0, b1) = self.split_mask64x8(b);
8926        self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, b1))
8927    }
8928    #[inline(always)]
8929    fn xor_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8930        let (a0, a1) = self.split_mask64x8(a);
8931        let (b0, b1) = self.split_mask64x8(b);
8932        self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, b1))
8933    }
8934    #[inline(always)]
8935    fn not_mask64x8(self, a: mask64x8<Self>) -> mask64x8<Self> {
8936        let (a0, a1) = self.split_mask64x8(a);
8937        self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1))
8938    }
8939    #[inline(always)]
8940    fn select_mask64x8(
8941        self,
8942        a: mask64x8<Self>,
8943        b: mask64x8<Self>,
8944        c: mask64x8<Self>,
8945    ) -> mask64x8<Self> {
8946        let (a0, a1) = self.split_mask64x8(a);
8947        let (b0, b1) = self.split_mask64x8(b);
8948        let (c0, c1) = self.split_mask64x8(c);
8949        self.combine_mask64x4(
8950            self.select_mask64x4(a0, b0, c0),
8951            self.select_mask64x4(a1, b1, c1),
8952        )
8953    }
8954    #[inline(always)]
8955    fn simd_eq_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8956        let (a0, a1) = self.split_mask64x8(a);
8957        let (b0, b1) = self.split_mask64x8(b);
8958        self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1))
8959    }
8960    #[inline(always)]
8961    fn any_true_mask64x8(self, a: mask64x8<Self>) -> bool {
8962        let (a0, a1) = self.split_mask64x8(a);
8963        self.any_true_mask64x4(a0) || self.any_true_mask64x4(a1)
8964    }
8965    #[inline(always)]
8966    fn all_true_mask64x8(self, a: mask64x8<Self>) -> bool {
8967        let (a0, a1) = self.split_mask64x8(a);
8968        self.all_true_mask64x4(a0) && self.all_true_mask64x4(a1)
8969    }
8970    #[inline(always)]
8971    fn any_false_mask64x8(self, a: mask64x8<Self>) -> bool {
8972        let (a0, a1) = self.split_mask64x8(a);
8973        self.any_false_mask64x4(a0) || self.any_false_mask64x4(a1)
8974    }
8975    #[inline(always)]
8976    fn all_false_mask64x8(self, a: mask64x8<Self>) -> bool {
8977        let (a0, a1) = self.split_mask64x8(a);
8978        self.all_false_mask64x4(a0) && self.all_false_mask64x4(a1)
8979    }
8980    #[inline(always)]
8981    fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) {
8982        (
8983            mask64x4 {
8984                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
8985                simd: self,
8986            },
8987            mask64x4 {
8988                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
8989                simd: self,
8990            },
8991        )
8992    }
8993}
8994impl<S: Simd> SimdFrom<__m128, S> for f32x4<S> {
8995    #[inline(always)]
8996    fn simd_from(simd: S, arch: __m128) -> Self {
8997        Self {
8998            val: unsafe { core::mem::transmute_copy(&arch) },
8999            simd,
9000        }
9001    }
9002}
9003impl<S: Simd> From<f32x4<S>> for __m128 {
9004    #[inline(always)]
9005    fn from(value: f32x4<S>) -> Self {
9006        unsafe { core::mem::transmute_copy(&value.val) }
9007    }
9008}
9009impl<S: Simd> SimdFrom<__m128i, S> for i8x16<S> {
9010    #[inline(always)]
9011    fn simd_from(simd: S, arch: __m128i) -> Self {
9012        Self {
9013            val: unsafe { core::mem::transmute_copy(&arch) },
9014            simd,
9015        }
9016    }
9017}
9018impl<S: Simd> From<i8x16<S>> for __m128i {
9019    #[inline(always)]
9020    fn from(value: i8x16<S>) -> Self {
9021        unsafe { core::mem::transmute_copy(&value.val) }
9022    }
9023}
9024impl<S: Simd> SimdFrom<__m128i, S> for u8x16<S> {
9025    #[inline(always)]
9026    fn simd_from(simd: S, arch: __m128i) -> Self {
9027        Self {
9028            val: unsafe { core::mem::transmute_copy(&arch) },
9029            simd,
9030        }
9031    }
9032}
9033impl<S: Simd> From<u8x16<S>> for __m128i {
9034    #[inline(always)]
9035    fn from(value: u8x16<S>) -> Self {
9036        unsafe { core::mem::transmute_copy(&value.val) }
9037    }
9038}
9039impl<S: Simd> SimdFrom<__m128i, S> for mask8x16<S> {
9040    #[inline(always)]
9041    fn simd_from(simd: S, arch: __m128i) -> Self {
9042        Self {
9043            val: unsafe { core::mem::transmute_copy(&arch) },
9044            simd,
9045        }
9046    }
9047}
9048impl<S: Simd> From<mask8x16<S>> for __m128i {
9049    #[inline(always)]
9050    fn from(value: mask8x16<S>) -> Self {
9051        unsafe { core::mem::transmute_copy(&value.val) }
9052    }
9053}
9054impl<S: Simd> SimdFrom<__m128i, S> for i16x8<S> {
9055    #[inline(always)]
9056    fn simd_from(simd: S, arch: __m128i) -> Self {
9057        Self {
9058            val: unsafe { core::mem::transmute_copy(&arch) },
9059            simd,
9060        }
9061    }
9062}
9063impl<S: Simd> From<i16x8<S>> for __m128i {
9064    #[inline(always)]
9065    fn from(value: i16x8<S>) -> Self {
9066        unsafe { core::mem::transmute_copy(&value.val) }
9067    }
9068}
9069impl<S: Simd> SimdFrom<__m128i, S> for u16x8<S> {
9070    #[inline(always)]
9071    fn simd_from(simd: S, arch: __m128i) -> Self {
9072        Self {
9073            val: unsafe { core::mem::transmute_copy(&arch) },
9074            simd,
9075        }
9076    }
9077}
9078impl<S: Simd> From<u16x8<S>> for __m128i {
9079    #[inline(always)]
9080    fn from(value: u16x8<S>) -> Self {
9081        unsafe { core::mem::transmute_copy(&value.val) }
9082    }
9083}
9084impl<S: Simd> SimdFrom<__m128i, S> for mask16x8<S> {
9085    #[inline(always)]
9086    fn simd_from(simd: S, arch: __m128i) -> Self {
9087        Self {
9088            val: unsafe { core::mem::transmute_copy(&arch) },
9089            simd,
9090        }
9091    }
9092}
9093impl<S: Simd> From<mask16x8<S>> for __m128i {
9094    #[inline(always)]
9095    fn from(value: mask16x8<S>) -> Self {
9096        unsafe { core::mem::transmute_copy(&value.val) }
9097    }
9098}
9099impl<S: Simd> SimdFrom<__m128i, S> for i32x4<S> {
9100    #[inline(always)]
9101    fn simd_from(simd: S, arch: __m128i) -> Self {
9102        Self {
9103            val: unsafe { core::mem::transmute_copy(&arch) },
9104            simd,
9105        }
9106    }
9107}
9108impl<S: Simd> From<i32x4<S>> for __m128i {
9109    #[inline(always)]
9110    fn from(value: i32x4<S>) -> Self {
9111        unsafe { core::mem::transmute_copy(&value.val) }
9112    }
9113}
9114impl<S: Simd> SimdFrom<__m128i, S> for u32x4<S> {
9115    #[inline(always)]
9116    fn simd_from(simd: S, arch: __m128i) -> Self {
9117        Self {
9118            val: unsafe { core::mem::transmute_copy(&arch) },
9119            simd,
9120        }
9121    }
9122}
9123impl<S: Simd> From<u32x4<S>> for __m128i {
9124    #[inline(always)]
9125    fn from(value: u32x4<S>) -> Self {
9126        unsafe { core::mem::transmute_copy(&value.val) }
9127    }
9128}
9129impl<S: Simd> SimdFrom<__m128i, S> for mask32x4<S> {
9130    #[inline(always)]
9131    fn simd_from(simd: S, arch: __m128i) -> Self {
9132        Self {
9133            val: unsafe { core::mem::transmute_copy(&arch) },
9134            simd,
9135        }
9136    }
9137}
9138impl<S: Simd> From<mask32x4<S>> for __m128i {
9139    #[inline(always)]
9140    fn from(value: mask32x4<S>) -> Self {
9141        unsafe { core::mem::transmute_copy(&value.val) }
9142    }
9143}
9144impl<S: Simd> SimdFrom<__m128d, S> for f64x2<S> {
9145    #[inline(always)]
9146    fn simd_from(simd: S, arch: __m128d) -> Self {
9147        Self {
9148            val: unsafe { core::mem::transmute_copy(&arch) },
9149            simd,
9150        }
9151    }
9152}
9153impl<S: Simd> From<f64x2<S>> for __m128d {
9154    #[inline(always)]
9155    fn from(value: f64x2<S>) -> Self {
9156        unsafe { core::mem::transmute_copy(&value.val) }
9157    }
9158}
9159impl<S: Simd> SimdFrom<__m128i, S> for mask64x2<S> {
9160    #[inline(always)]
9161    fn simd_from(simd: S, arch: __m128i) -> Self {
9162        Self {
9163            val: unsafe { core::mem::transmute_copy(&arch) },
9164            simd,
9165        }
9166    }
9167}
9168impl<S: Simd> From<mask64x2<S>> for __m128i {
9169    #[inline(always)]
9170    fn from(value: mask64x2<S>) -> Self {
9171        unsafe { core::mem::transmute_copy(&value.val) }
9172    }
9173}
#[doc = r" Variable-shift front end for `_mm_alignr_epi8`, whose byte count is otherwise a const generic."]
#[doc = r""]
#[doc = r" Every `shift` in `0..=15` is routed to the matching const instantiation. Call sites are expected to"]
#[doc = r" pass a value that is constant in practice, so the match should be optimized out after inlining."]
#[doc = r" The helper exists because Rust does not currently allow arithmetic on const generics."]
#[doc = r""]
#[doc = r" # Panics"]
#[doc = r""]
#[doc = r" Reaches `unreachable!()` when `shift` is 16 or larger."]
#[doc = r""]
#[doc = r" # Safety"]
#[doc = r""]
#[doc = r" The caller must ensure the CPU supports the target feature required by `_mm_alignr_epi8` (SSSE3)."]
#[inline(always)]
unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i {
    // SAFETY: the only unsafe operation is calling the intrinsic; its CPU-feature requirement
    // is forwarded to this function's caller.
    unsafe {
        match shift {
            0 => _mm_alignr_epi8::<0>(a, b),
            1 => _mm_alignr_epi8::<1>(a, b),
            2 => _mm_alignr_epi8::<2>(a, b),
            3 => _mm_alignr_epi8::<3>(a, b),
            4 => _mm_alignr_epi8::<4>(a, b),
            5 => _mm_alignr_epi8::<5>(a, b),
            6 => _mm_alignr_epi8::<6>(a, b),
            7 => _mm_alignr_epi8::<7>(a, b),
            8 => _mm_alignr_epi8::<8>(a, b),
            9 => _mm_alignr_epi8::<9>(a, b),
            10 => _mm_alignr_epi8::<10>(a, b),
            11 => _mm_alignr_epi8::<11>(a, b),
            12 => _mm_alignr_epi8::<12>(a, b),
            13 => _mm_alignr_epi8::<13>(a, b),
            14 => _mm_alignr_epi8::<14>(a, b),
            15 => _mm_alignr_epi8::<15>(a, b),
            _ => unreachable!(),
        }
    }
}
9201#[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."]
9202#[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."]
9203#[inline(always)]
9204unsafe fn cross_block_alignr_128x2(
9205    a: [__m128i; 2usize],
9206    b: [__m128i; 2usize],
9207    shift_bytes: usize,
9208) -> [__m128i; 2usize] {
9209    [
9210        {
9211            let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes);
9212            unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
9213        },
9214        {
9215            let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes);
9216            unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
9217        },
9218    ]
9219}
9220#[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."]
9221#[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."]
9222#[inline(always)]
9223unsafe fn cross_block_alignr_128x4(
9224    a: [__m128i; 4usize],
9225    b: [__m128i; 4usize],
9226    shift_bytes: usize,
9227) -> [__m128i; 4usize] {
9228    [
9229        {
9230            let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes);
9231            unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
9232        },
9233        {
9234            let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes);
9235            unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
9236        },
9237        {
9238            let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 2usize, shift_bytes);
9239            unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
9240        },
9241        {
9242            let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 3usize, shift_bytes);
9243            unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
9244        },
9245    ]
9246}