fearless_simd/generated/
sse4_2.rs

1// Copyright 2025 the Fearless_SIMD Authors
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4// This file is autogenerated by fearless_simd_gen
5
6#![expect(
7    unused_variables,
8    clippy::todo,
9    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
10)]
11use crate::{Level, Simd, SimdFrom, SimdInto, seal::Seal};
12use crate::{
13    f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
14    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
15    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
16    u32x4, u32x8, u32x16,
17};
18#[cfg(target_arch = "x86")]
19use core::arch::x86::*;
20#[cfg(target_arch = "x86_64")]
21use core::arch::x86_64::*;
22use core::ops::*;
23#[doc = r#" The SIMD token for the "SSE 4.2" level."#]
24#[derive(Clone, Copy, Debug)]
25pub struct Sse4_2 {
26    pub sse4_2: crate::core_arch::x86::Sse4_2,
27}
28impl Sse4_2 {
29    #[doc = r" Create a SIMD token."]
30    #[doc = r""]
31    #[doc = r" # Safety"]
32    #[doc = r""]
33    #[doc = r" The SSE4.2 CPU feature must be available."]
34    #[inline]
35    pub unsafe fn new_unchecked() -> Self {
36        Sse4_2 {
37            sse4_2: unsafe { crate::core_arch::x86::Sse4_2::new_unchecked() },
38        }
39    }
40}
41impl Seal for Sse4_2 {}
42impl Simd for Sse4_2 {
43    type f32s = f32x4<Self>;
44    type u8s = u8x16<Self>;
45    type i8s = i8x16<Self>;
46    type u16s = u16x8<Self>;
47    type i16s = i16x8<Self>;
48    type u32s = u32x4<Self>;
49    type i32s = i32x4<Self>;
50    type mask8s = mask8x16<Self>;
51    type mask16s = mask16x8<Self>;
52    type mask32s = mask32x4<Self>;
53    #[inline(always)]
54    fn level(self) -> Level {
55        Level::Sse4_2(self)
56    }
57    #[inline]
58    fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R {
59        #[target_feature(enable = "sse4.2")]
60        #[inline]
61        unsafe fn vectorize_sse4_2<F: FnOnce() -> R, R>(f: F) -> R {
62            f()
63        }
64        unsafe { vectorize_sse4_2(f) }
65    }
66    #[inline(always)]
67    fn splat_f32x4(self, val: f32) -> f32x4<Self> {
68        unsafe { _mm_set1_ps(val).simd_into(self) }
69    }
70    #[inline(always)]
71    fn abs_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
72        unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) }
73    }
74    #[inline(always)]
75    fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
76        unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) }
77    }
78    #[inline(always)]
79    fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
80        unsafe { _mm_sqrt_ps(a.into()).simd_into(self) }
81    }
82    #[inline(always)]
83    fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
84        unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) }
85    }
86    #[inline(always)]
87    fn sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
88        unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) }
89    }
90    #[inline(always)]
91    fn mul_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
92        unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) }
93    }
94    #[inline(always)]
95    fn div_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
96        unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) }
97    }
98    #[inline(always)]
99    fn copysign_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
100        unsafe {
101            let mask = _mm_set1_ps(-0.0);
102            _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self)
103        }
104    }
105    #[inline(always)]
106    fn simd_eq_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
107        unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) }
108    }
109    #[inline(always)]
110    fn simd_lt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
111        unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) }
112    }
113    #[inline(always)]
114    fn simd_le_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
115        unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) }
116    }
117    #[inline(always)]
118    fn simd_ge_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
119        unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) }
120    }
121    #[inline(always)]
122    fn simd_gt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
123        unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) }
124    }
125    #[inline(always)]
126    fn zip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
127        unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) }
128    }
129    #[inline(always)]
130    fn zip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
131        unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) }
132    }
133    #[inline(always)]
134    fn unzip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
135        unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) }
136    }
137    #[inline(always)]
138    fn unzip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
139        unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) }
140    }
141    #[inline(always)]
142    fn max_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
143        unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
144    }
145    #[inline(always)]
146    fn max_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
147        unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
148    }
149    #[inline(always)]
150    fn min_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
151        unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
152    }
153    #[inline(always)]
154    fn min_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
155        unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
156    }
157    #[inline(always)]
158    fn madd_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
159        a + b * c
160    }
161    #[inline(always)]
162    fn msub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
163        a - b * c
164    }
165    #[inline(always)]
166    fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
167        unsafe { _mm_floor_ps(a.into()).simd_into(self) }
168    }
169    #[inline(always)]
170    fn fract_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
171        a - a.trunc()
172    }
173    #[inline(always)]
174    fn trunc_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
175        unsafe { _mm_round_ps(a.into(), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC).simd_into(self) }
176    }
177    #[inline(always)]
178    fn select_f32x4(self, a: mask32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
179        unsafe {
180            let mask = _mm_castsi128_ps(a.into());
181            _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, c.into())).simd_into(self)
182        }
183    }
184    #[inline(always)]
185    fn combine_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x8<Self> {
186        let mut result = [0.0; 8usize];
187        result[0..4usize].copy_from_slice(&a.val);
188        result[4usize..8usize].copy_from_slice(&b.val);
189        result.simd_into(self)
190    }
191    #[inline(always)]
192    fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> {
193        f64x2 {
194            val: bytemuck::cast(a.val),
195            simd: a.simd,
196        }
197    }
198    #[inline(always)]
199    fn reinterpret_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
200        i32x4 {
201            val: bytemuck::cast(a.val),
202            simd: a.simd,
203        }
204    }
205    #[inline(always)]
206    fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
207        u8x16 {
208            val: bytemuck::cast(a.val),
209            simd: a.simd,
210        }
211    }
212    #[inline(always)]
213    fn reinterpret_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
214        u32x4 {
215            val: bytemuck::cast(a.val),
216            simd: a.simd,
217        }
218    }
219    #[inline(always)]
220    fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
221        unsafe {
222            _mm_cvtps_epi32(_mm_max_ps(_mm_floor_ps(a.into()), _mm_set1_ps(0.0))).simd_into(self)
223        }
224    }
225    #[inline(always)]
226    fn cvt_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
227        unsafe { _mm_cvtps_epi32(a.trunc().into()).simd_into(self) }
228    }
229    #[inline(always)]
230    fn splat_i8x16(self, val: i8) -> i8x16<Self> {
231        unsafe { _mm_set1_epi8(val).simd_into(self) }
232    }
233    #[inline(always)]
234    fn not_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
235        a ^ !0
236    }
237    #[inline(always)]
238    fn add_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
239        unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
240    }
241    #[inline(always)]
242    fn sub_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
243        unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
244    }
245    #[inline(always)]
246    fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
247        todo!()
248    }
249    #[inline(always)]
250    fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
251        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
252    }
253    #[inline(always)]
254    fn or_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
255        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
256    }
257    #[inline(always)]
258    fn xor_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
259        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
260    }
261    #[inline(always)]
262    fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
263        unsafe {
264            let val = a.into();
265            let shift_count = _mm_cvtsi32_si128(shift as i32);
266            let lo_16 = _mm_unpacklo_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128()));
267            let hi_16 = _mm_unpackhi_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128()));
268            let lo_shifted = _mm_sra_epi16(lo_16, shift_count);
269            let hi_shifted = _mm_sra_epi16(hi_16, shift_count);
270            _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
271        }
272    }
273    #[inline(always)]
274    fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
275        unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
276    }
277    #[inline(always)]
278    fn simd_lt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
279        unsafe { _mm_cmplt_epi8(a.into(), b.into()).simd_into(self) }
280    }
281    #[inline(always)]
282    fn simd_le_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
283        unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) }
284    }
285    #[inline(always)]
286    fn simd_ge_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
287        unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) }
288    }
289    #[inline(always)]
290    fn simd_gt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
291        unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) }
292    }
293    #[inline(always)]
294    fn zip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
295        unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
296    }
297    #[inline(always)]
298    fn zip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
299        unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
300    }
301    #[inline(always)]
302    fn unzip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
303        unsafe {
304            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14);
305            let t1 = _mm_shuffle_epi8(a.into(), mask);
306            let t2 = _mm_shuffle_epi8(b.into(), mask);
307            _mm_unpacklo_epi64(t1, t2).simd_into(self)
308        }
309    }
310    #[inline(always)]
311    fn unzip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
312        unsafe {
313            let mask = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15);
314            let t1 = _mm_shuffle_epi8(a.into(), mask);
315            let t2 = _mm_shuffle_epi8(b.into(), mask);
316            _mm_unpacklo_epi64(t1, t2).simd_into(self)
317        }
318    }
319    #[inline(always)]
320    fn select_i8x16(self, a: mask8x16<Self>, b: i8x16<Self>, c: i8x16<Self>) -> i8x16<Self> {
321        unsafe {
322            _mm_or_si128(
323                _mm_and_si128(a.into(), b.into()),
324                _mm_andnot_si128(a.into(), c.into()),
325            )
326            .simd_into(self)
327        }
328    }
329    #[inline(always)]
330    fn min_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
331        unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) }
332    }
333    #[inline(always)]
334    fn max_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
335        unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) }
336    }
337    #[inline(always)]
338    fn combine_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x32<Self> {
339        let mut result = [0; 32usize];
340        result[0..16usize].copy_from_slice(&a.val);
341        result[16usize..32usize].copy_from_slice(&b.val);
342        result.simd_into(self)
343    }
344    #[inline(always)]
345    fn reinterpret_u8_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
346        u8x16 {
347            val: bytemuck::cast(a.val),
348            simd: a.simd,
349        }
350    }
351    #[inline(always)]
352    fn reinterpret_u32_i8x16(self, a: i8x16<Self>) -> u32x4<Self> {
353        u32x4 {
354            val: bytemuck::cast(a.val),
355            simd: a.simd,
356        }
357    }
358    #[inline(always)]
359    fn splat_u8x16(self, val: u8) -> u8x16<Self> {
360        unsafe { _mm_set1_epi8(val as _).simd_into(self) }
361    }
362    #[inline(always)]
363    fn not_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
364        a ^ !0
365    }
366    #[inline(always)]
367    fn add_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
368        unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
369    }
370    #[inline(always)]
371    fn sub_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
372        unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
373    }
374    #[inline(always)]
375    fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
376        todo!()
377    }
378    #[inline(always)]
379    fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
380        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
381    }
382    #[inline(always)]
383    fn or_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
384        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
385    }
386    #[inline(always)]
387    fn xor_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
388        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
389    }
390    #[inline(always)]
391    fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
392        unsafe {
393            let val = a.into();
394            let shift_count = _mm_cvtsi32_si128(shift as i32);
395            let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
396            let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
397            let lo_shifted = _mm_srl_epi16(lo_16, shift_count);
398            let hi_shifted = _mm_srl_epi16(hi_16, shift_count);
399            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
400        }
401    }
402    #[inline(always)]
403    fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
404        unsafe {
405            let sign_bit = _mm_set1_epi8(0x80u8 as _);
406            let a_signed = _mm_xor_si128(a.into(), sign_bit);
407            let b_signed = _mm_xor_si128(b.into(), sign_bit);
408            _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self)
409        }
410    }
411    #[inline(always)]
412    fn simd_lt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
413        unsafe {
414            let sign_bit = _mm_set1_epi8(0x80u8 as _);
415            let a_signed = _mm_xor_si128(a.into(), sign_bit);
416            let b_signed = _mm_xor_si128(b.into(), sign_bit);
417            _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self)
418        }
419    }
420    #[inline(always)]
421    fn simd_le_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
422        unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) }
423    }
424    #[inline(always)]
425    fn simd_ge_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
426        unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) }
427    }
428    #[inline(always)]
429    fn simd_gt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
430        unsafe {
431            let sign_bit = _mm_set1_epi8(0x80u8 as _);
432            let a_signed = _mm_xor_si128(a.into(), sign_bit);
433            let b_signed = _mm_xor_si128(b.into(), sign_bit);
434            _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self)
435        }
436    }
437    #[inline(always)]
438    fn zip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
439        unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
440    }
441    #[inline(always)]
442    fn zip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
443        unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
444    }
445    #[inline(always)]
446    fn unzip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
447        unsafe {
448            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14);
449            let t1 = _mm_shuffle_epi8(a.into(), mask);
450            let t2 = _mm_shuffle_epi8(b.into(), mask);
451            _mm_unpacklo_epi64(t1, t2).simd_into(self)
452        }
453    }
454    #[inline(always)]
455    fn unzip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
456        unsafe {
457            let mask = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15);
458            let t1 = _mm_shuffle_epi8(a.into(), mask);
459            let t2 = _mm_shuffle_epi8(b.into(), mask);
460            _mm_unpacklo_epi64(t1, t2).simd_into(self)
461        }
462    }
463    #[inline(always)]
464    fn select_u8x16(self, a: mask8x16<Self>, b: u8x16<Self>, c: u8x16<Self>) -> u8x16<Self> {
465        unsafe {
466            _mm_or_si128(
467                _mm_and_si128(a.into(), b.into()),
468                _mm_andnot_si128(a.into(), c.into()),
469            )
470            .simd_into(self)
471        }
472    }
473    #[inline(always)]
474    fn min_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
475        unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) }
476    }
477    #[inline(always)]
478    fn max_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
479        unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) }
480    }
481    #[inline(always)]
482    fn combine_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x32<Self> {
483        let mut result = [0; 32usize];
484        result[0..16usize].copy_from_slice(&a.val);
485        result[16usize..32usize].copy_from_slice(&b.val);
486        result.simd_into(self)
487    }
488    #[inline(always)]
489    fn widen_u8x16(self, a: u8x16<Self>) -> u16x16<Self> {
490        unsafe {
491            let raw = a.into();
492            let high = _mm_cvtepu8_epi16(raw).simd_into(self);
493            let low = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(raw)).simd_into(self);
494            self.combine_u16x8(high, low)
495        }
496    }
497    #[inline(always)]
498    fn reinterpret_u32_u8x16(self, a: u8x16<Self>) -> u32x4<Self> {
499        u32x4 {
500            val: bytemuck::cast(a.val),
501            simd: a.simd,
502        }
503    }
504    #[inline(always)]
505    fn splat_mask8x16(self, val: i8) -> mask8x16<Self> {
506        unsafe { _mm_set1_epi8(val).simd_into(self) }
507    }
508    #[inline(always)]
509    fn not_mask8x16(self, a: mask8x16<Self>) -> mask8x16<Self> {
510        a ^ !0
511    }
512    #[inline(always)]
513    fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
514        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
515    }
516    #[inline(always)]
517    fn or_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
518        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
519    }
520    #[inline(always)]
521    fn xor_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
522        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
523    }
524    #[inline(always)]
525    fn select_mask8x16(
526        self,
527        a: mask8x16<Self>,
528        b: mask8x16<Self>,
529        c: mask8x16<Self>,
530    ) -> mask8x16<Self> {
531        unsafe {
532            _mm_or_si128(
533                _mm_and_si128(a.into(), b.into()),
534                _mm_andnot_si128(a.into(), c.into()),
535            )
536            .simd_into(self)
537        }
538    }
539    #[inline(always)]
540    fn simd_eq_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
541        unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
542    }
543    #[inline(always)]
544    fn combine_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x32<Self> {
545        let mut result = [0; 32usize];
546        result[0..16usize].copy_from_slice(&a.val);
547        result[16usize..32usize].copy_from_slice(&b.val);
548        result.simd_into(self)
549    }
550    #[inline(always)]
551    fn splat_i16x8(self, val: i16) -> i16x8<Self> {
552        unsafe { _mm_set1_epi16(val).simd_into(self) }
553    }
554    #[inline(always)]
555    fn not_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
556        a ^ !0
557    }
558    #[inline(always)]
559    fn add_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
560        unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
561    }
562    #[inline(always)]
563    fn sub_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
564        unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
565    }
566    #[inline(always)]
567    fn mul_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
568        unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
569    }
570    #[inline(always)]
571    fn and_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
572        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
573    }
574    #[inline(always)]
575    fn or_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
576        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
577    }
578    #[inline(always)]
579    fn xor_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
580        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
581    }
582    #[inline(always)]
583    fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
584        unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
585    }
586    #[inline(always)]
587    fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
588        unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
589    }
590    #[inline(always)]
591    fn simd_lt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
592        unsafe { _mm_cmplt_epi16(a.into(), b.into()).simd_into(self) }
593    }
594    #[inline(always)]
595    fn simd_le_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
596        unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) }
597    }
598    #[inline(always)]
599    fn simd_ge_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
600        unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) }
601    }
602    #[inline(always)]
603    fn simd_gt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
604        unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) }
605    }
606    #[inline(always)]
607    fn zip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
608        unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
609    }
610    #[inline(always)]
611    fn zip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
612        unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
613    }
614    #[inline(always)]
615    fn unzip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
616        unsafe {
617            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13);
618            let t1 = _mm_shuffle_epi8(a.into(), mask);
619            let t2 = _mm_shuffle_epi8(b.into(), mask);
620            _mm_unpacklo_epi64(t1, t2).simd_into(self)
621        }
622    }
623    #[inline(always)]
624    fn unzip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
625        unsafe {
626            let mask = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
627            let t1 = _mm_shuffle_epi8(a.into(), mask);
628            let t2 = _mm_shuffle_epi8(b.into(), mask);
629            _mm_unpacklo_epi64(t1, t2).simd_into(self)
630        }
631    }
632    #[inline(always)]
633    fn select_i16x8(self, a: mask16x8<Self>, b: i16x8<Self>, c: i16x8<Self>) -> i16x8<Self> {
634        unsafe {
635            _mm_or_si128(
636                _mm_and_si128(a.into(), b.into()),
637                _mm_andnot_si128(a.into(), c.into()),
638            )
639            .simd_into(self)
640        }
641    }
642    #[inline(always)]
643    fn min_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
644        unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) }
645    }
646    #[inline(always)]
647    fn max_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
648        unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) }
649    }
650    #[inline(always)]
651    fn combine_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x16<Self> {
652        let mut result = [0; 16usize];
653        result[0..8usize].copy_from_slice(&a.val);
654        result[8usize..16usize].copy_from_slice(&b.val);
655        result.simd_into(self)
656    }
657    #[inline(always)]
658    fn reinterpret_u8_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
659        u8x16 {
660            val: bytemuck::cast(a.val),
661            simd: a.simd,
662        }
663    }
664    #[inline(always)]
665    fn reinterpret_u32_i16x8(self, a: i16x8<Self>) -> u32x4<Self> {
666        u32x4 {
667            val: bytemuck::cast(a.val),
668            simd: a.simd,
669        }
670    }
671    #[inline(always)]
672    fn splat_u16x8(self, val: u16) -> u16x8<Self> {
673        unsafe { _mm_set1_epi16(val as _).simd_into(self) }
674    }
675    #[inline(always)]
676    fn not_u16x8(self, a: u16x8<Self>) -> u16x8<Self> {
677        a ^ !0
678    }
679    #[inline(always)]
680    fn add_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
681        unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
682    }
683    #[inline(always)]
684    fn sub_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
685        unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
686    }
687    #[inline(always)]
688    fn mul_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
689        unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
690    }
691    #[inline(always)]
692    fn and_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
693        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
694    }
695    #[inline(always)]
696    fn or_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
697        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
698    }
699    #[inline(always)]
700    fn xor_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
701        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
702    }
703    #[inline(always)]
704    fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
705        unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
706    }
707    #[inline(always)]
708    fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
709        unsafe {
710            let sign_bit = _mm_set1_epi16(0x8000u16 as _);
711            let a_signed = _mm_xor_si128(a.into(), sign_bit);
712            let b_signed = _mm_xor_si128(b.into(), sign_bit);
713            _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self)
714        }
715    }
716    #[inline(always)]
717    fn simd_lt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
718        unsafe {
719            let sign_bit = _mm_set1_epi16(0x8000u16 as _);
720            let a_signed = _mm_xor_si128(a.into(), sign_bit);
721            let b_signed = _mm_xor_si128(b.into(), sign_bit);
722            _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self)
723        }
724    }
725    #[inline(always)]
726    fn simd_le_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
727        unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) }
728    }
729    #[inline(always)]
730    fn simd_ge_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
731        unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) }
732    }
733    #[inline(always)]
734    fn simd_gt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
735        unsafe {
736            let sign_bit = _mm_set1_epi16(0x8000u16 as _);
737            let a_signed = _mm_xor_si128(a.into(), sign_bit);
738            let b_signed = _mm_xor_si128(b.into(), sign_bit);
739            _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self)
740        }
741    }
742    #[inline(always)]
743    fn zip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
744        unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
745    }
746    #[inline(always)]
747    fn zip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
748        unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
749    }
750    #[inline(always)]
751    fn unzip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
752        unsafe {
753            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13);
754            let t1 = _mm_shuffle_epi8(a.into(), mask);
755            let t2 = _mm_shuffle_epi8(b.into(), mask);
756            _mm_unpacklo_epi64(t1, t2).simd_into(self)
757        }
758    }
759    #[inline(always)]
760    fn unzip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
761        unsafe {
762            let mask = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
763            let t1 = _mm_shuffle_epi8(a.into(), mask);
764            let t2 = _mm_shuffle_epi8(b.into(), mask);
765            _mm_unpacklo_epi64(t1, t2).simd_into(self)
766        }
767    }
768    #[inline(always)]
769    fn select_u16x8(self, a: mask16x8<Self>, b: u16x8<Self>, c: u16x8<Self>) -> u16x8<Self> {
770        unsafe {
771            _mm_or_si128(
772                _mm_and_si128(a.into(), b.into()),
773                _mm_andnot_si128(a.into(), c.into()),
774            )
775            .simd_into(self)
776        }
777    }
778    #[inline(always)]
779    fn min_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
780        unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) }
781    }
782    #[inline(always)]
783    fn max_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
784        unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) }
785    }
786    #[inline(always)]
787    fn combine_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x16<Self> {
788        let mut result = [0; 16usize];
789        result[0..8usize].copy_from_slice(&a.val);
790        result[8usize..16usize].copy_from_slice(&b.val);
791        result.simd_into(self)
792    }
793    #[inline(always)]
794    fn reinterpret_u8_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
795        u8x16 {
796            val: bytemuck::cast(a.val),
797            simd: a.simd,
798        }
799    }
800    #[inline(always)]
801    fn reinterpret_u32_u16x8(self, a: u16x8<Self>) -> u32x4<Self> {
802        u32x4 {
803            val: bytemuck::cast(a.val),
804            simd: a.simd,
805        }
806    }
807    #[inline(always)]
808    fn splat_mask16x8(self, val: i16) -> mask16x8<Self> {
809        unsafe { _mm_set1_epi16(val).simd_into(self) }
810    }
811    #[inline(always)]
812    fn not_mask16x8(self, a: mask16x8<Self>) -> mask16x8<Self> {
813        a ^ !0
814    }
815    #[inline(always)]
816    fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
817        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
818    }
819    #[inline(always)]
820    fn or_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
821        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
822    }
823    #[inline(always)]
824    fn xor_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
825        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
826    }
827    #[inline(always)]
828    fn select_mask16x8(
829        self,
830        a: mask16x8<Self>,
831        b: mask16x8<Self>,
832        c: mask16x8<Self>,
833    ) -> mask16x8<Self> {
834        unsafe {
835            _mm_or_si128(
836                _mm_and_si128(a.into(), b.into()),
837                _mm_andnot_si128(a.into(), c.into()),
838            )
839            .simd_into(self)
840        }
841    }
842    #[inline(always)]
843    fn simd_eq_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
844        unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
845    }
846    #[inline(always)]
847    fn combine_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x16<Self> {
848        let mut result = [0; 16usize];
849        result[0..8usize].copy_from_slice(&a.val);
850        result[8usize..16usize].copy_from_slice(&b.val);
851        result.simd_into(self)
852    }
853    #[inline(always)]
854    fn splat_i32x4(self, val: i32) -> i32x4<Self> {
855        unsafe { _mm_set1_epi32(val).simd_into(self) }
856    }
857    #[inline(always)]
858    fn not_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
859        a ^ !0
860    }
861    #[inline(always)]
862    fn add_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
863        unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
864    }
865    #[inline(always)]
866    fn sub_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
867        unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
868    }
869    #[inline(always)]
870    fn mul_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
871        unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
872    }
873    #[inline(always)]
874    fn and_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
875        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
876    }
877    #[inline(always)]
878    fn or_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
879        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
880    }
881    #[inline(always)]
882    fn xor_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
883        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
884    }
885    #[inline(always)]
886    fn shr_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
887        unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
888    }
889    #[inline(always)]
890    fn simd_eq_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
891        unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
892    }
893    #[inline(always)]
894    fn simd_lt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
895        unsafe { _mm_cmplt_epi32(a.into(), b.into()).simd_into(self) }
896    }
897    #[inline(always)]
898    fn simd_le_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
899        unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) }
900    }
901    #[inline(always)]
902    fn simd_ge_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
903        unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) }
904    }
905    #[inline(always)]
906    fn simd_gt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
907        unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) }
908    }
909    #[inline(always)]
910    fn zip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
911        unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
912    }
913    #[inline(always)]
914    fn zip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
915        unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
916    }
917    #[inline(always)]
918    fn unzip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
919        unsafe {
920            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
921            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
922            _mm_unpacklo_epi64(t1, t2).simd_into(self)
923        }
924    }
925    #[inline(always)]
926    fn unzip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
927        unsafe {
928            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
929            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
930            _mm_unpackhi_epi64(t1, t2).simd_into(self)
931        }
932    }
933    #[inline(always)]
934    fn select_i32x4(self, a: mask32x4<Self>, b: i32x4<Self>, c: i32x4<Self>) -> i32x4<Self> {
935        unsafe {
936            _mm_or_si128(
937                _mm_and_si128(a.into(), b.into()),
938                _mm_andnot_si128(a.into(), c.into()),
939            )
940            .simd_into(self)
941        }
942    }
943    #[inline(always)]
944    fn min_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
945        unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) }
946    }
947    #[inline(always)]
948    fn max_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
949        unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) }
950    }
951    #[inline(always)]
952    fn combine_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x8<Self> {
953        let mut result = [0; 8usize];
954        result[0..4usize].copy_from_slice(&a.val);
955        result[4usize..8usize].copy_from_slice(&b.val);
956        result.simd_into(self)
957    }
958    #[inline(always)]
959    fn reinterpret_u8_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
960        u8x16 {
961            val: bytemuck::cast(a.val),
962            simd: a.simd,
963        }
964    }
965    #[inline(always)]
966    fn reinterpret_u32_i32x4(self, a: i32x4<Self>) -> u32x4<Self> {
967        u32x4 {
968            val: bytemuck::cast(a.val),
969            simd: a.simd,
970        }
971    }
972    #[inline(always)]
973    fn cvt_f32_i32x4(self, a: i32x4<Self>) -> f32x4<Self> {
974        unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) }
975    }
976    #[inline(always)]
977    fn splat_u32x4(self, val: u32) -> u32x4<Self> {
978        unsafe { _mm_set1_epi32(val as _).simd_into(self) }
979    }
980    #[inline(always)]
981    fn not_u32x4(self, a: u32x4<Self>) -> u32x4<Self> {
982        a ^ !0
983    }
984    #[inline(always)]
985    fn add_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
986        unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
987    }
988    #[inline(always)]
989    fn sub_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
990        unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
991    }
992    #[inline(always)]
993    fn mul_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
994        unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
995    }
996    #[inline(always)]
997    fn and_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
998        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
999    }
1000    #[inline(always)]
1001    fn or_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1002        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1003    }
1004    #[inline(always)]
1005    fn xor_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1006        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1007    }
1008    #[inline(always)]
1009    fn shr_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
1010        unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
1011    }
1012    #[inline(always)]
1013    fn simd_eq_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1014        unsafe {
1015            let sign_bit = _mm_set1_epi32(0x80000000u32 as _);
1016            let a_signed = _mm_xor_si128(a.into(), sign_bit);
1017            let b_signed = _mm_xor_si128(b.into(), sign_bit);
1018            _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self)
1019        }
1020    }
1021    #[inline(always)]
1022    fn simd_lt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1023        unsafe {
1024            let sign_bit = _mm_set1_epi32(0x80000000u32 as _);
1025            let a_signed = _mm_xor_si128(a.into(), sign_bit);
1026            let b_signed = _mm_xor_si128(b.into(), sign_bit);
1027            _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self)
1028        }
1029    }
1030    #[inline(always)]
1031    fn simd_le_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1032        unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1033    }
1034    #[inline(always)]
1035    fn simd_ge_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1036        unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1037    }
1038    #[inline(always)]
1039    fn simd_gt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1040        unsafe {
1041            let sign_bit = _mm_set1_epi32(0x80000000u32 as _);
1042            let a_signed = _mm_xor_si128(a.into(), sign_bit);
1043            let b_signed = _mm_xor_si128(b.into(), sign_bit);
1044            _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self)
1045        }
1046    }
1047    #[inline(always)]
1048    fn zip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1049        unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
1050    }
1051    #[inline(always)]
1052    fn zip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1053        unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
1054    }
1055    #[inline(always)]
1056    fn unzip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1057        unsafe {
1058            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1059            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1060            _mm_unpacklo_epi64(t1, t2).simd_into(self)
1061        }
1062    }
1063    #[inline(always)]
1064    fn unzip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1065        unsafe {
1066            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1067            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1068            _mm_unpackhi_epi64(t1, t2).simd_into(self)
1069        }
1070    }
1071    #[inline(always)]
1072    fn select_u32x4(self, a: mask32x4<Self>, b: u32x4<Self>, c: u32x4<Self>) -> u32x4<Self> {
1073        unsafe {
1074            _mm_or_si128(
1075                _mm_and_si128(a.into(), b.into()),
1076                _mm_andnot_si128(a.into(), c.into()),
1077            )
1078            .simd_into(self)
1079        }
1080    }
1081    #[inline(always)]
1082    fn min_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1083        unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) }
1084    }
1085    #[inline(always)]
1086    fn max_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1087        unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) }
1088    }
1089    #[inline(always)]
1090    fn combine_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x8<Self> {
1091        let mut result = [0; 8usize];
1092        result[0..4usize].copy_from_slice(&a.val);
1093        result[4usize..8usize].copy_from_slice(&b.val);
1094        result.simd_into(self)
1095    }
1096    #[inline(always)]
1097    fn reinterpret_u8_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
1098        u8x16 {
1099            val: bytemuck::cast(a.val),
1100            simd: a.simd,
1101        }
1102    }
1103    #[inline(always)]
1104    fn cvt_f32_u32x4(self, a: u32x4<Self>) -> f32x4<Self> {
1105        unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) }
1106    }
1107    #[inline(always)]
1108    fn splat_mask32x4(self, val: i32) -> mask32x4<Self> {
1109        unsafe { _mm_set1_epi32(val).simd_into(self) }
1110    }
1111    #[inline(always)]
1112    fn not_mask32x4(self, a: mask32x4<Self>) -> mask32x4<Self> {
1113        a ^ !0
1114    }
1115    #[inline(always)]
1116    fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1117        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1118    }
1119    #[inline(always)]
1120    fn or_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1121        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1122    }
1123    #[inline(always)]
1124    fn xor_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1125        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1126    }
1127    #[inline(always)]
1128    fn select_mask32x4(
1129        self,
1130        a: mask32x4<Self>,
1131        b: mask32x4<Self>,
1132        c: mask32x4<Self>,
1133    ) -> mask32x4<Self> {
1134        unsafe {
1135            _mm_or_si128(
1136                _mm_and_si128(a.into(), b.into()),
1137                _mm_andnot_si128(a.into(), c.into()),
1138            )
1139            .simd_into(self)
1140        }
1141    }
1142    #[inline(always)]
1143    fn simd_eq_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1144        unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
1145    }
1146    #[inline(always)]
1147    fn combine_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x8<Self> {
1148        let mut result = [0; 8usize];
1149        result[0..4usize].copy_from_slice(&a.val);
1150        result[4usize..8usize].copy_from_slice(&b.val);
1151        result.simd_into(self)
1152    }
1153    #[inline(always)]
1154    fn splat_f64x2(self, val: f64) -> f64x2<Self> {
1155        unsafe { _mm_set1_pd(val).simd_into(self) }
1156    }
1157    #[inline(always)]
1158    fn abs_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1159        unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) }
1160    }
1161    #[inline(always)]
1162    fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1163        unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) }
1164    }
1165    #[inline(always)]
1166    fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1167        unsafe { _mm_sqrt_pd(a.into()).simd_into(self) }
1168    }
1169    #[inline(always)]
1170    fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1171        unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) }
1172    }
1173    #[inline(always)]
1174    fn sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1175        unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) }
1176    }
1177    #[inline(always)]
1178    fn mul_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1179        unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) }
1180    }
1181    #[inline(always)]
1182    fn div_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1183        unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) }
1184    }
1185    #[inline(always)]
1186    fn copysign_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1187        unsafe {
1188            let mask = _mm_set1_pd(-0.0);
1189            _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self)
1190        }
1191    }
1192    #[inline(always)]
1193    fn simd_eq_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1194        unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) }
1195    }
1196    #[inline(always)]
1197    fn simd_lt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1198        unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) }
1199    }
1200    #[inline(always)]
1201    fn simd_le_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1202        unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) }
1203    }
1204    #[inline(always)]
1205    fn simd_ge_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1206        unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) }
1207    }
1208    #[inline(always)]
1209    fn simd_gt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1210        unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) }
1211    }
1212    #[inline(always)]
1213    fn zip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1214        unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) }
1215    }
1216    #[inline(always)]
1217    fn zip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1218        unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) }
1219    }
1220    #[inline(always)]
1221    fn unzip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1222        unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) }
1223    }
1224    #[inline(always)]
1225    fn unzip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1226        unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) }
1227    }
1228    #[inline(always)]
1229    fn max_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1230        unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) }
1231    }
1232    #[inline(always)]
1233    fn max_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1234        unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) }
1235    }
1236    #[inline(always)]
1237    fn min_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1238        unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) }
1239    }
1240    #[inline(always)]
1241    fn min_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1242        unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) }
1243    }
1244    #[inline(always)]
1245    fn madd_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
1246        a + b * c
1247    }
1248    #[inline(always)]
1249    fn msub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
1250        a - b * c
1251    }
1252    #[inline(always)]
1253    fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1254        unsafe { _mm_floor_pd(a.into()).simd_into(self) }
1255    }
1256    #[inline(always)]
1257    fn fract_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1258        a - a.trunc()
1259    }
1260    #[inline(always)]
1261    fn trunc_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1262        unsafe { _mm_round_pd(a.into(), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC).simd_into(self) }
1263    }
1264    #[inline(always)]
1265    fn select_f64x2(self, a: mask64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
1266        unsafe {
1267            let mask = _mm_castsi128_pd(a.into());
1268            _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, c.into())).simd_into(self)
1269        }
1270    }
1271    #[inline(always)]
1272    fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self> {
1273        let mut result = [0.0; 4usize];
1274        result[0..2usize].copy_from_slice(&a.val);
1275        result[2usize..4usize].copy_from_slice(&b.val);
1276        result.simd_into(self)
1277    }
1278    #[inline(always)]
1279    fn reinterpret_f32_f64x2(self, a: f64x2<Self>) -> f32x4<Self> {
1280        f32x4 {
1281            val: bytemuck::cast(a.val),
1282            simd: a.simd,
1283        }
1284    }
1285    #[inline(always)]
1286    fn splat_mask64x2(self, val: i64) -> mask64x2<Self> {
1287        unsafe { _mm_set1_epi64x(val).simd_into(self) }
1288    }
1289    #[inline(always)]
1290    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
1291        a ^ !0
1292    }
1293    #[inline(always)]
1294    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1295        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1296    }
1297    #[inline(always)]
1298    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1299        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1300    }
1301    #[inline(always)]
1302    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1303        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1304    }
1305    #[inline(always)]
1306    fn select_mask64x2(
1307        self,
1308        a: mask64x2<Self>,
1309        b: mask64x2<Self>,
1310        c: mask64x2<Self>,
1311    ) -> mask64x2<Self> {
1312        unsafe {
1313            _mm_or_si128(
1314                _mm_and_si128(a.into(), b.into()),
1315                _mm_andnot_si128(a.into(), c.into()),
1316            )
1317            .simd_into(self)
1318        }
1319    }
1320    #[inline(always)]
1321    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1322        unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) }
1323    }
1324    #[inline(always)]
1325    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
1326        let mut result = [0; 4usize];
1327        result[0..2usize].copy_from_slice(&a.val);
1328        result[2usize..4usize].copy_from_slice(&b.val);
1329        result.simd_into(self)
1330    }
1331    #[inline(always)]
1332    fn splat_f32x8(self, a: f32) -> f32x8<Self> {
1333        let half = self.splat_f32x4(a);
1334        self.combine_f32x4(half, half)
1335    }
1336    #[inline(always)]
1337    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1338        let (a0, a1) = self.split_f32x8(a);
1339        self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1))
1340    }
1341    #[inline(always)]
1342    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1343        let (a0, a1) = self.split_f32x8(a);
1344        self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1))
1345    }
1346    #[inline(always)]
1347    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1348        let (a0, a1) = self.split_f32x8(a);
1349        self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
1350    }
1351    #[inline(always)]
1352    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1353        let (a0, a1) = self.split_f32x8(a);
1354        let (b0, b1) = self.split_f32x8(b);
1355        self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1))
1356    }
1357    #[inline(always)]
1358    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1359        let (a0, a1) = self.split_f32x8(a);
1360        let (b0, b1) = self.split_f32x8(b);
1361        self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1))
1362    }
1363    #[inline(always)]
1364    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1365        let (a0, a1) = self.split_f32x8(a);
1366        let (b0, b1) = self.split_f32x8(b);
1367        self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1))
1368    }
1369    #[inline(always)]
1370    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1371        let (a0, a1) = self.split_f32x8(a);
1372        let (b0, b1) = self.split_f32x8(b);
1373        self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1))
1374    }
1375    #[inline(always)]
1376    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1377        let (a0, a1) = self.split_f32x8(a);
1378        let (b0, b1) = self.split_f32x8(b);
1379        self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1))
1380    }
1381    #[inline(always)]
1382    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1383        let (a0, a1) = self.split_f32x8(a);
1384        let (b0, b1) = self.split_f32x8(b);
1385        self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1))
1386    }
1387    #[inline(always)]
1388    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1389        let (a0, a1) = self.split_f32x8(a);
1390        let (b0, b1) = self.split_f32x8(b);
1391        self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1))
1392    }
1393    #[inline(always)]
1394    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1395        let (a0, a1) = self.split_f32x8(a);
1396        let (b0, b1) = self.split_f32x8(b);
1397        self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1))
1398    }
1399    #[inline(always)]
1400    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1401        let (a0, a1) = self.split_f32x8(a);
1402        let (b0, b1) = self.split_f32x8(b);
1403        self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1))
1404    }
1405    #[inline(always)]
1406    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1407        let (a0, a1) = self.split_f32x8(a);
1408        let (b0, b1) = self.split_f32x8(b);
1409        self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1))
1410    }
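    // zip_low/zip_high interleave lanes from a and b; for the 8-lane type,
    // zip_low consumes only the low halves of both inputs:
    //     zip_low([a0..a7], [b0..b7])  -> [a0, b0, a1, b1, a2, b2, a3, b3]
    //     zip_high([a0..a7], [b0..b7]) -> [a4, b4, a5, b5, a6, b6, a7, b7]
    // unzip_low/unzip_high do the reverse, gathering the even (respectively odd)
    // lanes of a followed by those of b.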
1411    #[inline(always)]
1412    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1413        let (a0, _) = self.split_f32x8(a);
1414        let (b0, _) = self.split_f32x8(b);
1415        self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0))
1416    }
1417    #[inline(always)]
1418    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1419        let (_, a1) = self.split_f32x8(a);
1420        let (_, b1) = self.split_f32x8(b);
1421        self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1))
1422    }
1423    #[inline(always)]
1424    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1425        let (a0, a1) = self.split_f32x8(a);
1426        let (b0, b1) = self.split_f32x8(b);
1427        self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1))
1428    }
1429    #[inline(always)]
1430    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1431        let (a0, a1) = self.split_f32x8(a);
1432        let (b0, b1) = self.split_f32x8(b);
1433        self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1))
1434    }
1435    #[inline(always)]
1436    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1437        let (a0, a1) = self.split_f32x8(a);
1438        let (b0, b1) = self.split_f32x8(b);
1439        self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1))
1440    }
1441    #[inline(always)]
1442    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1443        let (a0, a1) = self.split_f32x8(a);
1444        let (b0, b1) = self.split_f32x8(b);
1445        self.combine_f32x4(
1446            self.max_precise_f32x4(a0, b0),
1447            self.max_precise_f32x4(a1, b1),
1448        )
1449    }
1450    #[inline(always)]
1451    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1452        let (a0, a1) = self.split_f32x8(a);
1453        let (b0, b1) = self.split_f32x8(b);
1454        self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1))
1455    }
1456    #[inline(always)]
1457    fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1458        let (a0, a1) = self.split_f32x8(a);
1459        let (b0, b1) = self.split_f32x8(b);
1460        self.combine_f32x4(
1461            self.min_precise_f32x4(a0, b0),
1462            self.min_precise_f32x4(a1, b1),
1463        )
1464    }
1465    #[inline(always)]
1466    fn madd_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
1467        let (a0, a1) = self.split_f32x8(a);
1468        let (b0, b1) = self.split_f32x8(b);
1469        let (c0, c1) = self.split_f32x8(c);
1470        self.combine_f32x4(self.madd_f32x4(a0, b0, c0), self.madd_f32x4(a1, b1, c1))
1471    }
1472    #[inline(always)]
1473    fn msub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
1474        let (a0, a1) = self.split_f32x8(a);
1475        let (b0, b1) = self.split_f32x8(b);
1476        let (c0, c1) = self.split_f32x8(c);
1477        self.combine_f32x4(self.msub_f32x4(a0, b0, c0), self.msub_f32x4(a1, b1, c1))
1478    }
1479    #[inline(always)]
1480    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1481        let (a0, a1) = self.split_f32x8(a);
1482        self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
1483    }
1484    #[inline(always)]
1485    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1486        let (a0, a1) = self.split_f32x8(a);
1487        self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1))
1488    }
1489    #[inline(always)]
1490    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1491        let (a0, a1) = self.split_f32x8(a);
1492        self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1))
1493    }
1494    #[inline(always)]
1495    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
1496        let (a0, a1) = self.split_mask32x8(a);
1497        let (b0, b1) = self.split_f32x8(b);
1498        let (c0, c1) = self.split_f32x8(c);
1499        self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1))
1500    }
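    // combine_f32x8/split_f32x8 are the 4-lane <-> 8-lane glue used by all of
    // the f32x8 operations above: combine concatenates two f32x4 values, split
    // copies the low and high halves back out into two stack arrays.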
1501    #[inline(always)]
1502    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
1503        let mut result = [0.0; 16usize];
1504        result[0..8usize].copy_from_slice(&a.val);
1505        result[8usize..16usize].copy_from_slice(&b.val);
1506        result.simd_into(self)
1507    }
1508    #[inline(always)]
1509    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
1510        let mut b0 = [0.0; 4usize];
1511        let mut b1 = [0.0; 4usize];
1512        b0.copy_from_slice(&a.val[0..4usize]);
1513        b1.copy_from_slice(&a.val[4usize..8usize]);
1514        (b0.simd_into(self), b1.simd_into(self))
1515    }
1516    #[inline(always)]
1517    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
1518        let (a0, a1) = self.split_f32x8(a);
1519        self.combine_f64x2(
1520            self.reinterpret_f64_f32x4(a0),
1521            self.reinterpret_f64_f32x4(a1),
1522        )
1523    }
1524    #[inline(always)]
1525    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
1526        let (a0, a1) = self.split_f32x8(a);
1527        self.combine_i32x4(
1528            self.reinterpret_i32_f32x4(a0),
1529            self.reinterpret_i32_f32x4(a1),
1530        )
1531    }
1532    #[inline(always)]
1533    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
1534        let (a0, a1) = self.split_f32x8(a);
1535        self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1))
1536    }
1537    #[inline(always)]
1538    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
1539        let (a0, a1) = self.split_f32x8(a);
1540        self.combine_u32x4(
1541            self.reinterpret_u32_f32x4(a0),
1542            self.reinterpret_u32_f32x4(a1),
1543        )
1544    }
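    // The float -> integer conversions below only forward per half; rounding and
    // out-of-range behavior are whatever the 4-lane cvt_u32_f32x4 / cvt_i32_f32x4
    // implementations provide.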
1545    #[inline(always)]
1546    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
1547        let (a0, a1) = self.split_f32x8(a);
1548        self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1))
1549    }
1550    #[inline(always)]
1551    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
1552        let (a0, a1) = self.split_f32x8(a);
1553        self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1))
1554    }
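    // The 256-bit integer and mask types follow the same pattern as f32x8 above:
    // every operation splits into two 128-bit halves, forwards to the narrower
    // implementation, and recombines.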
1555    #[inline(always)]
1556    fn splat_i8x32(self, a: i8) -> i8x32<Self> {
1557        let half = self.splat_i8x16(a);
1558        self.combine_i8x16(half, half)
1559    }
1560    #[inline(always)]
1561    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
1562        let (a0, a1) = self.split_i8x32(a);
1563        self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1))
1564    }
1565    #[inline(always)]
1566    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1567        let (a0, a1) = self.split_i8x32(a);
1568        let (b0, b1) = self.split_i8x32(b);
1569        self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1))
1570    }
1571    #[inline(always)]
1572    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1573        let (a0, a1) = self.split_i8x32(a);
1574        let (b0, b1) = self.split_i8x32(b);
1575        self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1))
1576    }
1577    #[inline(always)]
1578    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1579        let (a0, a1) = self.split_i8x32(a);
1580        let (b0, b1) = self.split_i8x32(b);
1581        self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1))
1582    }
1583    #[inline(always)]
1584    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1585        let (a0, a1) = self.split_i8x32(a);
1586        let (b0, b1) = self.split_i8x32(b);
1587        self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1))
1588    }
1589    #[inline(always)]
1590    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1591        let (a0, a1) = self.split_i8x32(a);
1592        let (b0, b1) = self.split_i8x32(b);
1593        self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1))
1594    }
1595    #[inline(always)]
1596    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1597        let (a0, a1) = self.split_i8x32(a);
1598        let (b0, b1) = self.split_i8x32(b);
1599        self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1))
1600    }
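    // shr_* shifts every lane right by the same scalar amount `b`; the wide
    // forms simply shift each 128-bit half.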
1601    #[inline(always)]
1602    fn shr_i8x32(self, a: i8x32<Self>, b: u32) -> i8x32<Self> {
1603        let (a0, a1) = self.split_i8x32(a);
1604        self.combine_i8x16(self.shr_i8x16(a0, b), self.shr_i8x16(a1, b))
1605    }
1606    #[inline(always)]
1607    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1608        let (a0, a1) = self.split_i8x32(a);
1609        let (b0, b1) = self.split_i8x32(b);
1610        self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1))
1611    }
1612    #[inline(always)]
1613    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1614        let (a0, a1) = self.split_i8x32(a);
1615        let (b0, b1) = self.split_i8x32(b);
1616        self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1))
1617    }
1618    #[inline(always)]
1619    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1620        let (a0, a1) = self.split_i8x32(a);
1621        let (b0, b1) = self.split_i8x32(b);
1622        self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1))
1623    }
1624    #[inline(always)]
1625    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1626        let (a0, a1) = self.split_i8x32(a);
1627        let (b0, b1) = self.split_i8x32(b);
1628        self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1))
1629    }
1630    #[inline(always)]
1631    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1632        let (a0, a1) = self.split_i8x32(a);
1633        let (b0, b1) = self.split_i8x32(b);
1634        self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1))
1635    }
1636    #[inline(always)]
1637    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1638        let (a0, _) = self.split_i8x32(a);
1639        let (b0, _) = self.split_i8x32(b);
1640        self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0))
1641    }
1642    #[inline(always)]
1643    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1644        let (_, a1) = self.split_i8x32(a);
1645        let (_, b1) = self.split_i8x32(b);
1646        self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1))
1647    }
1648    #[inline(always)]
1649    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1650        let (a0, a1) = self.split_i8x32(a);
1651        let (b0, b1) = self.split_i8x32(b);
1652        self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1))
1653    }
1654    #[inline(always)]
1655    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1656        let (a0, a1) = self.split_i8x32(a);
1657        let (b0, b1) = self.split_i8x32(b);
1658        self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1))
1659    }
1660    #[inline(always)]
1661    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
1662        let (a0, a1) = self.split_mask8x32(a);
1663        let (b0, b1) = self.split_i8x32(b);
1664        let (c0, c1) = self.split_i8x32(c);
1665        self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1))
1666    }
1667    #[inline(always)]
1668    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1669        let (a0, a1) = self.split_i8x32(a);
1670        let (b0, b1) = self.split_i8x32(b);
1671        self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1))
1672    }
1673    #[inline(always)]
1674    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1675        let (a0, a1) = self.split_i8x32(a);
1676        let (b0, b1) = self.split_i8x32(b);
1677        self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1))
1678    }
1679    #[inline(always)]
1680    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
1681        let mut result = [0; 64usize];
1682        result[0..32usize].copy_from_slice(&a.val);
1683        result[32usize..64usize].copy_from_slice(&b.val);
1684        result.simd_into(self)
1685    }
1686    #[inline(always)]
1687    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
1688        let mut b0 = [0; 16usize];
1689        let mut b1 = [0; 16usize];
1690        b0.copy_from_slice(&a.val[0..16usize]);
1691        b1.copy_from_slice(&a.val[16usize..32usize]);
1692        (b0.simd_into(self), b1.simd_into(self))
1693    }
1694    #[inline(always)]
1695    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
1696        let (a0, a1) = self.split_i8x32(a);
1697        self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1))
1698    }
1699    #[inline(always)]
1700    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
1701        let (a0, a1) = self.split_i8x32(a);
1702        self.combine_u32x4(
1703            self.reinterpret_u32_i8x16(a0),
1704            self.reinterpret_u32_i8x16(a1),
1705        )
1706    }
1707    #[inline(always)]
1708    fn splat_u8x32(self, a: u8) -> u8x32<Self> {
1709        let half = self.splat_u8x16(a);
1710        self.combine_u8x16(half, half)
1711    }
1712    #[inline(always)]
1713    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
1714        let (a0, a1) = self.split_u8x32(a);
1715        self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1))
1716    }
1717    #[inline(always)]
1718    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1719        let (a0, a1) = self.split_u8x32(a);
1720        let (b0, b1) = self.split_u8x32(b);
1721        self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1))
1722    }
1723    #[inline(always)]
1724    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1725        let (a0, a1) = self.split_u8x32(a);
1726        let (b0, b1) = self.split_u8x32(b);
1727        self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1))
1728    }
1729    #[inline(always)]
1730    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1731        let (a0, a1) = self.split_u8x32(a);
1732        let (b0, b1) = self.split_u8x32(b);
1733        self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1))
1734    }
1735    #[inline(always)]
1736    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1737        let (a0, a1) = self.split_u8x32(a);
1738        let (b0, b1) = self.split_u8x32(b);
1739        self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1))
1740    }
1741    #[inline(always)]
1742    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1743        let (a0, a1) = self.split_u8x32(a);
1744        let (b0, b1) = self.split_u8x32(b);
1745        self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1))
1746    }
1747    #[inline(always)]
1748    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1749        let (a0, a1) = self.split_u8x32(a);
1750        let (b0, b1) = self.split_u8x32(b);
1751        self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1))
1752    }
1753    #[inline(always)]
1754    fn shr_u8x32(self, a: u8x32<Self>, b: u32) -> u8x32<Self> {
1755        let (a0, a1) = self.split_u8x32(a);
1756        self.combine_u8x16(self.shr_u8x16(a0, b), self.shr_u8x16(a1, b))
1757    }
1758    #[inline(always)]
1759    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1760        let (a0, a1) = self.split_u8x32(a);
1761        let (b0, b1) = self.split_u8x32(b);
1762        self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1))
1763    }
1764    #[inline(always)]
1765    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1766        let (a0, a1) = self.split_u8x32(a);
1767        let (b0, b1) = self.split_u8x32(b);
1768        self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1))
1769    }
1770    #[inline(always)]
1771    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1772        let (a0, a1) = self.split_u8x32(a);
1773        let (b0, b1) = self.split_u8x32(b);
1774        self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1))
1775    }
1776    #[inline(always)]
1777    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1778        let (a0, a1) = self.split_u8x32(a);
1779        let (b0, b1) = self.split_u8x32(b);
1780        self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1))
1781    }
1782    #[inline(always)]
1783    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1784        let (a0, a1) = self.split_u8x32(a);
1785        let (b0, b1) = self.split_u8x32(b);
1786        self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1))
1787    }
1788    #[inline(always)]
1789    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1790        let (a0, _) = self.split_u8x32(a);
1791        let (b0, _) = self.split_u8x32(b);
1792        self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0))
1793    }
1794    #[inline(always)]
1795    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1796        let (_, a1) = self.split_u8x32(a);
1797        let (_, b1) = self.split_u8x32(b);
1798        self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1))
1799    }
1800    #[inline(always)]
1801    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1802        let (a0, a1) = self.split_u8x32(a);
1803        let (b0, b1) = self.split_u8x32(b);
1804        self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1))
1805    }
1806    #[inline(always)]
1807    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1808        let (a0, a1) = self.split_u8x32(a);
1809        let (b0, b1) = self.split_u8x32(b);
1810        self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1))
1811    }
1812    #[inline(always)]
1813    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
1814        let (a0, a1) = self.split_mask8x32(a);
1815        let (b0, b1) = self.split_u8x32(b);
1816        let (c0, c1) = self.split_u8x32(c);
1817        self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1))
1818    }
1819    #[inline(always)]
1820    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1821        let (a0, a1) = self.split_u8x32(a);
1822        let (b0, b1) = self.split_u8x32(b);
1823        self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1))
1824    }
1825    #[inline(always)]
1826    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1827        let (a0, a1) = self.split_u8x32(a);
1828        let (b0, b1) = self.split_u8x32(b);
1829        self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1))
1830    }
1831    #[inline(always)]
1832    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
1833        let mut result = [0; 64usize];
1834        result[0..32usize].copy_from_slice(&a.val);
1835        result[32usize..64usize].copy_from_slice(&b.val);
1836        result.simd_into(self)
1837    }
1838    #[inline(always)]
1839    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
1840        let mut b0 = [0; 16usize];
1841        let mut b1 = [0; 16usize];
1842        b0.copy_from_slice(&a.val[0..16usize]);
1843        b1.copy_from_slice(&a.val[16usize..32usize]);
1844        (b0.simd_into(self), b1.simd_into(self))
1845    }
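    // widen_u8x32 widens each byte lane to 16 bits by forwarding to widen_u8x16
    // per half; for unsigned input this amounts to a zero-extension.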
1846    #[inline(always)]
1847    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
1848        let (a0, a1) = self.split_u8x32(a);
1849        self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1))
1850    }
1851    #[inline(always)]
1852    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
1853        let (a0, a1) = self.split_u8x32(a);
1854        self.combine_u32x4(
1855            self.reinterpret_u32_u8x16(a0),
1856            self.reinterpret_u32_u8x16(a1),
1857        )
1858    }
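    // Mask types use the same split/combine emulation as the data types;
    // splat_mask8x32 broadcasts the given lane value, conventionally 0 for false
    // and -1 (all bits set) for true.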
1859    #[inline(always)]
1860    fn splat_mask8x32(self, a: i8) -> mask8x32<Self> {
1861        let half = self.splat_mask8x16(a);
1862        self.combine_mask8x16(half, half)
1863    }
1864    #[inline(always)]
1865    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
1866        let (a0, a1) = self.split_mask8x32(a);
1867        self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1))
1868    }
1869    #[inline(always)]
1870    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
1871        let (a0, a1) = self.split_mask8x32(a);
1872        let (b0, b1) = self.split_mask8x32(b);
1873        self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1))
1874    }
1875    #[inline(always)]
1876    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
1877        let (a0, a1) = self.split_mask8x32(a);
1878        let (b0, b1) = self.split_mask8x32(b);
1879        self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1))
1880    }
1881    #[inline(always)]
1882    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
1883        let (a0, a1) = self.split_mask8x32(a);
1884        let (b0, b1) = self.split_mask8x32(b);
1885        self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1))
1886    }
1887    #[inline(always)]
1888    fn select_mask8x32(
1889        self,
1890        a: mask8x32<Self>,
1891        b: mask8x32<Self>,
1892        c: mask8x32<Self>,
1893    ) -> mask8x32<Self> {
1894        let (a0, a1) = self.split_mask8x32(a);
1895        let (b0, b1) = self.split_mask8x32(b);
1896        let (c0, c1) = self.split_mask8x32(c);
1897        self.combine_mask8x16(
1898            self.select_mask8x16(a0, b0, c0),
1899            self.select_mask8x16(a1, b1, c1),
1900        )
1901    }
1902    #[inline(always)]
1903    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
1904        let (a0, a1) = self.split_mask8x32(a);
1905        let (b0, b1) = self.split_mask8x32(b);
1906        self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1))
1907    }
1908    #[inline(always)]
1909    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
1910        let mut result = [0; 64usize];
1911        result[0..32usize].copy_from_slice(&a.val);
1912        result[32usize..64usize].copy_from_slice(&b.val);
1913        result.simd_into(self)
1914    }
1915    #[inline(always)]
1916    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
1917        let mut b0 = [0; 16usize];
1918        let mut b1 = [0; 16usize];
1919        b0.copy_from_slice(&a.val[0..16usize]);
1920        b1.copy_from_slice(&a.val[16usize..32usize]);
1921        (b0.simd_into(self), b1.simd_into(self))
1922    }
1923    #[inline(always)]
1924    fn splat_i16x16(self, a: i16) -> i16x16<Self> {
1925        let half = self.splat_i16x8(a);
1926        self.combine_i16x8(half, half)
1927    }
1928    #[inline(always)]
1929    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
1930        let (a0, a1) = self.split_i16x16(a);
1931        self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1))
1932    }
1933    #[inline(always)]
1934    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
1935        let (a0, a1) = self.split_i16x16(a);
1936        let (b0, b1) = self.split_i16x16(b);
1937        self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1))
1938    }
1939    #[inline(always)]
1940    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
1941        let (a0, a1) = self.split_i16x16(a);
1942        let (b0, b1) = self.split_i16x16(b);
1943        self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1))
1944    }
1945    #[inline(always)]
1946    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
1947        let (a0, a1) = self.split_i16x16(a);
1948        let (b0, b1) = self.split_i16x16(b);
1949        self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1))
1950    }
1951    #[inline(always)]
1952    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
1953        let (a0, a1) = self.split_i16x16(a);
1954        let (b0, b1) = self.split_i16x16(b);
1955        self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1))
1956    }
1957    #[inline(always)]
1958    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
1959        let (a0, a1) = self.split_i16x16(a);
1960        let (b0, b1) = self.split_i16x16(b);
1961        self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1))
1962    }
1963    #[inline(always)]
1964    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
1965        let (a0, a1) = self.split_i16x16(a);
1966        let (b0, b1) = self.split_i16x16(b);
1967        self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1))
1968    }
1969    #[inline(always)]
1970    fn shr_i16x16(self, a: i16x16<Self>, b: u32) -> i16x16<Self> {
1971        let (a0, a1) = self.split_i16x16(a);
1972        self.combine_i16x8(self.shr_i16x8(a0, b), self.shr_i16x8(a1, b))
1973    }
1974    #[inline(always)]
1975    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
1976        let (a0, a1) = self.split_i16x16(a);
1977        let (b0, b1) = self.split_i16x16(b);
1978        self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1))
1979    }
1980    #[inline(always)]
1981    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
1982        let (a0, a1) = self.split_i16x16(a);
1983        let (b0, b1) = self.split_i16x16(b);
1984        self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1))
1985    }
1986    #[inline(always)]
1987    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
1988        let (a0, a1) = self.split_i16x16(a);
1989        let (b0, b1) = self.split_i16x16(b);
1990        self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1))
1991    }
1992    #[inline(always)]
1993    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
1994        let (a0, a1) = self.split_i16x16(a);
1995        let (b0, b1) = self.split_i16x16(b);
1996        self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1))
1997    }
1998    #[inline(always)]
1999    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
2000        let (a0, a1) = self.split_i16x16(a);
2001        let (b0, b1) = self.split_i16x16(b);
2002        self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1))
2003    }
2004    #[inline(always)]
2005    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2006        let (a0, _) = self.split_i16x16(a);
2007        let (b0, _) = self.split_i16x16(b);
2008        self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0))
2009    }
2010    #[inline(always)]
2011    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2012        let (_, a1) = self.split_i16x16(a);
2013        let (_, b1) = self.split_i16x16(b);
2014        self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1))
2015    }
2016    #[inline(always)]
2017    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2018        let (a0, a1) = self.split_i16x16(a);
2019        let (b0, b1) = self.split_i16x16(b);
2020        self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1))
2021    }
2022    #[inline(always)]
2023    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2024        let (a0, a1) = self.split_i16x16(a);
2025        let (b0, b1) = self.split_i16x16(b);
2026        self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1))
2027    }
2028    #[inline(always)]
2029    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
2030        let (a0, a1) = self.split_mask16x16(a);
2031        let (b0, b1) = self.split_i16x16(b);
2032        let (c0, c1) = self.split_i16x16(c);
2033        self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1))
2034    }
2035    #[inline(always)]
2036    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2037        let (a0, a1) = self.split_i16x16(a);
2038        let (b0, b1) = self.split_i16x16(b);
2039        self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1))
2040    }
2041    #[inline(always)]
2042    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2043        let (a0, a1) = self.split_i16x16(a);
2044        let (b0, b1) = self.split_i16x16(b);
2045        self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1))
2046    }
2047    #[inline(always)]
2048    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
2049        let mut result = [0; 32usize];
2050        result[0..16usize].copy_from_slice(&a.val);
2051        result[16usize..32usize].copy_from_slice(&b.val);
2052        result.simd_into(self)
2053    }
2054    #[inline(always)]
2055    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
2056        let mut b0 = [0; 8usize];
2057        let mut b1 = [0; 8usize];
2058        b0.copy_from_slice(&a.val[0..8usize]);
2059        b1.copy_from_slice(&a.val[8usize..16usize]);
2060        (b0.simd_into(self), b1.simd_into(self))
2061    }
2062    #[inline(always)]
2063    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
2064        let (a0, a1) = self.split_i16x16(a);
2065        self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1))
2066    }
2067    #[inline(always)]
2068    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
2069        let (a0, a1) = self.split_i16x16(a);
2070        self.combine_u32x4(
2071            self.reinterpret_u32_i16x8(a0),
2072            self.reinterpret_u32_i16x8(a1),
2073        )
2074    }
2075    #[inline(always)]
2076    fn splat_u16x16(self, a: u16) -> u16x16<Self> {
2077        let half = self.splat_u16x8(a);
2078        self.combine_u16x8(half, half)
2079    }
2080    #[inline(always)]
2081    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
2082        let (a0, a1) = self.split_u16x16(a);
2083        self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1))
2084    }
2085    #[inline(always)]
2086    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2087        let (a0, a1) = self.split_u16x16(a);
2088        let (b0, b1) = self.split_u16x16(b);
2089        self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1))
2090    }
2091    #[inline(always)]
2092    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2093        let (a0, a1) = self.split_u16x16(a);
2094        let (b0, b1) = self.split_u16x16(b);
2095        self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1))
2096    }
2097    #[inline(always)]
2098    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2099        let (a0, a1) = self.split_u16x16(a);
2100        let (b0, b1) = self.split_u16x16(b);
2101        self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1))
2102    }
2103    #[inline(always)]
2104    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2105        let (a0, a1) = self.split_u16x16(a);
2106        let (b0, b1) = self.split_u16x16(b);
2107        self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1))
2108    }
2109    #[inline(always)]
2110    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2111        let (a0, a1) = self.split_u16x16(a);
2112        let (b0, b1) = self.split_u16x16(b);
2113        self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1))
2114    }
2115    #[inline(always)]
2116    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2117        let (a0, a1) = self.split_u16x16(a);
2118        let (b0, b1) = self.split_u16x16(b);
2119        self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1))
2120    }
2121    #[inline(always)]
2122    fn shr_u16x16(self, a: u16x16<Self>, b: u32) -> u16x16<Self> {
2123        let (a0, a1) = self.split_u16x16(a);
2124        self.combine_u16x8(self.shr_u16x8(a0, b), self.shr_u16x8(a1, b))
2125    }
2126    #[inline(always)]
2127    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2128        let (a0, a1) = self.split_u16x16(a);
2129        let (b0, b1) = self.split_u16x16(b);
2130        self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1))
2131    }
2132    #[inline(always)]
2133    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2134        let (a0, a1) = self.split_u16x16(a);
2135        let (b0, b1) = self.split_u16x16(b);
2136        self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1))
2137    }
2138    #[inline(always)]
2139    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2140        let (a0, a1) = self.split_u16x16(a);
2141        let (b0, b1) = self.split_u16x16(b);
2142        self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1))
2143    }
2144    #[inline(always)]
2145    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2146        let (a0, a1) = self.split_u16x16(a);
2147        let (b0, b1) = self.split_u16x16(b);
2148        self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1))
2149    }
2150    #[inline(always)]
2151    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2152        let (a0, a1) = self.split_u16x16(a);
2153        let (b0, b1) = self.split_u16x16(b);
2154        self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1))
2155    }
2156    #[inline(always)]
2157    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2158        let (a0, _) = self.split_u16x16(a);
2159        let (b0, _) = self.split_u16x16(b);
2160        self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0))
2161    }
2162    #[inline(always)]
2163    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2164        let (_, a1) = self.split_u16x16(a);
2165        let (_, b1) = self.split_u16x16(b);
2166        self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1))
2167    }
2168    #[inline(always)]
2169    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2170        let (a0, a1) = self.split_u16x16(a);
2171        let (b0, b1) = self.split_u16x16(b);
2172        self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1))
2173    }
2174    #[inline(always)]
2175    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2176        let (a0, a1) = self.split_u16x16(a);
2177        let (b0, b1) = self.split_u16x16(b);
2178        self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1))
2179    }
2180    #[inline(always)]
2181    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
2182        let (a0, a1) = self.split_mask16x16(a);
2183        let (b0, b1) = self.split_u16x16(b);
2184        let (c0, c1) = self.split_u16x16(c);
2185        self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1))
2186    }
2187    #[inline(always)]
2188    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2189        let (a0, a1) = self.split_u16x16(a);
2190        let (b0, b1) = self.split_u16x16(b);
2191        self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1))
2192    }
2193    #[inline(always)]
2194    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2195        let (a0, a1) = self.split_u16x16(a);
2196        let (b0, b1) = self.split_u16x16(b);
2197        self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1))
2198    }
2199    #[inline(always)]
2200    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
2201        let mut result = [0; 32usize];
2202        result[0..16usize].copy_from_slice(&a.val);
2203        result[16usize..32usize].copy_from_slice(&b.val);
2204        result.simd_into(self)
2205    }
2206    #[inline(always)]
2207    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
2208        let mut b0 = [0; 8usize];
2209        let mut b1 = [0; 8usize];
2210        b0.copy_from_slice(&a.val[0..8usize]);
2211        b1.copy_from_slice(&a.val[8usize..16usize]);
2212        (b0.simd_into(self), b1.simd_into(self))
2213    }
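    // narrow_u16x16 narrows 16 u16 lanes to 16 u8 lanes. Each half is first
    // masked to its low byte so that the unsigned saturation of _mm_packus_epi16
    // never triggers; the pack then just re-packs the already-in-range bytes.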
2214    #[inline(always)]
2215    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
2216        let (a, b) = self.split_u16x16(a);
2217        unsafe {
2218            let mask = _mm_set1_epi16(0xFF);
2219            let lo_masked = _mm_and_si128(a.into(), mask);
2220            let hi_masked = _mm_and_si128(b.into(), mask);
2221            let result = _mm_packus_epi16(lo_masked, hi_masked);
2222            result.simd_into(self)
2223        }
2224    }
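    // Masking before the pack makes this a truncating narrow (values above 255
    // wrap to their low byte, like a scalar `as u8` cast) rather than a
    // saturating one.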
2225    #[inline(always)]
2226    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
2227        let (a0, a1) = self.split_u16x16(a);
2228        self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1))
2229    }
2230    #[inline(always)]
2231    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
2232        let (a0, a1) = self.split_u16x16(a);
2233        self.combine_u32x4(
2234            self.reinterpret_u32_u16x8(a0),
2235            self.reinterpret_u32_u16x8(a1),
2236        )
2237    }
2238    #[inline(always)]
2239    fn splat_mask16x16(self, a: i16) -> mask16x16<Self> {
2240        let half = self.splat_mask16x8(a);
2241        self.combine_mask16x8(half, half)
2242    }
2243    #[inline(always)]
2244    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
2245        let (a0, a1) = self.split_mask16x16(a);
2246        self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1))
2247    }
2248    #[inline(always)]
2249    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2250        let (a0, a1) = self.split_mask16x16(a);
2251        let (b0, b1) = self.split_mask16x16(b);
2252        self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1))
2253    }
2254    #[inline(always)]
2255    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2256        let (a0, a1) = self.split_mask16x16(a);
2257        let (b0, b1) = self.split_mask16x16(b);
2258        self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1))
2259    }
2260    #[inline(always)]
2261    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2262        let (a0, a1) = self.split_mask16x16(a);
2263        let (b0, b1) = self.split_mask16x16(b);
2264        self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1))
2265    }
2266    #[inline(always)]
2267    fn select_mask16x16(
2268        self,
2269        a: mask16x16<Self>,
2270        b: mask16x16<Self>,
2271        c: mask16x16<Self>,
2272    ) -> mask16x16<Self> {
2273        let (a0, a1) = self.split_mask16x16(a);
2274        let (b0, b1) = self.split_mask16x16(b);
2275        let (c0, c1) = self.split_mask16x16(c);
2276        self.combine_mask16x8(
2277            self.select_mask16x8(a0, b0, c0),
2278            self.select_mask16x8(a1, b1, c1),
2279        )
2280    }
2281    #[inline(always)]
2282    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2283        let (a0, a1) = self.split_mask16x16(a);
2284        let (b0, b1) = self.split_mask16x16(b);
2285        self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1))
2286    }
2287    #[inline(always)]
2288    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
2289        let mut result = [0; 32usize];
2290        result[0..16usize].copy_from_slice(&a.val);
2291        result[16usize..32usize].copy_from_slice(&b.val);
2292        result.simd_into(self)
2293    }
2294    #[inline(always)]
2295    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
2296        let mut b0 = [0; 8usize];
2297        let mut b1 = [0; 8usize];
2298        b0.copy_from_slice(&a.val[0..8usize]);
2299        b1.copy_from_slice(&a.val[8usize..16usize]);
2300        (b0.simd_into(self), b1.simd_into(self))
2301    }
2302    #[inline(always)]
2303    fn splat_i32x8(self, a: i32) -> i32x8<Self> {
2304        let half = self.splat_i32x4(a);
2305        self.combine_i32x4(half, half)
2306    }
2307    #[inline(always)]
2308    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
2309        let (a0, a1) = self.split_i32x8(a);
2310        self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1))
2311    }
2312    #[inline(always)]
2313    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2314        let (a0, a1) = self.split_i32x8(a);
2315        let (b0, b1) = self.split_i32x8(b);
2316        self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1))
2317    }
2318    #[inline(always)]
2319    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2320        let (a0, a1) = self.split_i32x8(a);
2321        let (b0, b1) = self.split_i32x8(b);
2322        self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1))
2323    }
2324    #[inline(always)]
2325    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2326        let (a0, a1) = self.split_i32x8(a);
2327        let (b0, b1) = self.split_i32x8(b);
2328        self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1))
2329    }
2330    #[inline(always)]
2331    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2332        let (a0, a1) = self.split_i32x8(a);
2333        let (b0, b1) = self.split_i32x8(b);
2334        self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1))
2335    }
2336    #[inline(always)]
2337    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2338        let (a0, a1) = self.split_i32x8(a);
2339        let (b0, b1) = self.split_i32x8(b);
2340        self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1))
2341    }
2342    #[inline(always)]
2343    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2344        let (a0, a1) = self.split_i32x8(a);
2345        let (b0, b1) = self.split_i32x8(b);
2346        self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1))
2347    }
2348    #[inline(always)]
2349    fn shr_i32x8(self, a: i32x8<Self>, b: u32) -> i32x8<Self> {
2350        let (a0, a1) = self.split_i32x8(a);
2351        self.combine_i32x4(self.shr_i32x4(a0, b), self.shr_i32x4(a1, b))
2352    }
2353    #[inline(always)]
2354    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2355        let (a0, a1) = self.split_i32x8(a);
2356        let (b0, b1) = self.split_i32x8(b);
2357        self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1))
2358    }
2359    #[inline(always)]
2360    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2361        let (a0, a1) = self.split_i32x8(a);
2362        let (b0, b1) = self.split_i32x8(b);
2363        self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1))
2364    }
2365    #[inline(always)]
2366    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2367        let (a0, a1) = self.split_i32x8(a);
2368        let (b0, b1) = self.split_i32x8(b);
2369        self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1))
2370    }
2371    #[inline(always)]
2372    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2373        let (a0, a1) = self.split_i32x8(a);
2374        let (b0, b1) = self.split_i32x8(b);
2375        self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1))
2376    }
2377    #[inline(always)]
2378    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2379        let (a0, a1) = self.split_i32x8(a);
2380        let (b0, b1) = self.split_i32x8(b);
2381        self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1))
2382    }
2383    #[inline(always)]
2384    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2385        let (a0, _) = self.split_i32x8(a);
2386        let (b0, _) = self.split_i32x8(b);
2387        self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0))
2388    }
2389    #[inline(always)]
2390    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2391        let (_, a1) = self.split_i32x8(a);
2392        let (_, b1) = self.split_i32x8(b);
2393        self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1))
2394    }
2395    #[inline(always)]
2396    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2397        let (a0, a1) = self.split_i32x8(a);
2398        let (b0, b1) = self.split_i32x8(b);
2399        self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1))
2400    }
2401    #[inline(always)]
2402    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2403        let (a0, a1) = self.split_i32x8(a);
2404        let (b0, b1) = self.split_i32x8(b);
2405        self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1))
2406    }
2407    #[inline(always)]
2408    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
2409        let (a0, a1) = self.split_mask32x8(a);
2410        let (b0, b1) = self.split_i32x8(b);
2411        let (c0, c1) = self.split_i32x8(c);
2412        self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1))
2413    }
2414    #[inline(always)]
2415    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2416        let (a0, a1) = self.split_i32x8(a);
2417        let (b0, b1) = self.split_i32x8(b);
2418        self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1))
2419    }
2420    #[inline(always)]
2421    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2422        let (a0, a1) = self.split_i32x8(a);
2423        let (b0, b1) = self.split_i32x8(b);
2424        self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1))
2425    }
2426    #[inline(always)]
2427    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
2428        let mut result = [0; 16usize];
2429        result[0..8usize].copy_from_slice(&a.val);
2430        result[8usize..16usize].copy_from_slice(&b.val);
2431        result.simd_into(self)
2432    }
2433    #[inline(always)]
2434    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
2435        let mut b0 = [0; 4usize];
2436        let mut b1 = [0; 4usize];
2437        b0.copy_from_slice(&a.val[0..4usize]);
2438        b1.copy_from_slice(&a.val[4usize..8usize]);
2439        (b0.simd_into(self), b1.simd_into(self))
2440    }
2441    #[inline(always)]
2442    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
2443        let (a0, a1) = self.split_i32x8(a);
2444        self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1))
2445    }
2446    #[inline(always)]
2447    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
2448        let (a0, a1) = self.split_i32x8(a);
2449        self.combine_u32x4(
2450            self.reinterpret_u32_i32x4(a0),
2451            self.reinterpret_u32_i32x4(a1),
2452        )
2453    }
2454    #[inline(always)]
2455    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
2456        let (a0, a1) = self.split_i32x8(a);
2457        self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1))
2458    }
2459    #[inline(always)]
2460    fn splat_u32x8(self, a: u32) -> u32x8<Self> {
2461        let half = self.splat_u32x4(a);
2462        self.combine_u32x4(half, half)
2463    }
2464    #[inline(always)]
2465    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
2466        let (a0, a1) = self.split_u32x8(a);
2467        self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1))
2468    }
2469    #[inline(always)]
2470    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2471        let (a0, a1) = self.split_u32x8(a);
2472        let (b0, b1) = self.split_u32x8(b);
2473        self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1))
2474    }
2475    #[inline(always)]
2476    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2477        let (a0, a1) = self.split_u32x8(a);
2478        let (b0, b1) = self.split_u32x8(b);
2479        self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1))
2480    }
2481    #[inline(always)]
2482    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2483        let (a0, a1) = self.split_u32x8(a);
2484        let (b0, b1) = self.split_u32x8(b);
2485        self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1))
2486    }
2487    #[inline(always)]
2488    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2489        let (a0, a1) = self.split_u32x8(a);
2490        let (b0, b1) = self.split_u32x8(b);
2491        self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1))
2492    }
2493    #[inline(always)]
2494    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2495        let (a0, a1) = self.split_u32x8(a);
2496        let (b0, b1) = self.split_u32x8(b);
2497        self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1))
2498    }
2499    #[inline(always)]
2500    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2501        let (a0, a1) = self.split_u32x8(a);
2502        let (b0, b1) = self.split_u32x8(b);
2503        self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1))
2504    }
2505    #[inline(always)]
2506    fn shr_u32x8(self, a: u32x8<Self>, b: u32) -> u32x8<Self> {
2507        let (a0, a1) = self.split_u32x8(a);
2508        self.combine_u32x4(self.shr_u32x4(a0, b), self.shr_u32x4(a1, b))
2509    }
2510    #[inline(always)]
2511    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2512        let (a0, a1) = self.split_u32x8(a);
2513        let (b0, b1) = self.split_u32x8(b);
2514        self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1))
2515    }
2516    #[inline(always)]
2517    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2518        let (a0, a1) = self.split_u32x8(a);
2519        let (b0, b1) = self.split_u32x8(b);
2520        self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1))
2521    }
2522    #[inline(always)]
2523    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2524        let (a0, a1) = self.split_u32x8(a);
2525        let (b0, b1) = self.split_u32x8(b);
2526        self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1))
2527    }
2528    #[inline(always)]
2529    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2530        let (a0, a1) = self.split_u32x8(a);
2531        let (b0, b1) = self.split_u32x8(b);
2532        self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1))
2533    }
2534    #[inline(always)]
2535    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2536        let (a0, a1) = self.split_u32x8(a);
2537        let (b0, b1) = self.split_u32x8(b);
2538        self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1))
2539    }
2540    #[inline(always)]
2541    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2542        let (a0, _) = self.split_u32x8(a);
2543        let (b0, _) = self.split_u32x8(b);
2544        self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0))
2545    }
2546    #[inline(always)]
2547    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2548        let (_, a1) = self.split_u32x8(a);
2549        let (_, b1) = self.split_u32x8(b);
2550        self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1))
2551    }
2552    #[inline(always)]
2553    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2554        let (a0, a1) = self.split_u32x8(a);
2555        let (b0, b1) = self.split_u32x8(b);
2556        self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1))
2557    }
2558    #[inline(always)]
2559    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2560        let (a0, a1) = self.split_u32x8(a);
2561        let (b0, b1) = self.split_u32x8(b);
2562        self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1))
2563    }
2564    #[inline(always)]
2565    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
2566        let (a0, a1) = self.split_mask32x8(a);
2567        let (b0, b1) = self.split_u32x8(b);
2568        let (c0, c1) = self.split_u32x8(c);
2569        self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1))
2570    }
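    // Illustrative sketch (not part of the generated code): a comparison mask
    // driving `select_u32x8`, assuming the usual convention that
    // `select(mask, b, c)` takes lanes from `b` where the mask is all-ones.
    //
    //     let s = unsafe { Sse4_2::new_unchecked() }; // requires SSE4.2
    //     let x = s.splat_u32x8(3);
    //     let y = s.splat_u32x8(5);
    //     let m = s.simd_lt_u32x8(x, y);     // all-ones lanes where x < y
    //     let lo = s.select_u32x8(m, x, y);  // lane-wise minimum of x and y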
2571    #[inline(always)]
2572    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2573        let (a0, a1) = self.split_u32x8(a);
2574        let (b0, b1) = self.split_u32x8(b);
2575        self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1))
2576    }
2577    #[inline(always)]
2578    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2579        let (a0, a1) = self.split_u32x8(a);
2580        let (b0, b1) = self.split_u32x8(b);
2581        self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1))
2582    }
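    // Vectors wider than the native 128-bit registers are backed by plain
    // arrays (`val`), so combining and splitting them is just copying halves;
    // the actual SIMD work happens on the 128-bit pieces above.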
2583    #[inline(always)]
2584    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
2585        let mut result = [0; 16usize];
2586        result[0..8usize].copy_from_slice(&a.val);
2587        result[8usize..16usize].copy_from_slice(&b.val);
2588        result.simd_into(self)
2589    }
2590    #[inline(always)]
2591    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
2592        let mut b0 = [0; 4usize];
2593        let mut b1 = [0; 4usize];
2594        b0.copy_from_slice(&a.val[0..4usize]);
2595        b1.copy_from_slice(&a.val[4usize..8usize]);
2596        (b0.simd_into(self), b1.simd_into(self))
2597    }
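    // Illustrative sketch (hypothetical usage, not generated code): splitting
    // and recombining the same halves reproduces the original vector.
    //
    //     let s = unsafe { Sse4_2::new_unchecked() }; // requires SSE4.2
    //     let v = s.splat_u32x8(7);
    //     let (lo, hi) = s.split_u32x8(v);
    //     let w = s.combine_u32x4(lo, hi);   // same lanes as `v`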
2598    #[inline(always)]
2599    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
2600        let (a0, a1) = self.split_u32x8(a);
2601        self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1))
2602    }
2603    #[inline(always)]
2604    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
2605        let (a0, a1) = self.split_u32x8(a);
2606        self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1))
2607    }
2608    #[inline(always)]
2609    fn splat_mask32x8(self, a: i32) -> mask32x8<Self> {
2610        let half = self.splat_mask32x4(a);
2611        self.combine_mask32x4(half, half)
2612    }
2613    #[inline(always)]
2614    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
2615        let (a0, a1) = self.split_mask32x8(a);
2616        self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1))
2617    }
2618    #[inline(always)]
2619    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2620        let (a0, a1) = self.split_mask32x8(a);
2621        let (b0, b1) = self.split_mask32x8(b);
2622        self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1))
2623    }
2624    #[inline(always)]
2625    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2626        let (a0, a1) = self.split_mask32x8(a);
2627        let (b0, b1) = self.split_mask32x8(b);
2628        self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1))
2629    }
2630    #[inline(always)]
2631    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2632        let (a0, a1) = self.split_mask32x8(a);
2633        let (b0, b1) = self.split_mask32x8(b);
2634        self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1))
2635    }
2636    #[inline(always)]
2637    fn select_mask32x8(
2638        self,
2639        a: mask32x8<Self>,
2640        b: mask32x8<Self>,
2641        c: mask32x8<Self>,
2642    ) -> mask32x8<Self> {
2643        let (a0, a1) = self.split_mask32x8(a);
2644        let (b0, b1) = self.split_mask32x8(b);
2645        let (c0, c1) = self.split_mask32x8(c);
2646        self.combine_mask32x4(
2647            self.select_mask32x4(a0, b0, c0),
2648            self.select_mask32x4(a1, b1, c1),
2649        )
2650    }
2651    #[inline(always)]
2652    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2653        let (a0, a1) = self.split_mask32x8(a);
2654        let (b0, b1) = self.split_mask32x8(b);
2655        self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1))
2656    }
2657    #[inline(always)]
2658    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
2659        let mut result = [0; 16usize];
2660        result[0..8usize].copy_from_slice(&a.val);
2661        result[8usize..16usize].copy_from_slice(&b.val);
2662        result.simd_into(self)
2663    }
2664    #[inline(always)]
2665    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
2666        let mut b0 = [0; 4usize];
2667        let mut b1 = [0; 4usize];
2668        b0.copy_from_slice(&a.val[0..4usize]);
2669        b1.copy_from_slice(&a.val[4usize..8usize]);
2670        (b0.simd_into(self), b1.simd_into(self))
2671    }
2672    #[inline(always)]
2673    fn splat_f64x4(self, a: f64) -> f64x4<Self> {
2674        let half = self.splat_f64x2(a);
2675        self.combine_f64x2(half, half)
2676    }
2677    #[inline(always)]
2678    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2679        let (a0, a1) = self.split_f64x4(a);
2680        self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1))
2681    }
2682    #[inline(always)]
2683    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2684        let (a0, a1) = self.split_f64x4(a);
2685        self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1))
2686    }
2687    #[inline(always)]
2688    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2689        let (a0, a1) = self.split_f64x4(a);
2690        self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
2691    }
2692    #[inline(always)]
2693    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2694        let (a0, a1) = self.split_f64x4(a);
2695        let (b0, b1) = self.split_f64x4(b);
2696        self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1))
2697    }
2698    #[inline(always)]
2699    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2700        let (a0, a1) = self.split_f64x4(a);
2701        let (b0, b1) = self.split_f64x4(b);
2702        self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1))
2703    }
2704    #[inline(always)]
2705    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2706        let (a0, a1) = self.split_f64x4(a);
2707        let (b0, b1) = self.split_f64x4(b);
2708        self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1))
2709    }
2710    #[inline(always)]
2711    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2712        let (a0, a1) = self.split_f64x4(a);
2713        let (b0, b1) = self.split_f64x4(b);
2714        self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1))
2715    }
2716    #[inline(always)]
2717    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2718        let (a0, a1) = self.split_f64x4(a);
2719        let (b0, b1) = self.split_f64x4(b);
2720        self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1))
2721    }
2722    #[inline(always)]
2723    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2724        let (a0, a1) = self.split_f64x4(a);
2725        let (b0, b1) = self.split_f64x4(b);
2726        self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1))
2727    }
2728    #[inline(always)]
2729    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2730        let (a0, a1) = self.split_f64x4(a);
2731        let (b0, b1) = self.split_f64x4(b);
2732        self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1))
2733    }
2734    #[inline(always)]
2735    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2736        let (a0, a1) = self.split_f64x4(a);
2737        let (b0, b1) = self.split_f64x4(b);
2738        self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1))
2739    }
2740    #[inline(always)]
2741    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2742        let (a0, a1) = self.split_f64x4(a);
2743        let (b0, b1) = self.split_f64x4(b);
2744        self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1))
2745    }
2746    #[inline(always)]
2747    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2748        let (a0, a1) = self.split_f64x4(a);
2749        let (b0, b1) = self.split_f64x4(b);
2750        self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1))
2751    }
2752    #[inline(always)]
2753    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2754        let (a0, _) = self.split_f64x4(a);
2755        let (b0, _) = self.split_f64x4(b);
2756        self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0))
2757    }
2758    #[inline(always)]
2759    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2760        let (_, a1) = self.split_f64x4(a);
2761        let (_, b1) = self.split_f64x4(b);
2762        self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1))
2763    }
2764    #[inline(always)]
2765    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2766        let (a0, a1) = self.split_f64x4(a);
2767        let (b0, b1) = self.split_f64x4(b);
2768        self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1))
2769    }
2770    #[inline(always)]
2771    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2772        let (a0, a1) = self.split_f64x4(a);
2773        let (b0, b1) = self.split_f64x4(b);
2774        self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1))
2775    }
2776    #[inline(always)]
2777    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2778        let (a0, a1) = self.split_f64x4(a);
2779        let (b0, b1) = self.split_f64x4(b);
2780        self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1))
2781    }
2782    #[inline(always)]
2783    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2784        let (a0, a1) = self.split_f64x4(a);
2785        let (b0, b1) = self.split_f64x4(b);
2786        self.combine_f64x2(
2787            self.max_precise_f64x2(a0, b0),
2788            self.max_precise_f64x2(a1, b1),
2789        )
2790    }
2791    #[inline(always)]
2792    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2793        let (a0, a1) = self.split_f64x4(a);
2794        let (b0, b1) = self.split_f64x4(b);
2795        self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1))
2796    }
2797    #[inline(always)]
2798    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2799        let (a0, a1) = self.split_f64x4(a);
2800        let (b0, b1) = self.split_f64x4(b);
2801        self.combine_f64x2(
2802            self.min_precise_f64x2(a0, b0),
2803            self.min_precise_f64x2(a1, b1),
2804        )
2805    }
2806    #[inline(always)]
2807    fn madd_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
2808        let (a0, a1) = self.split_f64x4(a);
2809        let (b0, b1) = self.split_f64x4(b);
2810        let (c0, c1) = self.split_f64x4(c);
2811        self.combine_f64x2(self.madd_f64x2(a0, b0, c0), self.madd_f64x2(a1, b1, c1))
2812    }
2813    #[inline(always)]
2814    fn msub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
2815        let (a0, a1) = self.split_f64x4(a);
2816        let (b0, b1) = self.split_f64x4(b);
2817        let (c0, c1) = self.split_f64x4(c);
2818        self.combine_f64x2(self.msub_f64x2(a0, b0, c0), self.msub_f64x2(a1, b1, c1))
2819    }
2820    #[inline(always)]
2821    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2822        let (a0, a1) = self.split_f64x4(a);
2823        self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
2824    }
2825    #[inline(always)]
2826    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2827        let (a0, a1) = self.split_f64x4(a);
2828        self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1))
2829    }
2830    #[inline(always)]
2831    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2832        let (a0, a1) = self.split_f64x4(a);
2833        self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1))
2834    }
2835    #[inline(always)]
2836    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
2837        let (a0, a1) = self.split_mask64x4(a);
2838        let (b0, b1) = self.split_f64x4(b);
2839        let (c0, c1) = self.split_f64x4(c);
2840        self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1))
2841    }
2842    #[inline(always)]
2843    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
2844        let mut result = [0.0; 8usize];
2845        result[0..4usize].copy_from_slice(&a.val);
2846        result[4usize..8usize].copy_from_slice(&b.val);
2847        result.simd_into(self)
2848    }
2849    #[inline(always)]
2850    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
2851        let mut b0 = [0.0; 2usize];
2852        let mut b1 = [0.0; 2usize];
2853        b0.copy_from_slice(&a.val[0..2usize]);
2854        b1.copy_from_slice(&a.val[2usize..4usize]);
2855        (b0.simd_into(self), b1.simd_into(self))
2856    }
2857    #[inline(always)]
2858    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
2859        let (a0, a1) = self.split_f64x4(a);
2860        self.combine_f32x4(
2861            self.reinterpret_f32_f64x2(a0),
2862            self.reinterpret_f32_f64x2(a1),
2863        )
2864    }
2865    #[inline(always)]
2866    fn splat_mask64x4(self, a: i64) -> mask64x4<Self> {
2867        let half = self.splat_mask64x2(a);
2868        self.combine_mask64x2(half, half)
2869    }
2870    #[inline(always)]
2871    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
2872        let (a0, a1) = self.split_mask64x4(a);
2873        self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1))
2874    }
2875    #[inline(always)]
2876    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
2877        let (a0, a1) = self.split_mask64x4(a);
2878        let (b0, b1) = self.split_mask64x4(b);
2879        self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1))
2880    }
2881    #[inline(always)]
2882    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
2883        let (a0, a1) = self.split_mask64x4(a);
2884        let (b0, b1) = self.split_mask64x4(b);
2885        self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1))
2886    }
2887    #[inline(always)]
2888    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
2889        let (a0, a1) = self.split_mask64x4(a);
2890        let (b0, b1) = self.split_mask64x4(b);
2891        self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1))
2892    }
2893    #[inline(always)]
2894    fn select_mask64x4(
2895        self,
2896        a: mask64x4<Self>,
2897        b: mask64x4<Self>,
2898        c: mask64x4<Self>,
2899    ) -> mask64x4<Self> {
2900        let (a0, a1) = self.split_mask64x4(a);
2901        let (b0, b1) = self.split_mask64x4(b);
2902        let (c0, c1) = self.split_mask64x4(c);
2903        self.combine_mask64x2(
2904            self.select_mask64x2(a0, b0, c0),
2905            self.select_mask64x2(a1, b1, c1),
2906        )
2907    }
2908    #[inline(always)]
2909    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
2910        let (a0, a1) = self.split_mask64x4(a);
2911        let (b0, b1) = self.split_mask64x4(b);
2912        self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1))
2913    }
2914    #[inline(always)]
2915    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
2916        let mut result = [0; 8usize];
2917        result[0..4usize].copy_from_slice(&a.val);
2918        result[4usize..8usize].copy_from_slice(&b.val);
2919        result.simd_into(self)
2920    }
2921    #[inline(always)]
2922    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
2923        let mut b0 = [0; 2usize];
2924        let mut b1 = [0; 2usize];
2925        b0.copy_from_slice(&a.val[0..2usize]);
2926        b1.copy_from_slice(&a.val[2usize..4usize]);
2927        (b0.simd_into(self), b1.simd_into(self))
2928    }
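    // The 512-bit-wide types (f32x16, i8x64, u16x32, ...) are handled the same
    // way one level up: each operation splits into two of the already emulated
    // 256-bit halves, so every lane still reaches a native 128-bit operation
    // after two rounds of splitting.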
2929    #[inline(always)]
2930    fn splat_f32x16(self, a: f32) -> f32x16<Self> {
2931        let half = self.splat_f32x8(a);
2932        self.combine_f32x8(half, half)
2933    }
2934    #[inline(always)]
2935    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
2936        let (a0, a1) = self.split_f32x16(a);
2937        self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
2938    }
2939    #[inline(always)]
2940    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
2941        let (a0, a1) = self.split_f32x16(a);
2942        self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
2943    }
2944    #[inline(always)]
2945    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
2946        let (a0, a1) = self.split_f32x16(a);
2947        self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
2948    }
2949    #[inline(always)]
2950    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
2951        let (a0, a1) = self.split_f32x16(a);
2952        let (b0, b1) = self.split_f32x16(b);
2953        self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
2954    }
2955    #[inline(always)]
2956    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
2957        let (a0, a1) = self.split_f32x16(a);
2958        let (b0, b1) = self.split_f32x16(b);
2959        self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
2960    }
2961    #[inline(always)]
2962    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
2963        let (a0, a1) = self.split_f32x16(a);
2964        let (b0, b1) = self.split_f32x16(b);
2965        self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
2966    }
2967    #[inline(always)]
2968    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
2969        let (a0, a1) = self.split_f32x16(a);
2970        let (b0, b1) = self.split_f32x16(b);
2971        self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
2972    }
2973    #[inline(always)]
2974    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
2975        let (a0, a1) = self.split_f32x16(a);
2976        let (b0, b1) = self.split_f32x16(b);
2977        self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
2978    }
2979    #[inline(always)]
2980    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
2981        let (a0, a1) = self.split_f32x16(a);
2982        let (b0, b1) = self.split_f32x16(b);
2983        self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
2984    }
2985    #[inline(always)]
2986    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
2987        let (a0, a1) = self.split_f32x16(a);
2988        let (b0, b1) = self.split_f32x16(b);
2989        self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
2990    }
2991    #[inline(always)]
2992    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
2993        let (a0, a1) = self.split_f32x16(a);
2994        let (b0, b1) = self.split_f32x16(b);
2995        self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
2996    }
2997    #[inline(always)]
2998    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
2999        let (a0, a1) = self.split_f32x16(a);
3000        let (b0, b1) = self.split_f32x16(b);
3001        self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
3002    }
3003    #[inline(always)]
3004    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
3005        let (a0, a1) = self.split_f32x16(a);
3006        let (b0, b1) = self.split_f32x16(b);
3007        self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
3008    }
3009    #[inline(always)]
3010    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3011        let (a0, _) = self.split_f32x16(a);
3012        let (b0, _) = self.split_f32x16(b);
3013        self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
3014    }
3015    #[inline(always)]
3016    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3017        let (_, a1) = self.split_f32x16(a);
3018        let (_, b1) = self.split_f32x16(b);
3019        self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
3020    }
3021    #[inline(always)]
3022    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3023        let (a0, a1) = self.split_f32x16(a);
3024        let (b0, b1) = self.split_f32x16(b);
3025        self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
3026    }
3027    #[inline(always)]
3028    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3029        let (a0, a1) = self.split_f32x16(a);
3030        let (b0, b1) = self.split_f32x16(b);
3031        self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
3032    }
3033    #[inline(always)]
3034    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3035        let (a0, a1) = self.split_f32x16(a);
3036        let (b0, b1) = self.split_f32x16(b);
3037        self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
3038    }
3039    #[inline(always)]
3040    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3041        let (a0, a1) = self.split_f32x16(a);
3042        let (b0, b1) = self.split_f32x16(b);
3043        self.combine_f32x8(
3044            self.max_precise_f32x8(a0, b0),
3045            self.max_precise_f32x8(a1, b1),
3046        )
3047    }
3048    #[inline(always)]
3049    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3050        let (a0, a1) = self.split_f32x16(a);
3051        let (b0, b1) = self.split_f32x16(b);
3052        self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
3053    }
3054    #[inline(always)]
3055    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3056        let (a0, a1) = self.split_f32x16(a);
3057        let (b0, b1) = self.split_f32x16(b);
3058        self.combine_f32x8(
3059            self.min_precise_f32x8(a0, b0),
3060            self.min_precise_f32x8(a1, b1),
3061        )
3062    }
3063    #[inline(always)]
3064    fn madd_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
3065        let (a0, a1) = self.split_f32x16(a);
3066        let (b0, b1) = self.split_f32x16(b);
3067        let (c0, c1) = self.split_f32x16(c);
3068        self.combine_f32x8(self.madd_f32x8(a0, b0, c0), self.madd_f32x8(a1, b1, c1))
3069    }
3070    #[inline(always)]
3071    fn msub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
3072        let (a0, a1) = self.split_f32x16(a);
3073        let (b0, b1) = self.split_f32x16(b);
3074        let (c0, c1) = self.split_f32x16(c);
3075        self.combine_f32x8(self.msub_f32x8(a0, b0, c0), self.msub_f32x8(a1, b1, c1))
3076    }
3077    #[inline(always)]
3078    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3079        let (a0, a1) = self.split_f32x16(a);
3080        self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
3081    }
3082    #[inline(always)]
3083    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3084        let (a0, a1) = self.split_f32x16(a);
3085        self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
3086    }
3087    #[inline(always)]
3088    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3089        let (a0, a1) = self.split_f32x16(a);
3090        self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
3091    }
3092    #[inline(always)]
3093    fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
3094        let (a0, a1) = self.split_mask32x16(a);
3095        let (b0, b1) = self.split_f32x16(b);
3096        let (c0, c1) = self.split_f32x16(c);
3097        self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
3098    }
3099    #[inline(always)]
3100    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
3101        let mut b0 = [0.0; 8usize];
3102        let mut b1 = [0.0; 8usize];
3103        b0.copy_from_slice(&a.val[0..8usize]);
3104        b1.copy_from_slice(&a.val[8usize..16usize]);
3105        (b0.simd_into(self), b1.simd_into(self))
3106    }
3107    #[inline(always)]
3108    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
3109        let (a0, a1) = self.split_f32x16(a);
3110        self.combine_f64x4(
3111            self.reinterpret_f64_f32x8(a0),
3112            self.reinterpret_f64_f32x8(a1),
3113        )
3114    }
3115    #[inline(always)]
3116    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
3117        let (a0, a1) = self.split_f32x16(a);
3118        self.combine_i32x8(
3119            self.reinterpret_i32_f32x8(a0),
3120            self.reinterpret_i32_f32x8(a1),
3121        )
3122    }
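    // There is no dedicated SSE4.2 path for the interleaved 128-bit loads and
    // stores here; both delegate to the scalar `crate::Fallback` level,
    // converting the vectors between the two levels via their underlying
    // arrays.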
3123    #[inline(always)]
3124    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
3125        crate::Fallback::new()
3126            .load_interleaved_128_f32x16(src)
3127            .val
3128            .simd_into(self)
3129    }
3130    #[inline(always)]
3131    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) {
3132        let fb = crate::Fallback::new();
3133        fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest);
3134    }
3135    #[inline(always)]
3136    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
3137        let (a0, a1) = self.split_f32x16(a);
3138        self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
3139    }
3140    #[inline(always)]
3141    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
3142        let (a0, a1) = self.split_f32x16(a);
3143        self.combine_u32x8(
3144            self.reinterpret_u32_f32x8(a0),
3145            self.reinterpret_u32_f32x8(a1),
3146        )
3147    }
3148    #[inline(always)]
3149    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
3150        let (a0, a1) = self.split_f32x16(a);
3151        self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
3152    }
3153    #[inline(always)]
3154    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
3155        let (a0, a1) = self.split_f32x16(a);
3156        self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
3157    }
3158    #[inline(always)]
3159    fn splat_i8x64(self, a: i8) -> i8x64<Self> {
3160        let half = self.splat_i8x32(a);
3161        self.combine_i8x32(half, half)
3162    }
3163    #[inline(always)]
3164    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
3165        let (a0, a1) = self.split_i8x64(a);
3166        self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
3167    }
3168    #[inline(always)]
3169    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3170        let (a0, a1) = self.split_i8x64(a);
3171        let (b0, b1) = self.split_i8x64(b);
3172        self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
3173    }
3174    #[inline(always)]
3175    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3176        let (a0, a1) = self.split_i8x64(a);
3177        let (b0, b1) = self.split_i8x64(b);
3178        self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
3179    }
3180    #[inline(always)]
3181    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3182        let (a0, a1) = self.split_i8x64(a);
3183        let (b0, b1) = self.split_i8x64(b);
3184        self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
3185    }
3186    #[inline(always)]
3187    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3188        let (a0, a1) = self.split_i8x64(a);
3189        let (b0, b1) = self.split_i8x64(b);
3190        self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
3191    }
3192    #[inline(always)]
3193    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3194        let (a0, a1) = self.split_i8x64(a);
3195        let (b0, b1) = self.split_i8x64(b);
3196        self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
3197    }
3198    #[inline(always)]
3199    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3200        let (a0, a1) = self.split_i8x64(a);
3201        let (b0, b1) = self.split_i8x64(b);
3202        self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
3203    }
3204    #[inline(always)]
3205    fn shr_i8x64(self, a: i8x64<Self>, b: u32) -> i8x64<Self> {
3206        let (a0, a1) = self.split_i8x64(a);
3207        self.combine_i8x32(self.shr_i8x32(a0, b), self.shr_i8x32(a1, b))
3208    }
3209    #[inline(always)]
3210    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3211        let (a0, a1) = self.split_i8x64(a);
3212        let (b0, b1) = self.split_i8x64(b);
3213        self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
3214    }
3215    #[inline(always)]
3216    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3217        let (a0, a1) = self.split_i8x64(a);
3218        let (b0, b1) = self.split_i8x64(b);
3219        self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
3220    }
3221    #[inline(always)]
3222    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3223        let (a0, a1) = self.split_i8x64(a);
3224        let (b0, b1) = self.split_i8x64(b);
3225        self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
3226    }
3227    #[inline(always)]
3228    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3229        let (a0, a1) = self.split_i8x64(a);
3230        let (b0, b1) = self.split_i8x64(b);
3231        self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
3232    }
3233    #[inline(always)]
3234    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3235        let (a0, a1) = self.split_i8x64(a);
3236        let (b0, b1) = self.split_i8x64(b);
3237        self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
3238    }
3239    #[inline(always)]
3240    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3241        let (a0, _) = self.split_i8x64(a);
3242        let (b0, _) = self.split_i8x64(b);
3243        self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
3244    }
3245    #[inline(always)]
3246    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3247        let (_, a1) = self.split_i8x64(a);
3248        let (_, b1) = self.split_i8x64(b);
3249        self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
3250    }
3251    #[inline(always)]
3252    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3253        let (a0, a1) = self.split_i8x64(a);
3254        let (b0, b1) = self.split_i8x64(b);
3255        self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
3256    }
3257    #[inline(always)]
3258    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3259        let (a0, a1) = self.split_i8x64(a);
3260        let (b0, b1) = self.split_i8x64(b);
3261        self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
3262    }
3263    #[inline(always)]
3264    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
3265        let (a0, a1) = self.split_mask8x64(a);
3266        let (b0, b1) = self.split_i8x64(b);
3267        let (c0, c1) = self.split_i8x64(c);
3268        self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
3269    }
3270    #[inline(always)]
3271    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3272        let (a0, a1) = self.split_i8x64(a);
3273        let (b0, b1) = self.split_i8x64(b);
3274        self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
3275    }
3276    #[inline(always)]
3277    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3278        let (a0, a1) = self.split_i8x64(a);
3279        let (b0, b1) = self.split_i8x64(b);
3280        self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
3281    }
3282    #[inline(always)]
3283    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
3284        let mut b0 = [0; 32usize];
3285        let mut b1 = [0; 32usize];
3286        b0.copy_from_slice(&a.val[0..32usize]);
3287        b1.copy_from_slice(&a.val[32usize..64usize]);
3288        (b0.simd_into(self), b1.simd_into(self))
3289    }
3290    #[inline(always)]
3291    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
3292        let (a0, a1) = self.split_i8x64(a);
3293        self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
3294    }
3295    #[inline(always)]
3296    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
3297        let (a0, a1) = self.split_i8x64(a);
3298        self.combine_u32x8(
3299            self.reinterpret_u32_i8x32(a0),
3300            self.reinterpret_u32_i8x32(a1),
3301        )
3302    }
3303    #[inline(always)]
3304    fn splat_u8x64(self, a: u8) -> u8x64<Self> {
3305        let half = self.splat_u8x32(a);
3306        self.combine_u8x32(half, half)
3307    }
3308    #[inline(always)]
3309    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
3310        let (a0, a1) = self.split_u8x64(a);
3311        self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
3312    }
3313    #[inline(always)]
3314    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3315        let (a0, a1) = self.split_u8x64(a);
3316        let (b0, b1) = self.split_u8x64(b);
3317        self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
3318    }
3319    #[inline(always)]
3320    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3321        let (a0, a1) = self.split_u8x64(a);
3322        let (b0, b1) = self.split_u8x64(b);
3323        self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
3324    }
3325    #[inline(always)]
3326    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3327        let (a0, a1) = self.split_u8x64(a);
3328        let (b0, b1) = self.split_u8x64(b);
3329        self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
3330    }
3331    #[inline(always)]
3332    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3333        let (a0, a1) = self.split_u8x64(a);
3334        let (b0, b1) = self.split_u8x64(b);
3335        self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
3336    }
3337    #[inline(always)]
3338    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3339        let (a0, a1) = self.split_u8x64(a);
3340        let (b0, b1) = self.split_u8x64(b);
3341        self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
3342    }
3343    #[inline(always)]
3344    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3345        let (a0, a1) = self.split_u8x64(a);
3346        let (b0, b1) = self.split_u8x64(b);
3347        self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
3348    }
3349    #[inline(always)]
3350    fn shr_u8x64(self, a: u8x64<Self>, b: u32) -> u8x64<Self> {
3351        let (a0, a1) = self.split_u8x64(a);
3352        self.combine_u8x32(self.shr_u8x32(a0, b), self.shr_u8x32(a1, b))
3353    }
3354    #[inline(always)]
3355    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3356        let (a0, a1) = self.split_u8x64(a);
3357        let (b0, b1) = self.split_u8x64(b);
3358        self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
3359    }
3360    #[inline(always)]
3361    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3362        let (a0, a1) = self.split_u8x64(a);
3363        let (b0, b1) = self.split_u8x64(b);
3364        self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
3365    }
3366    #[inline(always)]
3367    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3368        let (a0, a1) = self.split_u8x64(a);
3369        let (b0, b1) = self.split_u8x64(b);
3370        self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
3371    }
3372    #[inline(always)]
3373    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3374        let (a0, a1) = self.split_u8x64(a);
3375        let (b0, b1) = self.split_u8x64(b);
3376        self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
3377    }
3378    #[inline(always)]
3379    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3380        let (a0, a1) = self.split_u8x64(a);
3381        let (b0, b1) = self.split_u8x64(b);
3382        self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
3383    }
3384    #[inline(always)]
3385    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3386        let (a0, _) = self.split_u8x64(a);
3387        let (b0, _) = self.split_u8x64(b);
3388        self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
3389    }
3390    #[inline(always)]
3391    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3392        let (_, a1) = self.split_u8x64(a);
3393        let (_, b1) = self.split_u8x64(b);
3394        self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
3395    }
3396    #[inline(always)]
3397    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3398        let (a0, a1) = self.split_u8x64(a);
3399        let (b0, b1) = self.split_u8x64(b);
3400        self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
3401    }
3402    #[inline(always)]
3403    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3404        let (a0, a1) = self.split_u8x64(a);
3405        let (b0, b1) = self.split_u8x64(b);
3406        self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
3407    }
3408    #[inline(always)]
3409    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
3410        let (a0, a1) = self.split_mask8x64(a);
3411        let (b0, b1) = self.split_u8x64(b);
3412        let (c0, c1) = self.split_u8x64(c);
3413        self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
3414    }
3415    #[inline(always)]
3416    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3417        let (a0, a1) = self.split_u8x64(a);
3418        let (b0, b1) = self.split_u8x64(b);
3419        self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
3420    }
3421    #[inline(always)]
3422    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3423        let (a0, a1) = self.split_u8x64(a);
3424        let (b0, b1) = self.split_u8x64(b);
3425        self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
3426    }
3427    #[inline(always)]
3428    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
3429        let mut b0 = [0; 32usize];
3430        let mut b1 = [0; 32usize];
3431        b0.copy_from_slice(&a.val[0..32usize]);
3432        b1.copy_from_slice(&a.val[32usize..64usize]);
3433        (b0.simd_into(self), b1.simd_into(self))
3434    }
3435    #[inline(always)]
3436    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
3437        crate::Fallback::new()
3438            .load_interleaved_128_u8x64(src)
3439            .val
3440            .simd_into(self)
3441    }
3442    #[inline(always)]
3443    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) {
3444        let fb = crate::Fallback::new();
3445        fb.store_interleaved_128_u8x64(a.val.simd_into(fb), dest);
3446    }
3447    #[inline(always)]
3448    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
3449        let (a0, a1) = self.split_u8x64(a);
3450        self.combine_u32x8(
3451            self.reinterpret_u32_u8x32(a0),
3452            self.reinterpret_u32_u8x32(a1),
3453        )
3454    }
3455    #[inline(always)]
3456    fn splat_mask8x64(self, a: i8) -> mask8x64<Self> {
3457        let half = self.splat_mask8x32(a);
3458        self.combine_mask8x32(half, half)
3459    }
3460    #[inline(always)]
3461    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
3462        let (a0, a1) = self.split_mask8x64(a);
3463        self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
3464    }
3465    #[inline(always)]
3466    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3467        let (a0, a1) = self.split_mask8x64(a);
3468        let (b0, b1) = self.split_mask8x64(b);
3469        self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
3470    }
3471    #[inline(always)]
3472    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3473        let (a0, a1) = self.split_mask8x64(a);
3474        let (b0, b1) = self.split_mask8x64(b);
3475        self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
3476    }
3477    #[inline(always)]
3478    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3479        let (a0, a1) = self.split_mask8x64(a);
3480        let (b0, b1) = self.split_mask8x64(b);
3481        self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
3482    }
3483    #[inline(always)]
3484    fn select_mask8x64(
3485        self,
3486        a: mask8x64<Self>,
3487        b: mask8x64<Self>,
3488        c: mask8x64<Self>,
3489    ) -> mask8x64<Self> {
3490        let (a0, a1) = self.split_mask8x64(a);
3491        let (b0, b1) = self.split_mask8x64(b);
3492        let (c0, c1) = self.split_mask8x64(c);
3493        self.combine_mask8x32(
3494            self.select_mask8x32(a0, b0, c0),
3495            self.select_mask8x32(a1, b1, c1),
3496        )
3497    }
3498    #[inline(always)]
3499    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3500        let (a0, a1) = self.split_mask8x64(a);
3501        let (b0, b1) = self.split_mask8x64(b);
3502        self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
3503    }
3504    #[inline(always)]
3505    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
3506        let mut b0 = [0; 32usize];
3507        let mut b1 = [0; 32usize];
3508        b0.copy_from_slice(&a.val[0..32usize]);
3509        b1.copy_from_slice(&a.val[32usize..64usize]);
3510        (b0.simd_into(self), b1.simd_into(self))
3511    }
3512    #[inline(always)]
3513    fn splat_i16x32(self, a: i16) -> i16x32<Self> {
3514        let half = self.splat_i16x16(a);
3515        self.combine_i16x16(half, half)
3516    }
3517    #[inline(always)]
3518    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
3519        let (a0, a1) = self.split_i16x32(a);
3520        self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
3521    }
3522    #[inline(always)]
3523    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3524        let (a0, a1) = self.split_i16x32(a);
3525        let (b0, b1) = self.split_i16x32(b);
3526        self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
3527    }
3528    #[inline(always)]
3529    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3530        let (a0, a1) = self.split_i16x32(a);
3531        let (b0, b1) = self.split_i16x32(b);
3532        self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
3533    }
3534    #[inline(always)]
3535    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3536        let (a0, a1) = self.split_i16x32(a);
3537        let (b0, b1) = self.split_i16x32(b);
3538        self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
3539    }
3540    #[inline(always)]
3541    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3542        let (a0, a1) = self.split_i16x32(a);
3543        let (b0, b1) = self.split_i16x32(b);
3544        self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
3545    }
3546    #[inline(always)]
3547    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3548        let (a0, a1) = self.split_i16x32(a);
3549        let (b0, b1) = self.split_i16x32(b);
3550        self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
3551    }
3552    #[inline(always)]
3553    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3554        let (a0, a1) = self.split_i16x32(a);
3555        let (b0, b1) = self.split_i16x32(b);
3556        self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
3557    }
3558    #[inline(always)]
3559    fn shr_i16x32(self, a: i16x32<Self>, b: u32) -> i16x32<Self> {
3560        let (a0, a1) = self.split_i16x32(a);
3561        self.combine_i16x16(self.shr_i16x16(a0, b), self.shr_i16x16(a1, b))
3562    }
3563    #[inline(always)]
3564    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3565        let (a0, a1) = self.split_i16x32(a);
3566        let (b0, b1) = self.split_i16x32(b);
3567        self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
3568    }
3569    #[inline(always)]
3570    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3571        let (a0, a1) = self.split_i16x32(a);
3572        let (b0, b1) = self.split_i16x32(b);
3573        self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
3574    }
3575    #[inline(always)]
3576    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3577        let (a0, a1) = self.split_i16x32(a);
3578        let (b0, b1) = self.split_i16x32(b);
3579        self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
3580    }
3581    #[inline(always)]
3582    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3583        let (a0, a1) = self.split_i16x32(a);
3584        let (b0, b1) = self.split_i16x32(b);
3585        self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
3586    }
3587    #[inline(always)]
3588    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3589        let (a0, a1) = self.split_i16x32(a);
3590        let (b0, b1) = self.split_i16x32(b);
3591        self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
3592    }
3593    #[inline(always)]
3594    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3595        let (a0, _) = self.split_i16x32(a);
3596        let (b0, _) = self.split_i16x32(b);
3597        self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
3598    }
3599    #[inline(always)]
3600    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3601        let (_, a1) = self.split_i16x32(a);
3602        let (_, b1) = self.split_i16x32(b);
3603        self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
3604    }
3605    #[inline(always)]
3606    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3607        let (a0, a1) = self.split_i16x32(a);
3608        let (b0, b1) = self.split_i16x32(b);
3609        self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
3610    }
3611    #[inline(always)]
3612    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3613        let (a0, a1) = self.split_i16x32(a);
3614        let (b0, b1) = self.split_i16x32(b);
3615        self.combine_i16x16(
3616            self.unzip_high_i16x16(a0, a1),
3617            self.unzip_high_i16x16(b0, b1),
3618        )
3619    }
3620    #[inline(always)]
3621    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
3622        let (a0, a1) = self.split_mask16x32(a);
3623        let (b0, b1) = self.split_i16x32(b);
3624        let (c0, c1) = self.split_i16x32(c);
3625        self.combine_i16x16(
3626            self.select_i16x16(a0, b0, c0),
3627            self.select_i16x16(a1, b1, c1),
3628        )
3629    }
3630    #[inline(always)]
3631    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3632        let (a0, a1) = self.split_i16x32(a);
3633        let (b0, b1) = self.split_i16x32(b);
3634        self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
3635    }
3636    #[inline(always)]
3637    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3638        let (a0, a1) = self.split_i16x32(a);
3639        let (b0, b1) = self.split_i16x32(b);
3640        self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
3641    }
3642    #[inline(always)]
3643    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
3644        let mut b0 = [0; 16usize];
3645        let mut b1 = [0; 16usize];
3646        b0.copy_from_slice(&a.val[0..16usize]);
3647        b1.copy_from_slice(&a.val[16usize..32usize]);
3648        (b0.simd_into(self), b1.simd_into(self))
3649    }
3650    #[inline(always)]
3651    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
3652        let (a0, a1) = self.split_i16x32(a);
3653        self.combine_u8x32(
3654            self.reinterpret_u8_i16x16(a0),
3655            self.reinterpret_u8_i16x16(a1),
3656        )
3657    }
3658    #[inline(always)]
3659    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
3660        let (a0, a1) = self.split_i16x32(a);
3661        self.combine_u32x8(
3662            self.reinterpret_u32_i16x16(a0),
3663            self.reinterpret_u32_i16x16(a1),
3664        )
3665    }
3666    #[inline(always)]
3667    fn splat_u16x32(self, a: u16) -> u16x32<Self> {
3668        let half = self.splat_u16x16(a);
3669        self.combine_u16x16(half, half)
3670    }
3671    #[inline(always)]
3672    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
3673        let (a0, a1) = self.split_u16x32(a);
3674        self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
3675    }
3676    #[inline(always)]
3677    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3678        let (a0, a1) = self.split_u16x32(a);
3679        let (b0, b1) = self.split_u16x32(b);
3680        self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
3681    }
3682    #[inline(always)]
3683    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3684        let (a0, a1) = self.split_u16x32(a);
3685        let (b0, b1) = self.split_u16x32(b);
3686        self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
3687    }
3688    #[inline(always)]
3689    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3690        let (a0, a1) = self.split_u16x32(a);
3691        let (b0, b1) = self.split_u16x32(b);
3692        self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
3693    }
3694    #[inline(always)]
3695    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3696        let (a0, a1) = self.split_u16x32(a);
3697        let (b0, b1) = self.split_u16x32(b);
3698        self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
3699    }
3700    #[inline(always)]
3701    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3702        let (a0, a1) = self.split_u16x32(a);
3703        let (b0, b1) = self.split_u16x32(b);
3704        self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
3705    }
3706    #[inline(always)]
3707    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3708        let (a0, a1) = self.split_u16x32(a);
3709        let (b0, b1) = self.split_u16x32(b);
3710        self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
3711    }
3712    #[inline(always)]
3713    fn shr_u16x32(self, a: u16x32<Self>, b: u32) -> u16x32<Self> {
3714        let (a0, a1) = self.split_u16x32(a);
3715        self.combine_u16x16(self.shr_u16x16(a0, b), self.shr_u16x16(a1, b))
3716    }
3717    #[inline(always)]
3718    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3719        let (a0, a1) = self.split_u16x32(a);
3720        let (b0, b1) = self.split_u16x32(b);
3721        self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
3722    }
3723    #[inline(always)]
3724    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3725        let (a0, a1) = self.split_u16x32(a);
3726        let (b0, b1) = self.split_u16x32(b);
3727        self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
3728    }
3729    #[inline(always)]
3730    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3731        let (a0, a1) = self.split_u16x32(a);
3732        let (b0, b1) = self.split_u16x32(b);
3733        self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
3734    }
3735    #[inline(always)]
3736    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3737        let (a0, a1) = self.split_u16x32(a);
3738        let (b0, b1) = self.split_u16x32(b);
3739        self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
3740    }
3741    #[inline(always)]
3742    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3743        let (a0, a1) = self.split_u16x32(a);
3744        let (b0, b1) = self.split_u16x32(b);
3745        self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
3746    }
    #[inline(always)]
    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
        let (a0, _) = self.split_u16x32(a);
        let (b0, _) = self.split_u16x32(b);
        self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
    }
    #[inline(always)]
    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
        let (_, a1) = self.split_u16x32(a);
        let (_, b1) = self.split_u16x32(b);
        self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
    }
    #[inline(always)]
    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
        let (a0, a1) = self.split_u16x32(a);
        let (b0, b1) = self.split_u16x32(b);
        self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
    }
    #[inline(always)]
    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
        let (a0, a1) = self.split_u16x32(a);
        let (b0, b1) = self.split_u16x32(b);
        self.combine_u16x16(
            self.unzip_high_u16x16(a0, a1),
            self.unzip_high_u16x16(b0, b1),
        )
    }
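    // select_u16x32 chooses per lane between `b` and `c` under the mask `a`,
    // splitting all three operands and selecting on each half independently.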
    #[inline(always)]
    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
        let (a0, a1) = self.split_mask16x32(a);
        let (b0, b1) = self.split_u16x32(b);
        let (c0, c1) = self.split_u16x32(c);
        self.combine_u16x16(
            self.select_u16x16(a0, b0, c0),
            self.select_u16x16(a1, b1, c1),
        )
    }
    #[inline(always)]
    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
        let (a0, a1) = self.split_u16x32(a);
        let (b0, b1) = self.split_u16x32(b);
        self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
    }
    #[inline(always)]
    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
        let (a0, a1) = self.split_u16x32(a);
        let (b0, b1) = self.split_u16x32(b);
        self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
    }
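    // split_u16x32 copies the backing array into its low and high 16-lane halves;
    // SSE4.2 has no 512-bit registers, so the wider vectors above are always
    // processed as pairs of narrower ones.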
    #[inline(always)]
    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
        let mut b0 = [0; 16usize];
        let mut b1 = [0; 16usize];
        b0.copy_from_slice(&a.val[0..16usize]);
        b1.copy_from_slice(&a.val[16usize..32usize]);
        (b0.simd_into(self), b1.simd_into(self))
    }
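    // The interleaved 128-bit load/store for u16x32 is delegated to the scalar
    // `Fallback` backend rather than implemented with SSE shuffles.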
    #[inline(always)]
    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
        crate::Fallback::new()
            .load_interleaved_128_u16x32(src)
            .val
            .simd_into(self)
    }
    #[inline(always)]
    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
        let fb = crate::Fallback::new();
        fb.store_interleaved_128_u16x32(a.val.simd_into(fb), dest);
    }
    #[inline(always)]
    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
        let (a0, a1) = self.split_u16x32(a);
        self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1))
    }
    #[inline(always)]
    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
        let (a0, a1) = self.split_u16x32(a);
        self.combine_u8x32(
            self.reinterpret_u8_u16x16(a0),
            self.reinterpret_u8_u16x16(a1),
        )
    }
    #[inline(always)]
    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u16x32(a);
        self.combine_u32x8(
            self.reinterpret_u32_u16x16(a0),
            self.reinterpret_u32_u16x16(a1),
        )
    }
    #[inline(always)]
    fn splat_mask16x32(self, a: i16) -> mask16x32<Self> {
        let half = self.splat_mask16x16(a);
        self.combine_mask16x16(half, half)
    }
    #[inline(always)]
    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
        let (a0, a1) = self.split_mask16x32(a);
        self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
    }
    #[inline(always)]
    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
        let (a0, a1) = self.split_mask16x32(a);
        let (b0, b1) = self.split_mask16x32(b);
        self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
    }
    #[inline(always)]
    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
        let (a0, a1) = self.split_mask16x32(a);
        let (b0, b1) = self.split_mask16x32(b);
        self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
    }
    #[inline(always)]
    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
        let (a0, a1) = self.split_mask16x32(a);
        let (b0, b1) = self.split_mask16x32(b);
        self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
    }
    #[inline(always)]
    fn select_mask16x32(
        self,
        a: mask16x32<Self>,
        b: mask16x32<Self>,
        c: mask16x32<Self>,
    ) -> mask16x32<Self> {
        let (a0, a1) = self.split_mask16x32(a);
        let (b0, b1) = self.split_mask16x32(b);
        let (c0, c1) = self.split_mask16x32(c);
        self.combine_mask16x16(
            self.select_mask16x16(a0, b0, c0),
            self.select_mask16x16(a1, b1, c1),
        )
    }
    #[inline(always)]
    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
        let (a0, a1) = self.split_mask16x32(a);
        let (b0, b1) = self.split_mask16x32(b);
        self.combine_mask16x16(
            self.simd_eq_mask16x16(a0, b0),
            self.simd_eq_mask16x16(a1, b1),
        )
    }
    #[inline(always)]
    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
        let mut b0 = [0; 16usize];
        let mut b1 = [0; 16usize];
        b0.copy_from_slice(&a.val[0..16usize]);
        b1.copy_from_slice(&a.val[16usize..32usize]);
        (b0.simd_into(self), b1.simd_into(self))
    }
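    // i32x16 operations follow the same pattern as u16x32: split into two i32x8
    // halves, apply the half-width implementation, and recombine.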
    #[inline(always)]
    fn splat_i32x16(self, a: i32) -> i32x16<Self> {
        let half = self.splat_i32x8(a);
        self.combine_i32x8(half, half)
    }
    #[inline(always)]
    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
    }
    #[inline(always)]
    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
    }
    #[inline(always)]
    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
    }
    #[inline(always)]
    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
    }
    #[inline(always)]
    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
    }
    #[inline(always)]
    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
    }
    #[inline(always)]
    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
    }
    #[inline(always)]
    fn shr_i32x16(self, a: i32x16<Self>, b: u32) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        self.combine_i32x8(self.shr_i32x8(a0, b), self.shr_i32x8(a1, b))
    }
    #[inline(always)]
    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
    }
    #[inline(always)]
    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
    }
    #[inline(always)]
    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
    }
    #[inline(always)]
    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
    }
    #[inline(always)]
    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
    }
    #[inline(always)]
    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, _) = self.split_i32x16(a);
        let (b0, _) = self.split_i32x16(b);
        self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
    }
    #[inline(always)]
    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (_, a1) = self.split_i32x16(a);
        let (_, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
    }
    #[inline(always)]
    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
    }
    #[inline(always)]
    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
    }
    #[inline(always)]
    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        let (c0, c1) = self.split_i32x16(c);
        self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
    }
    #[inline(always)]
    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
    }
    #[inline(always)]
    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
    }
    #[inline(always)]
    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
        let mut b0 = [0; 8usize];
        let mut b1 = [0; 8usize];
        b0.copy_from_slice(&a.val[0..8usize]);
        b1.copy_from_slice(&a.val[8usize..16usize]);
        (b0.simd_into(self), b1.simd_into(self))
    }
    #[inline(always)]
    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
        let (a0, a1) = self.split_i32x16(a);
        self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
    }
    #[inline(always)]
    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        self.combine_u32x8(
            self.reinterpret_u32_i32x8(a0),
            self.reinterpret_u32_i32x8(a1),
        )
    }
    #[inline(always)]
    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
    }
    #[inline(always)]
    fn splat_u32x16(self, a: u32) -> u32x16<Self> {
        let half = self.splat_u32x8(a);
        self.combine_u32x8(half, half)
    }
    #[inline(always)]
    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
    }
    #[inline(always)]
    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
    }
    #[inline(always)]
    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
    }
    #[inline(always)]
    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
    }
    #[inline(always)]
    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
    }
    #[inline(always)]
    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
    }
    #[inline(always)]
    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
    }
    #[inline(always)]
    fn shr_u32x16(self, a: u32x16<Self>, b: u32) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        self.combine_u32x8(self.shr_u32x8(a0, b), self.shr_u32x8(a1, b))
    }
    #[inline(always)]
    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
    }
    #[inline(always)]
    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
    }
    #[inline(always)]
    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
    }
    #[inline(always)]
    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
    }
    #[inline(always)]
    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
    }
    #[inline(always)]
    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, _) = self.split_u32x16(a);
        let (b0, _) = self.split_u32x16(b);
        self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
    }
    #[inline(always)]
    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (_, a1) = self.split_u32x16(a);
        let (_, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
    }
    #[inline(always)]
    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
    }
    #[inline(always)]
    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
    }
    #[inline(always)]
    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        let (c0, c1) = self.split_u32x16(c);
        self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
    }
    #[inline(always)]
    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
    }
    #[inline(always)]
    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
    }
    #[inline(always)]
    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
        let mut b0 = [0; 8usize];
        let mut b1 = [0; 8usize];
        b0.copy_from_slice(&a.val[0..8usize]);
        b1.copy_from_slice(&a.val[8usize..16usize]);
        (b0.simd_into(self), b1.simd_into(self))
    }
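    // Loads four 128-bit rows of u32 and transposes them with the usual 4x4
    // unpacklo/unpackhi network, so output row `i` holds lane `i` of each source row.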
    #[inline(always)]
    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
        unsafe {
            let v0 = _mm_loadu_si128(src.as_ptr().add(0) as *const __m128i);
            let v1 = _mm_loadu_si128(src.as_ptr().add(4) as *const __m128i);
            let v2 = _mm_loadu_si128(src.as_ptr().add(8) as *const __m128i);
            let v3 = _mm_loadu_si128(src.as_ptr().add(12) as *const __m128i);
            let tmp0 = _mm_unpacklo_epi32(v0, v1);
            let tmp1 = _mm_unpackhi_epi32(v0, v1);
            let tmp2 = _mm_unpacklo_epi32(v2, v3);
            let tmp3 = _mm_unpackhi_epi32(v2, v3);
            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
            self.combine_u32x8(
                self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)),
                self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)),
            )
        }
    }
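    // The matching interleaved store is delegated to the scalar `Fallback` backend
    // rather than using an inverse transpose.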
    #[inline(always)]
    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
        let fb = crate::Fallback::new();
        fb.store_interleaved_128_u32x16(a.val.simd_into(fb), dest);
    }
    #[inline(always)]
    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
        let (a0, a1) = self.split_u32x16(a);
        self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
    }
    #[inline(always)]
    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
    }
    #[inline(always)]
    fn splat_mask32x16(self, a: i32) -> mask32x16<Self> {
        let half = self.splat_mask32x8(a);
        self.combine_mask32x8(half, half)
    }
    #[inline(always)]
    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
    }
    #[inline(always)]
    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
    }
    #[inline(always)]
    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
    }
    #[inline(always)]
    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
    }
    #[inline(always)]
    fn select_mask32x16(
        self,
        a: mask32x16<Self>,
        b: mask32x16<Self>,
        c: mask32x16<Self>,
    ) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        let (c0, c1) = self.split_mask32x16(c);
        self.combine_mask32x8(
            self.select_mask32x8(a0, b0, c0),
            self.select_mask32x8(a1, b1, c1),
        )
    }
    #[inline(always)]
    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
    }
    #[inline(always)]
    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
        let mut b0 = [0; 8usize];
        let mut b1 = [0; 8usize];
        b0.copy_from_slice(&a.val[0..8usize]);
        b1.copy_from_slice(&a.val[8usize..16usize]);
        (b0.simd_into(self), b1.simd_into(self))
    }
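    // f64x8 operations are likewise emulated by splitting into two f64x4 halves
    // and recombining the results.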
    #[inline(always)]
    fn splat_f64x8(self, a: f64) -> f64x8<Self> {
        let half = self.splat_f64x4(a);
        self.combine_f64x4(half, half)
    }
    #[inline(always)]
    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
    }
    #[inline(always)]
    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
    }
    #[inline(always)]
    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
    }
    #[inline(always)]
    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
    }
    #[inline(always)]
    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
    }
    #[inline(always)]
    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
    }
    #[inline(always)]
    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
    }
    #[inline(always)]
    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
    }
    #[inline(always)]
    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, _) = self.split_f64x8(a);
        let (b0, _) = self.split_f64x8(b);
        self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
    }
    #[inline(always)]
    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (_, a1) = self.split_f64x8(a);
        let (_, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
    }
    #[inline(always)]
    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
    }
    #[inline(always)]
    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
    }
    #[inline(always)]
    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
    }
    #[inline(always)]
    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(
            self.max_precise_f64x4(a0, b0),
            self.max_precise_f64x4(a1, b1),
        )
    }
    #[inline(always)]
    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
    }
    #[inline(always)]
    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(
            self.min_precise_f64x4(a0, b0),
            self.min_precise_f64x4(a1, b1),
        )
    }
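    // The multiply-add style ops (madd/msub) and the rounding ops below delegate
    // per half, like the rest of the f64x8 arithmetic.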
    #[inline(always)]
    fn madd_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        let (c0, c1) = self.split_f64x8(c);
        self.combine_f64x4(self.madd_f64x4(a0, b0, c0), self.madd_f64x4(a1, b1, c1))
    }
    #[inline(always)]
    fn msub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        let (c0, c1) = self.split_f64x8(c);
        self.combine_f64x4(self.msub_f64x4(a0, b0, c0), self.msub_f64x4(a1, b1, c1))
    }
    #[inline(always)]
    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
    }
    #[inline(always)]
    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
    }
    #[inline(always)]
    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
    }
    #[inline(always)]
    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        let (c0, c1) = self.split_f64x8(c);
        self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
    }
    #[inline(always)]
    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
        let mut b0 = [0.0; 4usize];
        let mut b1 = [0.0; 4usize];
        b0.copy_from_slice(&a.val[0..4usize]);
        b1.copy_from_slice(&a.val[4usize..8usize]);
        (b0.simd_into(self), b1.simd_into(self))
    }
    #[inline(always)]
    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f32x8(
            self.reinterpret_f32_f64x4(a0),
            self.reinterpret_f32_f64x4(a1),
        )
    }
    #[inline(always)]
    fn splat_mask64x8(self, a: i64) -> mask64x8<Self> {
        let half = self.splat_mask64x4(a);
        self.combine_mask64x4(half, half)
    }
    #[inline(always)]
    fn not_mask64x8(self, a: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1))
    }
    #[inline(always)]
    fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn or_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn xor_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn select_mask64x8(
        self,
        a: mask64x8<Self>,
        b: mask64x8<Self>,
        c: mask64x8<Self>,
    ) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        let (c0, c1) = self.split_mask64x8(c);
        self.combine_mask64x4(
            self.select_mask64x4(a0, b0, c0),
            self.select_mask64x4(a1, b1, c1),
        )
    }
    #[inline(always)]
    fn simd_eq_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) {
        let mut b0 = [0; 4usize];
        let mut b1 = [0; 4usize];
        b0.copy_from_slice(&a.val[0..4usize]);
        b1.copy_from_slice(&a.val[4usize..8usize]);
        (b0.simd_into(self), b1.simd_into(self))
    }
}
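// Conversions between the `core::arch` 128-bit intrinsic types (__m128, __m128i,
// __m128d) and the portable vector types. Both directions are implemented as
// transmutes, relying on the intrinsic type and the backing array having the same
// size and layout.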
impl<S: Simd> SimdFrom<__m128, S> for f32x4<S> {
    #[inline(always)]
    fn simd_from(arch: __m128, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<f32x4<S>> for __m128 {
    #[inline(always)]
    fn from(value: f32x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for i8x16<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i8x16<S>> for __m128i {
    #[inline(always)]
    fn from(value: i8x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for u8x16<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u8x16<S>> for __m128i {
    #[inline(always)]
    fn from(value: u8x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for mask8x16<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask8x16<S>> for __m128i {
    #[inline(always)]
    fn from(value: mask8x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for i16x8<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i16x8<S>> for __m128i {
    #[inline(always)]
    fn from(value: i16x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for u16x8<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u16x8<S>> for __m128i {
    #[inline(always)]
    fn from(value: u16x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for mask16x8<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask16x8<S>> for __m128i {
    #[inline(always)]
    fn from(value: mask16x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for i32x4<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i32x4<S>> for __m128i {
    #[inline(always)]
    fn from(value: i32x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for u32x4<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u32x4<S>> for __m128i {
    #[inline(always)]
    fn from(value: u32x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for mask32x4<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask32x4<S>> for __m128i {
    #[inline(always)]
    fn from(value: mask32x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128d, S> for f64x2<S> {
    #[inline(always)]
    fn simd_from(arch: __m128d, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<f64x2<S>> for __m128d {
    #[inline(always)]
    fn from(value: f64x2<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for mask64x2<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask64x2<S>> for __m128i {
    #[inline(always)]
    fn from(value: mask64x2<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}