fearless_simd/generated/sse4_2.rs

// Copyright 2025 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

// This file is autogenerated by fearless_simd_gen

#![expect(
    unused_variables,
    clippy::todo,
    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
)]
use crate::{Level, Simd, SimdFrom, SimdInto, seal::Seal};
use crate::{
    f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
    u32x4, u32x8, u32x16,
};
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use core::ops::*;
#[doc = r#" The SIMD token for the "SSE 4.2" level."#]
#[derive(Clone, Copy, Debug)]
pub struct Sse4_2 {
    pub sse4_2: crate::core_arch::x86::Sse4_2,
}
impl Sse4_2 {
    #[doc = r" Create a SIMD token."]
    #[doc = r""]
    #[doc = r" # Safety"]
    #[doc = r""]
    #[doc = r" The SSE4.2 CPU feature must be available."]
    #[inline]
    pub const unsafe fn new_unchecked() -> Self {
        Sse4_2 {
            sse4_2: unsafe { crate::core_arch::x86::Sse4_2::new_unchecked() },
        }
    }
}
impl Seal for Sse4_2 {}
impl Simd for Sse4_2 {
    type f32s = f32x4<Self>;
    type u8s = u8x16<Self>;
    type i8s = i8x16<Self>;
    type u16s = u16x8<Self>;
    type i16s = i16x8<Self>;
    type u32s = u32x4<Self>;
    type i32s = i32x4<Self>;
    type mask8s = mask8x16<Self>;
    type mask16s = mask16x8<Self>;
    type mask32s = mask32x4<Self>;
    #[inline(always)]
    fn level(self) -> Level {
        Level::Sse4_2(self)
    }
    #[inline]
    fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R {
        #[target_feature(enable = "sse4.2")]
        #[inline]
        unsafe fn vectorize_sse4_2<F: FnOnce() -> R, R>(f: F) -> R {
            f()
        }
        unsafe { vectorize_sse4_2(f) }
    }
    #[inline(always)]
    fn splat_f32x4(self, val: f32) -> f32x4<Self> {
        unsafe { _mm_set1_ps(val).simd_into(self) }
    }
    #[inline(always)]
    fn abs_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) }
    }
    #[inline(always)]
    fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_sqrt_ps(a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn mul_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn div_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) }
    }
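    // Sign-bit blend: take the sign bit from `b` and the magnitude bits from `a`
    // by masking with -0.0 (only the sign bit set).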
    #[inline(always)]
    fn copysign_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
        unsafe {
            let mask = _mm_set1_ps(-0.0);
            _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self)
        }
    }
    #[inline(always)]
    fn simd_eq_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
        unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) }
    }
    #[inline(always)]
    fn simd_lt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
        unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) }
    }
    #[inline(always)]
    fn simd_le_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
        unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) }
    }
    #[inline(always)]
    fn simd_ge_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
        unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) }
    }
    #[inline(always)]
    fn simd_gt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
        unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) }
    }
    #[inline(always)]
    fn zip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn zip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn unzip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn unzip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn max_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn max_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn min_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn min_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
    }
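    // SSE4.2 has no fused multiply-add instructions, so madd/msub lower to a
    // separate multiply and add/subtract.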
    #[inline(always)]
    fn madd_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
        a * b + c
    }
    #[inline(always)]
    fn msub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
        a * b - c
    }
    #[inline(always)]
    fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_floor_ps(a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn fract_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
        a - a.trunc()
    }
    #[inline(always)]
    fn trunc_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_round_ps(a.into(), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC).simd_into(self) }
    }
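    // Bitwise blend: lanes where the mask is all ones take `b`, all-zero lanes take `c`.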
    #[inline(always)]
    fn select_f32x4(self, a: mask32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
        unsafe {
            let mask = _mm_castsi128_ps(a.into());
            _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, c.into())).simd_into(self)
        }
    }
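    // The 256-bit and wider vector types have no native register at this level;
    // they are backed by plain arrays, so combining two halves is a scalar copy.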
    #[inline(always)]
    fn combine_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x8<Self> {
        let mut result = [0.0; 8usize];
        result[0..4usize].copy_from_slice(&a.val);
        result[4usize..8usize].copy_from_slice(&b.val);
        result.simd_into(self)
    }
    #[inline(always)]
    fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> {
        f64x2 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn reinterpret_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
        i32x4 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
        u8x16 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn reinterpret_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
        u32x4 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
        unsafe {
            _mm_cvtps_epi32(_mm_max_ps(_mm_floor_ps(a.into()), _mm_set1_ps(0.0))).simd_into(self)
        }
    }
    #[inline(always)]
    fn cvt_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
        unsafe { _mm_cvtps_epi32(a.trunc().into()).simd_into(self) }
    }
    #[inline(always)]
    fn splat_i8x16(self, val: i8) -> i8x16<Self> {
        unsafe { _mm_set1_epi8(val).simd_into(self) }
    }
    #[inline(always)]
    fn not_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
        a ^ !0
    }
    #[inline(always)]
    fn add_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
        unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn sub_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
        unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
        todo!()
    }
    #[inline(always)]
    fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn or_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn xor_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
    }
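    // SSE has no 8-bit shift instructions: widen each half to 16-bit lanes,
    // shift there, then pack the results back down to bytes.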
    #[inline(always)]
    fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
        unsafe {
            let val = a.into();
            let shift_count = _mm_cvtsi32_si128(shift as i32);
            let lo_16 = _mm_unpacklo_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128()));
            let hi_16 = _mm_unpackhi_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128()));
            let lo_shifted = _mm_sra_epi16(lo_16, shift_count);
            let hi_shifted = _mm_sra_epi16(hi_16, shift_count);
            _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
        }
    }
    #[inline(always)]
    fn shrv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
        core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self)
    }
    #[inline(always)]
    fn shl_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
        unsafe {
            let val = a.into();
            let shift_count = _mm_cvtsi32_si128(shift as i32);
            let lo_16 = _mm_unpacklo_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128()));
            let hi_16 = _mm_unpackhi_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128()));
            let lo_shifted = _mm_sll_epi16(lo_16, shift_count);
            let hi_shifted = _mm_sll_epi16(hi_16, shift_count);
            _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
        }
    }
    #[inline(always)]
    fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
        unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_lt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
        unsafe { _mm_cmplt_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_le_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
        unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_ge_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
        unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_gt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
        unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn zip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
        unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn zip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
        unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
    }
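    // Deinterleave bytes with a pshufb shuffle: each input's even (or odd) bytes
    // are gathered into its low 64 bits, then the two halves are joined.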
    #[inline(always)]
    fn unzip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
        unsafe {
            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14);
            let t1 = _mm_shuffle_epi8(a.into(), mask);
            let t2 = _mm_shuffle_epi8(b.into(), mask);
            _mm_unpacklo_epi64(t1, t2).simd_into(self)
        }
    }
    #[inline(always)]
    fn unzip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
        unsafe {
            let mask = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15);
            let t1 = _mm_shuffle_epi8(a.into(), mask);
            let t2 = _mm_shuffle_epi8(b.into(), mask);
            _mm_unpacklo_epi64(t1, t2).simd_into(self)
        }
    }
    #[inline(always)]
    fn select_i8x16(self, a: mask8x16<Self>, b: i8x16<Self>, c: i8x16<Self>) -> i8x16<Self> {
        unsafe {
            _mm_or_si128(
                _mm_and_si128(a.into(), b.into()),
                _mm_andnot_si128(a.into(), c.into()),
            )
            .simd_into(self)
        }
    }
    #[inline(always)]
    fn min_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
        unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn max_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
        unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn combine_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x32<Self> {
        let mut result = [0; 32usize];
        result[0..16usize].copy_from_slice(&a.val);
        result[16usize..32usize].copy_from_slice(&b.val);
        result.simd_into(self)
    }
    #[inline(always)]
    fn neg_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
        unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn reinterpret_u8_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
        u8x16 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn reinterpret_u32_i8x16(self, a: i8x16<Self>) -> u32x4<Self> {
        u32x4 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn splat_u8x16(self, val: u8) -> u8x16<Self> {
        unsafe { _mm_set1_epi8(val as _).simd_into(self) }
    }
    #[inline(always)]
    fn not_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
        a ^ !0
    }
    #[inline(always)]
    fn add_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
        unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn sub_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
        unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
        todo!()
    }
    #[inline(always)]
    fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn or_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn xor_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
        unsafe {
            let val = a.into();
            let shift_count = _mm_cvtsi32_si128(shift as i32);
            let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
            let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
            let lo_shifted = _mm_srl_epi16(lo_16, shift_count);
            let hi_shifted = _mm_srl_epi16(hi_16, shift_count);
            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
        }
    }
    #[inline(always)]
    fn shrv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
        core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self)
    }
    #[inline(always)]
    fn shl_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
        unsafe {
            let val = a.into();
            let shift_count = _mm_cvtsi32_si128(shift as i32);
            let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
            let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
            let lo_shifted = _mm_sll_epi16(lo_16, shift_count);
            let hi_shifted = _mm_sll_epi16(hi_16, shift_count);
            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
        }
    }
    #[inline(always)]
    fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
        unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
    }
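    // Unsigned ordered compares: flip the sign bit of both operands so that a
    // signed compare yields the unsigned ordering.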
    #[inline(always)]
    fn simd_lt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
        unsafe {
            let sign_bit = _mm_set1_epi8(0x80u8 as _);
            let a_signed = _mm_xor_si128(a.into(), sign_bit);
            let b_signed = _mm_xor_si128(b.into(), sign_bit);
            _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self)
        }
    }
    #[inline(always)]
    fn simd_le_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
        unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_ge_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
        unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_gt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
        unsafe {
            let sign_bit = _mm_set1_epi8(0x80u8 as _);
            let a_signed = _mm_xor_si128(a.into(), sign_bit);
            let b_signed = _mm_xor_si128(b.into(), sign_bit);
            _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self)
        }
    }
    #[inline(always)]
    fn zip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
        unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn zip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
        unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn unzip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
        unsafe {
            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14);
            let t1 = _mm_shuffle_epi8(a.into(), mask);
            let t2 = _mm_shuffle_epi8(b.into(), mask);
            _mm_unpacklo_epi64(t1, t2).simd_into(self)
        }
    }
    #[inline(always)]
    fn unzip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
        unsafe {
            let mask = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15);
            let t1 = _mm_shuffle_epi8(a.into(), mask);
            let t2 = _mm_shuffle_epi8(b.into(), mask);
            _mm_unpacklo_epi64(t1, t2).simd_into(self)
        }
    }
    #[inline(always)]
    fn select_u8x16(self, a: mask8x16<Self>, b: u8x16<Self>, c: u8x16<Self>) -> u8x16<Self> {
        unsafe {
            _mm_or_si128(
                _mm_and_si128(a.into(), b.into()),
                _mm_andnot_si128(a.into(), c.into()),
            )
            .simd_into(self)
        }
    }
    #[inline(always)]
    fn min_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
        unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn max_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
        unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn combine_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x32<Self> {
        let mut result = [0; 32usize];
        result[0..16usize].copy_from_slice(&a.val);
        result[16usize..32usize].copy_from_slice(&b.val);
        result.simd_into(self)
    }
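    // Zero-extend each 8-byte half to 16-bit lanes with pmovzxbw, then join the halves.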
    #[inline(always)]
    fn widen_u8x16(self, a: u8x16<Self>) -> u16x16<Self> {
        unsafe {
            let raw = a.into();
            let low = _mm_cvtepu8_epi16(raw).simd_into(self);
            let high = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(raw)).simd_into(self);
            self.combine_u16x8(low, high)
        }
    }
    #[inline(always)]
    fn reinterpret_u32_u8x16(self, a: u8x16<Self>) -> u32x4<Self> {
        u32x4 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn splat_mask8x16(self, val: i8) -> mask8x16<Self> {
        unsafe { _mm_set1_epi8(val).simd_into(self) }
    }
    #[inline(always)]
    fn not_mask8x16(self, a: mask8x16<Self>) -> mask8x16<Self> {
        a ^ !0
    }
    #[inline(always)]
    fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn or_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn xor_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn select_mask8x16(
        self,
        a: mask8x16<Self>,
        b: mask8x16<Self>,
        c: mask8x16<Self>,
    ) -> mask8x16<Self> {
        unsafe {
            _mm_or_si128(
                _mm_and_si128(a.into(), b.into()),
                _mm_andnot_si128(a.into(), c.into()),
            )
            .simd_into(self)
        }
    }
    #[inline(always)]
    fn simd_eq_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
        unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn combine_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x32<Self> {
        let mut result = [0; 32usize];
        result[0..16usize].copy_from_slice(&a.val);
        result[16usize..32usize].copy_from_slice(&b.val);
        result.simd_into(self)
    }
    #[inline(always)]
    fn splat_i16x8(self, val: i16) -> i16x8<Self> {
        unsafe { _mm_set1_epi16(val).simd_into(self) }
    }
    #[inline(always)]
    fn not_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
        a ^ !0
    }
    #[inline(always)]
    fn add_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
        unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn sub_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
        unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn mul_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
        unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn and_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn or_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn xor_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
        unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
    }
    #[inline(always)]
    fn shrv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
        core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self)
    }
    #[inline(always)]
    fn shl_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
        unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
    }
    #[inline(always)]
    fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
        unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_lt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
        unsafe { _mm_cmplt_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_le_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
        unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_ge_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
        unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_gt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
        unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn zip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
        unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn zip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
        unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn unzip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
        unsafe {
            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13);
            let t1 = _mm_shuffle_epi8(a.into(), mask);
            let t2 = _mm_shuffle_epi8(b.into(), mask);
            _mm_unpacklo_epi64(t1, t2).simd_into(self)
        }
    }
    #[inline(always)]
    fn unzip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
        unsafe {
            let mask = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
            let t1 = _mm_shuffle_epi8(a.into(), mask);
            let t2 = _mm_shuffle_epi8(b.into(), mask);
            _mm_unpacklo_epi64(t1, t2).simd_into(self)
        }
    }
    #[inline(always)]
    fn select_i16x8(self, a: mask16x8<Self>, b: i16x8<Self>, c: i16x8<Self>) -> i16x8<Self> {
        unsafe {
            _mm_or_si128(
                _mm_and_si128(a.into(), b.into()),
                _mm_andnot_si128(a.into(), c.into()),
            )
            .simd_into(self)
        }
    }
    #[inline(always)]
    fn min_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
        unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn max_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
        unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn combine_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x16<Self> {
        let mut result = [0; 16usize];
        result[0..8usize].copy_from_slice(&a.val);
        result[8usize..16usize].copy_from_slice(&b.val);
        result.simd_into(self)
    }
    #[inline(always)]
    fn neg_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
        unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn reinterpret_u8_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
        u8x16 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn reinterpret_u32_i16x8(self, a: i16x8<Self>) -> u32x4<Self> {
        u32x4 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn splat_u16x8(self, val: u16) -> u16x8<Self> {
        unsafe { _mm_set1_epi16(val as _).simd_into(self) }
    }
    #[inline(always)]
    fn not_u16x8(self, a: u16x8<Self>) -> u16x8<Self> {
        a ^ !0
    }
    #[inline(always)]
    fn add_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
        unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn sub_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
        unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn mul_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
        unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn and_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn or_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn xor_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
        unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
    }
    #[inline(always)]
    fn shrv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
        core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self)
    }
    #[inline(always)]
    fn shl_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
        unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
    }
    #[inline(always)]
    fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
        unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_lt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
        unsafe {
            let sign_bit = _mm_set1_epi16(0x8000u16 as _);
            let a_signed = _mm_xor_si128(a.into(), sign_bit);
            let b_signed = _mm_xor_si128(b.into(), sign_bit);
            _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self)
        }
    }
    #[inline(always)]
    fn simd_le_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
        unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_ge_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
        unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_gt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
        unsafe {
            let sign_bit = _mm_set1_epi16(0x8000u16 as _);
            let a_signed = _mm_xor_si128(a.into(), sign_bit);
            let b_signed = _mm_xor_si128(b.into(), sign_bit);
            _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self)
        }
    }
    #[inline(always)]
    fn zip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
        unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn zip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
        unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn unzip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
        unsafe {
            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13);
            let t1 = _mm_shuffle_epi8(a.into(), mask);
            let t2 = _mm_shuffle_epi8(b.into(), mask);
            _mm_unpacklo_epi64(t1, t2).simd_into(self)
        }
    }
    #[inline(always)]
    fn unzip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
        unsafe {
            let mask = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
            let t1 = _mm_shuffle_epi8(a.into(), mask);
            let t2 = _mm_shuffle_epi8(b.into(), mask);
            _mm_unpacklo_epi64(t1, t2).simd_into(self)
        }
    }
    #[inline(always)]
    fn select_u16x8(self, a: mask16x8<Self>, b: u16x8<Self>, c: u16x8<Self>) -> u16x8<Self> {
        unsafe {
            _mm_or_si128(
                _mm_and_si128(a.into(), b.into()),
                _mm_andnot_si128(a.into(), c.into()),
            )
            .simd_into(self)
        }
    }
    #[inline(always)]
    fn min_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
        unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn max_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
        unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn combine_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x16<Self> {
        let mut result = [0; 16usize];
        result[0..8usize].copy_from_slice(&a.val);
        result[8usize..16usize].copy_from_slice(&b.val);
        result.simd_into(self)
    }
    #[inline(always)]
    fn reinterpret_u8_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
        u8x16 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn reinterpret_u32_u16x8(self, a: u16x8<Self>) -> u32x4<Self> {
        u32x4 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn splat_mask16x8(self, val: i16) -> mask16x8<Self> {
        unsafe { _mm_set1_epi16(val).simd_into(self) }
    }
    #[inline(always)]
    fn not_mask16x8(self, a: mask16x8<Self>) -> mask16x8<Self> {
        a ^ !0
    }
    #[inline(always)]
    fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn or_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn xor_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn select_mask16x8(
        self,
        a: mask16x8<Self>,
        b: mask16x8<Self>,
        c: mask16x8<Self>,
    ) -> mask16x8<Self> {
        unsafe {
            _mm_or_si128(
                _mm_and_si128(a.into(), b.into()),
                _mm_andnot_si128(a.into(), c.into()),
            )
            .simd_into(self)
        }
    }
    #[inline(always)]
    fn simd_eq_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
        unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn combine_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x16<Self> {
        let mut result = [0; 16usize];
        result[0..8usize].copy_from_slice(&a.val);
        result[8usize..16usize].copy_from_slice(&b.val);
        result.simd_into(self)
    }
    #[inline(always)]
    fn splat_i32x4(self, val: i32) -> i32x4<Self> {
        unsafe { _mm_set1_epi32(val).simd_into(self) }
    }
    #[inline(always)]
    fn not_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
        a ^ !0
    }
    #[inline(always)]
    fn add_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
        unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn sub_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
        unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn mul_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
        unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn and_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn or_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn xor_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn shr_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
        unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
    }
    #[inline(always)]
    fn shrv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
        core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self)
    }
    #[inline(always)]
    fn shl_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
        unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
    }
    #[inline(always)]
    fn simd_eq_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
        unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_lt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
        unsafe { _mm_cmplt_epi32(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_le_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
        unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_ge_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
        unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_gt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
        unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn zip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
        unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn zip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
        unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
    }
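    // Deinterleave 32-bit lanes: shuffle each input into (0, 2, 1, 3) order so
    // the even lanes land in the low half and the odd lanes in the high half,
    // then take the matching 64-bit halves of both inputs.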
    #[inline(always)]
    fn unzip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
        unsafe {
            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
            _mm_unpacklo_epi64(t1, t2).simd_into(self)
        }
    }
    #[inline(always)]
    fn unzip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
        unsafe {
            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
            _mm_unpackhi_epi64(t1, t2).simd_into(self)
        }
    }
    #[inline(always)]
    fn select_i32x4(self, a: mask32x4<Self>, b: i32x4<Self>, c: i32x4<Self>) -> i32x4<Self> {
        unsafe {
            _mm_or_si128(
                _mm_and_si128(a.into(), b.into()),
                _mm_andnot_si128(a.into(), c.into()),
            )
            .simd_into(self)
        }
    }
    #[inline(always)]
    fn min_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
        unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn max_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
        unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn combine_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x8<Self> {
        let mut result = [0; 8usize];
        result[0..4usize].copy_from_slice(&a.val);
        result[4usize..8usize].copy_from_slice(&b.val);
        result.simd_into(self)
    }
    #[inline(always)]
    fn neg_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
        unsafe { _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn reinterpret_u8_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
        u8x16 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn reinterpret_u32_i32x4(self, a: i32x4<Self>) -> u32x4<Self> {
        u32x4 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn cvt_f32_i32x4(self, a: i32x4<Self>) -> f32x4<Self> {
        unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn splat_u32x4(self, val: u32) -> u32x4<Self> {
        unsafe { _mm_set1_epi32(val as _).simd_into(self) }
    }
    #[inline(always)]
    fn not_u32x4(self, a: u32x4<Self>) -> u32x4<Self> {
        a ^ !0
    }
    #[inline(always)]
    fn add_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
        unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn sub_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
        unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn mul_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
        unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn and_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn or_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn xor_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn shr_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
        unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
    }
    #[inline(always)]
    fn shrv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
        core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self)
    }
    #[inline(always)]
    fn shl_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
        unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
    }
    #[inline(always)]
    fn simd_eq_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
        unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
    }
1097    #[inline(always)]
1098    fn simd_lt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1099        unsafe {
1100            let sign_bit = _mm_set1_epi32(0x80000000u32 as _);
1101            let a_signed = _mm_xor_si128(a.into(), sign_bit);
1102            let b_signed = _mm_xor_si128(b.into(), sign_bit);
1103            _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self)
1104        }
1105    }
1106    #[inline(always)]
1107    fn simd_le_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1108        unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1109    }
1110    #[inline(always)]
1111    fn simd_ge_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1112        unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1113    }
1114    #[inline(always)]
1115    fn simd_gt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1116        unsafe {
1117            let sign_bit = _mm_set1_epi32(0x80000000u32 as _);
1118            let a_signed = _mm_xor_si128(a.into(), sign_bit);
1119            let b_signed = _mm_xor_si128(b.into(), sign_bit);
1120            _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self)
1121        }
1122    }
1123    #[inline(always)]
1124    fn zip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1125        unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
1126    }
1127    #[inline(always)]
1128    fn zip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1129        unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
1130    }
1131    #[inline(always)]
1132    fn unzip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1133        unsafe {
1134            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1135            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1136            _mm_unpacklo_epi64(t1, t2).simd_into(self)
1137        }
1138    }
1139    #[inline(always)]
1140    fn unzip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1141        unsafe {
1142            let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1143            let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1144            _mm_unpackhi_epi64(t1, t2).simd_into(self)
1145        }
1146    }
1147    #[inline(always)]
1148    fn select_u32x4(self, a: mask32x4<Self>, b: u32x4<Self>, c: u32x4<Self>) -> u32x4<Self> {
1149        unsafe {
1150            _mm_or_si128(
1151                _mm_and_si128(a.into(), b.into()),
1152                _mm_andnot_si128(a.into(), c.into()),
1153            )
1154            .simd_into(self)
1155        }
1156    }
1157    #[inline(always)]
1158    fn min_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1159        unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) }
1160    }
1161    #[inline(always)]
1162    fn max_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1163        unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) }
1164    }
1165    #[inline(always)]
1166    fn combine_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x8<Self> {
1167        let mut result = [0; 8usize];
1168        result[0..4usize].copy_from_slice(&a.val);
1169        result[4usize..8usize].copy_from_slice(&b.val);
1170        result.simd_into(self)
1171    }
1172    #[inline(always)]
1173    fn reinterpret_u8_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
1174        u8x16 {
1175            val: bytemuck::cast(a.val),
1176            simd: a.simd,
1177        }
1178    }
1179    #[inline(always)]
1180    fn cvt_f32_u32x4(self, a: u32x4<Self>) -> f32x4<Self> {
1181        unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) }
1182    }
1183    #[inline(always)]
1184    fn splat_mask32x4(self, val: i32) -> mask32x4<Self> {
1185        unsafe { _mm_set1_epi32(val).simd_into(self) }
1186    }
1187    #[inline(always)]
1188    fn not_mask32x4(self, a: mask32x4<Self>) -> mask32x4<Self> {
1189        a ^ !0
1190    }
1191    #[inline(always)]
1192    fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1193        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1194    }
1195    #[inline(always)]
1196    fn or_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1197        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1198    }
1199    #[inline(always)]
1200    fn xor_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1201        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1202    }
1203    #[inline(always)]
1204    fn select_mask32x4(
1205        self,
1206        a: mask32x4<Self>,
1207        b: mask32x4<Self>,
1208        c: mask32x4<Self>,
1209    ) -> mask32x4<Self> {
1210        unsafe {
1211            _mm_or_si128(
1212                _mm_and_si128(a.into(), b.into()),
1213                _mm_andnot_si128(a.into(), c.into()),
1214            )
1215            .simd_into(self)
1216        }
1217    }
1218    #[inline(always)]
1219    fn simd_eq_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1220        unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
1221    }
1222    #[inline(always)]
1223    fn combine_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x8<Self> {
1224        let mut result = [0; 8usize];
1225        result[0..4usize].copy_from_slice(&a.val);
1226        result[4usize..8usize].copy_from_slice(&b.val);
1227        result.simd_into(self)
1228    }
1229    #[inline(always)]
1230    fn splat_f64x2(self, val: f64) -> f64x2<Self> {
1231        unsafe { _mm_set1_pd(val).simd_into(self) }
1232    }
1233    #[inline(always)]
1234    fn abs_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1235        unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) }
1236    }
1237    #[inline(always)]
1238    fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1239        unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) }
1240    }
1241    #[inline(always)]
1242    fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1243        unsafe { _mm_sqrt_pd(a.into()).simd_into(self) }
1244    }
1245    #[inline(always)]
1246    fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1247        unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) }
1248    }
1249    #[inline(always)]
1250    fn sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1251        unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) }
1252    }
1253    #[inline(always)]
1254    fn mul_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1255        unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) }
1256    }
1257    #[inline(always)]
1258    fn div_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1259        unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) }
1260    }
1261    #[inline(always)]
1262    fn copysign_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1263        unsafe {
1264            let mask = _mm_set1_pd(-0.0);
1265            _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self)
1266        }
1267    }
1268    #[inline(always)]
1269    fn simd_eq_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1270        unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) }
1271    }
1272    #[inline(always)]
1273    fn simd_lt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1274        unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) }
1275    }
1276    #[inline(always)]
1277    fn simd_le_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1278        unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) }
1279    }
1280    #[inline(always)]
1281    fn simd_ge_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1282        unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) }
1283    }
1284    #[inline(always)]
1285    fn simd_gt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1286        unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) }
1287    }
1288    #[inline(always)]
1289    fn zip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1290        unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) }
1291    }
1292    #[inline(always)]
1293    fn zip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1294        unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) }
1295    }
1296    #[inline(always)]
1297    fn unzip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1298        unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) }
1299    }
1300    #[inline(always)]
1301    fn unzip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1302        unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) }
1303    }
1304    #[inline(always)]
1305    fn max_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1306        unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) }
1307    }
1308    #[inline(always)]
1309    fn max_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1310        unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) }
1311    }
1312    #[inline(always)]
1313    fn min_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1314        unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) }
1315    }
1316    #[inline(always)]
1317    fn min_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1318        unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) }
1319    }
1320    #[inline(always)]
1321    fn madd_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
1322        a * b + c
1323    }
1324    #[inline(always)]
1325    fn msub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
1326        a * b - c
1327    }
1328    #[inline(always)]
1329    fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1330        unsafe { _mm_floor_pd(a.into()).simd_into(self) }
1331    }
1332    #[inline(always)]
1333    fn fract_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1334        a - a.trunc()
1335    }
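    // Round toward zero; `_MM_FROUND_NO_EXC` suppresses the inexact exception.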
1336    #[inline(always)]
1337    fn trunc_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1338        unsafe { _mm_round_pd(a.into(), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC).simd_into(self) }
1339    }
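    // Bitwise blend: lanes where the mask is all-ones take `b`, all-zeros
    // lanes take `c`. SSE4.1's `_mm_blendv_pd` would also work; the
    // AND/ANDNOT/OR form is the straightforward equivalent.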
1340    #[inline(always)]
1341    fn select_f64x2(self, a: mask64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
1342        unsafe {
1343            let mask = _mm_castsi128_pd(a.into());
1344            _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, c.into())).simd_into(self)
1345        }
1346    }
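    // No 256-bit registers exist at this level, so `combine_*` concatenates
    // the two halves through the wider type's backing array.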
1347    #[inline(always)]
1348    fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self> {
1349        let mut result = [0.0; 4usize];
1350        result[0..2usize].copy_from_slice(&a.val);
1351        result[2usize..4usize].copy_from_slice(&b.val);
1352        result.simd_into(self)
1353    }
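    // Same-size reinterpretation of the backing array via `bytemuck::cast`,
    // a safe bit-for-bit conversion between `Pod` types of equal size.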
1354    #[inline(always)]
1355    fn reinterpret_f32_f64x2(self, a: f64x2<Self>) -> f32x4<Self> {
1356        f32x4 {
1357            val: bytemuck::cast(a.val),
1358            simd: a.simd,
1359        }
1360    }
1361    #[inline(always)]
1362    fn splat_mask64x2(self, val: i64) -> mask64x2<Self> {
1363        unsafe { _mm_set1_epi64x(val).simd_into(self) }
1364    }
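    // Mask lanes are all-ones or all-zeros, so NOT is just XOR with `!0`.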
1365    #[inline(always)]
1366    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
1367        a ^ !0
1368    }
1369    #[inline(always)]
1370    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1371        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1372    }
1373    #[inline(always)]
1374    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1375        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1376    }
1377    #[inline(always)]
1378    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1379        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1380    }
1381    #[inline(always)]
1382    fn select_mask64x2(
1383        self,
1384        a: mask64x2<Self>,
1385        b: mask64x2<Self>,
1386        c: mask64x2<Self>,
1387    ) -> mask64x2<Self> {
1388        unsafe {
1389            _mm_or_si128(
1390                _mm_and_si128(a.into(), b.into()),
1391                _mm_andnot_si128(a.into(), c.into()),
1392            )
1393            .simd_into(self)
1394        }
1395    }
1396    #[inline(always)]
1397    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1398        unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) }
1399    }
1400    #[inline(always)]
1401    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
1402        let mut result = [0; 4usize];
1403        result[0..2usize].copy_from_slice(&a.val);
1404        result[2usize..4usize].copy_from_slice(&b.val);
1405        result.simd_into(self)
1406    }
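    // Every operation on 256-bit and wider vectors below follows one pattern:
    // split the operands into 128-bit halves, apply the corresponding 128-bit
    // method to each half, and recombine. Callers are unaffected; a minimal
    // usage sketch (assuming SSE4.2 availability has already been verified,
    // which is what makes the unsafe constructor sound):
    //
    //     let simd = unsafe { Sse4_2::new_unchecked() };
    //     let x = simd.splat_f32x8(1.0);
    //     let y = simd.add_f32x8(x, x); // two `_mm_add_ps` calls internally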
1407    #[inline(always)]
1408    fn splat_f32x8(self, a: f32) -> f32x8<Self> {
1409        let half = self.splat_f32x4(a);
1410        self.combine_f32x4(half, half)
1411    }
1412    #[inline(always)]
1413    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1414        let (a0, a1) = self.split_f32x8(a);
1415        self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1))
1416    }
1417    #[inline(always)]
1418    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1419        let (a0, a1) = self.split_f32x8(a);
1420        self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1))
1421    }
1422    #[inline(always)]
1423    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1424        let (a0, a1) = self.split_f32x8(a);
1425        self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
1426    }
1427    #[inline(always)]
1428    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1429        let (a0, a1) = self.split_f32x8(a);
1430        let (b0, b1) = self.split_f32x8(b);
1431        self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1))
1432    }
1433    #[inline(always)]
1434    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1435        let (a0, a1) = self.split_f32x8(a);
1436        let (b0, b1) = self.split_f32x8(b);
1437        self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1))
1438    }
1439    #[inline(always)]
1440    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1441        let (a0, a1) = self.split_f32x8(a);
1442        let (b0, b1) = self.split_f32x8(b);
1443        self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1))
1444    }
1445    #[inline(always)]
1446    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1447        let (a0, a1) = self.split_f32x8(a);
1448        let (b0, b1) = self.split_f32x8(b);
1449        self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1))
1450    }
1451    #[inline(always)]
1452    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1453        let (a0, a1) = self.split_f32x8(a);
1454        let (b0, b1) = self.split_f32x8(b);
1455        self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1))
1456    }
1457    #[inline(always)]
1458    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1459        let (a0, a1) = self.split_f32x8(a);
1460        let (b0, b1) = self.split_f32x8(b);
1461        self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1))
1462    }
1463    #[inline(always)]
1464    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1465        let (a0, a1) = self.split_f32x8(a);
1466        let (b0, b1) = self.split_f32x8(b);
1467        self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1))
1468    }
1469    #[inline(always)]
1470    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1471        let (a0, a1) = self.split_f32x8(a);
1472        let (b0, b1) = self.split_f32x8(b);
1473        self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1))
1474    }
1475    #[inline(always)]
1476    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1477        let (a0, a1) = self.split_f32x8(a);
1478        let (b0, b1) = self.split_f32x8(b);
1479        self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1))
1480    }
1481    #[inline(always)]
1482    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1483        let (a0, a1) = self.split_f32x8(a);
1484        let (b0, b1) = self.split_f32x8(b);
1485        self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1))
1486    }
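    // Interleaving the low halves of two 8-lane vectors fills both output
    // halves: the first four interleaved pairs come from `zip_low` of the
    // 4-lane halves and the next four from `zip_high` of those same halves.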
1487    #[inline(always)]
1488    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1489        let (a0, _) = self.split_f32x8(a);
1490        let (b0, _) = self.split_f32x8(b);
1491        self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0))
1492    }
1493    #[inline(always)]
1494    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1495        let (_, a1) = self.split_f32x8(a);
1496        let (_, b1) = self.split_f32x8(b);
1497        self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1))
1498    }
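    // unzip_low gathers the even-indexed lanes of the concatenation `a ++ b`:
    // the even lanes of `a` form the low half of the result and the even
    // lanes of `b` the high half.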
1499    #[inline(always)]
1500    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1501        let (a0, a1) = self.split_f32x8(a);
1502        let (b0, b1) = self.split_f32x8(b);
1503        self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1))
1504    }
1505    #[inline(always)]
1506    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1507        let (a0, a1) = self.split_f32x8(a);
1508        let (b0, b1) = self.split_f32x8(b);
1509        self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1))
1510    }
1511    #[inline(always)]
1512    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1513        let (a0, a1) = self.split_f32x8(a);
1514        let (b0, b1) = self.split_f32x8(b);
1515        self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1))
1516    }
1517    #[inline(always)]
1518    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1519        let (a0, a1) = self.split_f32x8(a);
1520        let (b0, b1) = self.split_f32x8(b);
1521        self.combine_f32x4(
1522            self.max_precise_f32x4(a0, b0),
1523            self.max_precise_f32x4(a1, b1),
1524        )
1525    }
1526    #[inline(always)]
1527    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1528        let (a0, a1) = self.split_f32x8(a);
1529        let (b0, b1) = self.split_f32x8(b);
1530        self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1))
1531    }
1532    #[inline(always)]
1533    fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1534        let (a0, a1) = self.split_f32x8(a);
1535        let (b0, b1) = self.split_f32x8(b);
1536        self.combine_f32x4(
1537            self.min_precise_f32x4(a0, b0),
1538            self.min_precise_f32x4(a1, b1),
1539        )
1540    }
1541    #[inline(always)]
1542    fn madd_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
1543        let (a0, a1) = self.split_f32x8(a);
1544        let (b0, b1) = self.split_f32x8(b);
1545        let (c0, c1) = self.split_f32x8(c);
1546        self.combine_f32x4(self.madd_f32x4(a0, b0, c0), self.madd_f32x4(a1, b1, c1))
1547    }
1548    #[inline(always)]
1549    fn msub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
1550        let (a0, a1) = self.split_f32x8(a);
1551        let (b0, b1) = self.split_f32x8(b);
1552        let (c0, c1) = self.split_f32x8(c);
1553        self.combine_f32x4(self.msub_f32x4(a0, b0, c0), self.msub_f32x4(a1, b1, c1))
1554    }
1555    #[inline(always)]
1556    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1557        let (a0, a1) = self.split_f32x8(a);
1558        self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
1559    }
1560    #[inline(always)]
1561    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1562        let (a0, a1) = self.split_f32x8(a);
1563        self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1))
1564    }
1565    #[inline(always)]
1566    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1567        let (a0, a1) = self.split_f32x8(a);
1568        self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1))
1569    }
1570    #[inline(always)]
1571    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
1572        let (a0, a1) = self.split_mask32x8(a);
1573        let (b0, b1) = self.split_f32x8(b);
1574        let (c0, c1) = self.split_f32x8(c);
1575        self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1))
1576    }
1577    #[inline(always)]
1578    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
1579        let mut result = [0.0; 16usize];
1580        result[0..8usize].copy_from_slice(&a.val);
1581        result[8usize..16usize].copy_from_slice(&b.val);
1582        result.simd_into(self)
1583    }
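    // Inverse of `combine_f32x4`: copy the two halves of the backing array
    // into separate 4-lane vectors.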
1584    #[inline(always)]
1585    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
1586        let mut b0 = [0.0; 4usize];
1587        let mut b1 = [0.0; 4usize];
1588        b0.copy_from_slice(&a.val[0..4usize]);
1589        b1.copy_from_slice(&a.val[4usize..8usize]);
1590        (b0.simd_into(self), b1.simd_into(self))
1591    }
1592    #[inline(always)]
1593    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
1594        let (a0, a1) = self.split_f32x8(a);
1595        self.combine_f64x2(
1596            self.reinterpret_f64_f32x4(a0),
1597            self.reinterpret_f64_f32x4(a1),
1598        )
1599    }
1600    #[inline(always)]
1601    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
1602        let (a0, a1) = self.split_f32x8(a);
1603        self.combine_i32x4(
1604            self.reinterpret_i32_f32x4(a0),
1605            self.reinterpret_i32_f32x4(a1),
1606        )
1607    }
1608    #[inline(always)]
1609    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
1610        let (a0, a1) = self.split_f32x8(a);
1611        self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1))
1612    }
1613    #[inline(always)]
1614    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
1615        let (a0, a1) = self.split_f32x8(a);
1616        self.combine_u32x4(
1617            self.reinterpret_u32_f32x4(a0),
1618            self.reinterpret_u32_f32x4(a1),
1619        )
1620    }
1621    #[inline(always)]
1622    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
1623        let (a0, a1) = self.split_f32x8(a);
1624        self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1))
1625    }
1626    #[inline(always)]
1627    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
1628        let (a0, a1) = self.split_f32x8(a);
1629        self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1))
1630    }
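    // The integer and mask vector widths from here on use the same
    // split/recombine scheme as the f32x8 operations above.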
1631    #[inline(always)]
1632    fn splat_i8x32(self, a: i8) -> i8x32<Self> {
1633        let half = self.splat_i8x16(a);
1634        self.combine_i8x16(half, half)
1635    }
1636    #[inline(always)]
1637    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
1638        let (a0, a1) = self.split_i8x32(a);
1639        self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1))
1640    }
1641    #[inline(always)]
1642    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1643        let (a0, a1) = self.split_i8x32(a);
1644        let (b0, b1) = self.split_i8x32(b);
1645        self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1))
1646    }
1647    #[inline(always)]
1648    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1649        let (a0, a1) = self.split_i8x32(a);
1650        let (b0, b1) = self.split_i8x32(b);
1651        self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1))
1652    }
1653    #[inline(always)]
1654    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1655        let (a0, a1) = self.split_i8x32(a);
1656        let (b0, b1) = self.split_i8x32(b);
1657        self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1))
1658    }
1659    #[inline(always)]
1660    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1661        let (a0, a1) = self.split_i8x32(a);
1662        let (b0, b1) = self.split_i8x32(b);
1663        self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1))
1664    }
1665    #[inline(always)]
1666    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1667        let (a0, a1) = self.split_i8x32(a);
1668        let (b0, b1) = self.split_i8x32(b);
1669        self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1))
1670    }
1671    #[inline(always)]
1672    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1673        let (a0, a1) = self.split_i8x32(a);
1674        let (b0, b1) = self.split_i8x32(b);
1675        self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1))
1676    }
1677    #[inline(always)]
1678    fn shr_i8x32(self, a: i8x32<Self>, b: u32) -> i8x32<Self> {
1679        let (a0, a1) = self.split_i8x32(a);
1680        self.combine_i8x16(self.shr_i8x16(a0, b), self.shr_i8x16(a1, b))
1681    }
1682    #[inline(always)]
1683    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1684        let (a0, a1) = self.split_i8x32(a);
1685        let (b0, b1) = self.split_i8x32(b);
1686        self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1))
1687    }
1688    #[inline(always)]
1689    fn shl_i8x32(self, a: i8x32<Self>, b: u32) -> i8x32<Self> {
1690        let (a0, a1) = self.split_i8x32(a);
1691        self.combine_i8x16(self.shl_i8x16(a0, b), self.shl_i8x16(a1, b))
1692    }
1693    #[inline(always)]
1694    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1695        let (a0, a1) = self.split_i8x32(a);
1696        let (b0, b1) = self.split_i8x32(b);
1697        self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1))
1698    }
1699    #[inline(always)]
1700    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1701        let (a0, a1) = self.split_i8x32(a);
1702        let (b0, b1) = self.split_i8x32(b);
1703        self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1))
1704    }
1705    #[inline(always)]
1706    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1707        let (a0, a1) = self.split_i8x32(a);
1708        let (b0, b1) = self.split_i8x32(b);
1709        self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1))
1710    }
1711    #[inline(always)]
1712    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1713        let (a0, a1) = self.split_i8x32(a);
1714        let (b0, b1) = self.split_i8x32(b);
1715        self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1))
1716    }
1717    #[inline(always)]
1718    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1719        let (a0, a1) = self.split_i8x32(a);
1720        let (b0, b1) = self.split_i8x32(b);
1721        self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1))
1722    }
1723    #[inline(always)]
1724    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1725        let (a0, _) = self.split_i8x32(a);
1726        let (b0, _) = self.split_i8x32(b);
1727        self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0))
1728    }
1729    #[inline(always)]
1730    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1731        let (_, a1) = self.split_i8x32(a);
1732        let (_, b1) = self.split_i8x32(b);
1733        self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1))
1734    }
1735    #[inline(always)]
1736    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1737        let (a0, a1) = self.split_i8x32(a);
1738        let (b0, b1) = self.split_i8x32(b);
1739        self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1))
1740    }
1741    #[inline(always)]
1742    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1743        let (a0, a1) = self.split_i8x32(a);
1744        let (b0, b1) = self.split_i8x32(b);
1745        self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1))
1746    }
1747    #[inline(always)]
1748    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
1749        let (a0, a1) = self.split_mask8x32(a);
1750        let (b0, b1) = self.split_i8x32(b);
1751        let (c0, c1) = self.split_i8x32(c);
1752        self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1))
1753    }
1754    #[inline(always)]
1755    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1756        let (a0, a1) = self.split_i8x32(a);
1757        let (b0, b1) = self.split_i8x32(b);
1758        self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1))
1759    }
1760    #[inline(always)]
1761    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1762        let (a0, a1) = self.split_i8x32(a);
1763        let (b0, b1) = self.split_i8x32(b);
1764        self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1))
1765    }
1766    #[inline(always)]
1767    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
1768        let mut result = [0; 64usize];
1769        result[0..32usize].copy_from_slice(&a.val);
1770        result[32usize..64usize].copy_from_slice(&b.val);
1771        result.simd_into(self)
1772    }
1773    #[inline(always)]
1774    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
1775        let mut b0 = [0; 16usize];
1776        let mut b1 = [0; 16usize];
1777        b0.copy_from_slice(&a.val[0..16usize]);
1778        b1.copy_from_slice(&a.val[16usize..32usize]);
1779        (b0.simd_into(self), b1.simd_into(self))
1780    }
1781    #[inline(always)]
1782    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
1783        let (a0, a1) = self.split_i8x32(a);
1784        self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1))
1785    }
1786    #[inline(always)]
1787    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
1788        let (a0, a1) = self.split_i8x32(a);
1789        self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1))
1790    }
1791    #[inline(always)]
1792    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
1793        let (a0, a1) = self.split_i8x32(a);
1794        self.combine_u32x4(
1795            self.reinterpret_u32_i8x16(a0),
1796            self.reinterpret_u32_i8x16(a1),
1797        )
1798    }
1799    #[inline(always)]
1800    fn splat_u8x32(self, a: u8) -> u8x32<Self> {
1801        let half = self.splat_u8x16(a);
1802        self.combine_u8x16(half, half)
1803    }
1804    #[inline(always)]
1805    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
1806        let (a0, a1) = self.split_u8x32(a);
1807        self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1))
1808    }
1809    #[inline(always)]
1810    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1811        let (a0, a1) = self.split_u8x32(a);
1812        let (b0, b1) = self.split_u8x32(b);
1813        self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1))
1814    }
1815    #[inline(always)]
1816    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1817        let (a0, a1) = self.split_u8x32(a);
1818        let (b0, b1) = self.split_u8x32(b);
1819        self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1))
1820    }
1821    #[inline(always)]
1822    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1823        let (a0, a1) = self.split_u8x32(a);
1824        let (b0, b1) = self.split_u8x32(b);
1825        self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1))
1826    }
1827    #[inline(always)]
1828    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1829        let (a0, a1) = self.split_u8x32(a);
1830        let (b0, b1) = self.split_u8x32(b);
1831        self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1))
1832    }
1833    #[inline(always)]
1834    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1835        let (a0, a1) = self.split_u8x32(a);
1836        let (b0, b1) = self.split_u8x32(b);
1837        self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1))
1838    }
1839    #[inline(always)]
1840    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1841        let (a0, a1) = self.split_u8x32(a);
1842        let (b0, b1) = self.split_u8x32(b);
1843        self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1))
1844    }
1845    #[inline(always)]
1846    fn shr_u8x32(self, a: u8x32<Self>, b: u32) -> u8x32<Self> {
1847        let (a0, a1) = self.split_u8x32(a);
1848        self.combine_u8x16(self.shr_u8x16(a0, b), self.shr_u8x16(a1, b))
1849    }
1850    #[inline(always)]
1851    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1852        let (a0, a1) = self.split_u8x32(a);
1853        let (b0, b1) = self.split_u8x32(b);
1854        self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1))
1855    }
1856    #[inline(always)]
1857    fn shl_u8x32(self, a: u8x32<Self>, b: u32) -> u8x32<Self> {
1858        let (a0, a1) = self.split_u8x32(a);
1859        self.combine_u8x16(self.shl_u8x16(a0, b), self.shl_u8x16(a1, b))
1860    }
1861    #[inline(always)]
1862    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1863        let (a0, a1) = self.split_u8x32(a);
1864        let (b0, b1) = self.split_u8x32(b);
1865        self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1))
1866    }
1867    #[inline(always)]
1868    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1869        let (a0, a1) = self.split_u8x32(a);
1870        let (b0, b1) = self.split_u8x32(b);
1871        self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1))
1872    }
1873    #[inline(always)]
1874    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1875        let (a0, a1) = self.split_u8x32(a);
1876        let (b0, b1) = self.split_u8x32(b);
1877        self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1))
1878    }
1879    #[inline(always)]
1880    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1881        let (a0, a1) = self.split_u8x32(a);
1882        let (b0, b1) = self.split_u8x32(b);
1883        self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1))
1884    }
1885    #[inline(always)]
1886    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1887        let (a0, a1) = self.split_u8x32(a);
1888        let (b0, b1) = self.split_u8x32(b);
1889        self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1))
1890    }
1891    #[inline(always)]
1892    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1893        let (a0, _) = self.split_u8x32(a);
1894        let (b0, _) = self.split_u8x32(b);
1895        self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0))
1896    }
1897    #[inline(always)]
1898    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1899        let (_, a1) = self.split_u8x32(a);
1900        let (_, b1) = self.split_u8x32(b);
1901        self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1))
1902    }
1903    #[inline(always)]
1904    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1905        let (a0, a1) = self.split_u8x32(a);
1906        let (b0, b1) = self.split_u8x32(b);
1907        self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1))
1908    }
1909    #[inline(always)]
1910    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1911        let (a0, a1) = self.split_u8x32(a);
1912        let (b0, b1) = self.split_u8x32(b);
1913        self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1))
1914    }
1915    #[inline(always)]
1916    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
1917        let (a0, a1) = self.split_mask8x32(a);
1918        let (b0, b1) = self.split_u8x32(b);
1919        let (c0, c1) = self.split_u8x32(c);
1920        self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1))
1921    }
1922    #[inline(always)]
1923    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1924        let (a0, a1) = self.split_u8x32(a);
1925        let (b0, b1) = self.split_u8x32(b);
1926        self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1))
1927    }
1928    #[inline(always)]
1929    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1930        let (a0, a1) = self.split_u8x32(a);
1931        let (b0, b1) = self.split_u8x32(b);
1932        self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1))
1933    }
1934    #[inline(always)]
1935    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
1936        let mut result = [0; 64usize];
1937        result[0..32usize].copy_from_slice(&a.val);
1938        result[32usize..64usize].copy_from_slice(&b.val);
1939        result.simd_into(self)
1940    }
1941    #[inline(always)]
1942    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
1943        let mut b0 = [0; 16usize];
1944        let mut b1 = [0; 16usize];
1945        b0.copy_from_slice(&a.val[0..16usize]);
1946        b1.copy_from_slice(&a.val[16usize..32usize]);
1947        (b0.simd_into(self), b1.simd_into(self))
1948    }
1949    #[inline(always)]
1950    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
1951        let (a0, a1) = self.split_u8x32(a);
1952        self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1))
1953    }
1954    #[inline(always)]
1955    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
1956        let (a0, a1) = self.split_u8x32(a);
1957        self.combine_u32x4(
1958            self.reinterpret_u32_u8x16(a0),
1959            self.reinterpret_u32_u8x16(a1),
1960        )
1961    }
1962    #[inline(always)]
1963    fn splat_mask8x32(self, a: i8) -> mask8x32<Self> {
1964        let half = self.splat_mask8x16(a);
1965        self.combine_mask8x16(half, half)
1966    }
1967    #[inline(always)]
1968    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
1969        let (a0, a1) = self.split_mask8x32(a);
1970        self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1))
1971    }
1972    #[inline(always)]
1973    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
1974        let (a0, a1) = self.split_mask8x32(a);
1975        let (b0, b1) = self.split_mask8x32(b);
1976        self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1))
1977    }
1978    #[inline(always)]
1979    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
1980        let (a0, a1) = self.split_mask8x32(a);
1981        let (b0, b1) = self.split_mask8x32(b);
1982        self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1))
1983    }
1984    #[inline(always)]
1985    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
1986        let (a0, a1) = self.split_mask8x32(a);
1987        let (b0, b1) = self.split_mask8x32(b);
1988        self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1))
1989    }
1990    #[inline(always)]
1991    fn select_mask8x32(
1992        self,
1993        a: mask8x32<Self>,
1994        b: mask8x32<Self>,
1995        c: mask8x32<Self>,
1996    ) -> mask8x32<Self> {
1997        let (a0, a1) = self.split_mask8x32(a);
1998        let (b0, b1) = self.split_mask8x32(b);
1999        let (c0, c1) = self.split_mask8x32(c);
2000        self.combine_mask8x16(
2001            self.select_mask8x16(a0, b0, c0),
2002            self.select_mask8x16(a1, b1, c1),
2003        )
2004    }
2005    #[inline(always)]
2006    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
2007        let (a0, a1) = self.split_mask8x32(a);
2008        let (b0, b1) = self.split_mask8x32(b);
2009        self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1))
2010    }
2011    #[inline(always)]
2012    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
2013        let mut result = [0; 64usize];
2014        result[0..32usize].copy_from_slice(&a.val);
2015        result[32usize..64usize].copy_from_slice(&b.val);
2016        result.simd_into(self)
2017    }
2018    #[inline(always)]
2019    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
2020        let mut b0 = [0; 16usize];
2021        let mut b1 = [0; 16usize];
2022        b0.copy_from_slice(&a.val[0..16usize]);
2023        b1.copy_from_slice(&a.val[16usize..32usize]);
2024        (b0.simd_into(self), b1.simd_into(self))
2025    }
2026    #[inline(always)]
2027    fn splat_i16x16(self, a: i16) -> i16x16<Self> {
2028        let half = self.splat_i16x8(a);
2029        self.combine_i16x8(half, half)
2030    }
2031    #[inline(always)]
2032    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
2033        let (a0, a1) = self.split_i16x16(a);
2034        self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1))
2035    }
2036    #[inline(always)]
2037    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2038        let (a0, a1) = self.split_i16x16(a);
2039        let (b0, b1) = self.split_i16x16(b);
2040        self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1))
2041    }
2042    #[inline(always)]
2043    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2044        let (a0, a1) = self.split_i16x16(a);
2045        let (b0, b1) = self.split_i16x16(b);
2046        self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1))
2047    }
2048    #[inline(always)]
2049    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2050        let (a0, a1) = self.split_i16x16(a);
2051        let (b0, b1) = self.split_i16x16(b);
2052        self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1))
2053    }
2054    #[inline(always)]
2055    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2056        let (a0, a1) = self.split_i16x16(a);
2057        let (b0, b1) = self.split_i16x16(b);
2058        self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1))
2059    }
2060    #[inline(always)]
2061    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2062        let (a0, a1) = self.split_i16x16(a);
2063        let (b0, b1) = self.split_i16x16(b);
2064        self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1))
2065    }
2066    #[inline(always)]
2067    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2068        let (a0, a1) = self.split_i16x16(a);
2069        let (b0, b1) = self.split_i16x16(b);
2070        self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1))
2071    }
2072    #[inline(always)]
2073    fn shr_i16x16(self, a: i16x16<Self>, b: u32) -> i16x16<Self> {
2074        let (a0, a1) = self.split_i16x16(a);
2075        self.combine_i16x8(self.shr_i16x8(a0, b), self.shr_i16x8(a1, b))
2076    }
2077    #[inline(always)]
2078    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2079        let (a0, a1) = self.split_i16x16(a);
2080        let (b0, b1) = self.split_i16x16(b);
2081        self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1))
2082    }
2083    #[inline(always)]
2084    fn shl_i16x16(self, a: i16x16<Self>, b: u32) -> i16x16<Self> {
2085        let (a0, a1) = self.split_i16x16(a);
2086        self.combine_i16x8(self.shl_i16x8(a0, b), self.shl_i16x8(a1, b))
2087    }
2088    #[inline(always)]
2089    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
2090        let (a0, a1) = self.split_i16x16(a);
2091        let (b0, b1) = self.split_i16x16(b);
2092        self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1))
2093    }
2094    #[inline(always)]
2095    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
2096        let (a0, a1) = self.split_i16x16(a);
2097        let (b0, b1) = self.split_i16x16(b);
2098        self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1))
2099    }
2100    #[inline(always)]
2101    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
2102        let (a0, a1) = self.split_i16x16(a);
2103        let (b0, b1) = self.split_i16x16(b);
2104        self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1))
2105    }
2106    #[inline(always)]
2107    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
2108        let (a0, a1) = self.split_i16x16(a);
2109        let (b0, b1) = self.split_i16x16(b);
2110        self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1))
2111    }
2112    #[inline(always)]
2113    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
2114        let (a0, a1) = self.split_i16x16(a);
2115        let (b0, b1) = self.split_i16x16(b);
2116        self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1))
2117    }
2118    #[inline(always)]
2119    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2120        let (a0, _) = self.split_i16x16(a);
2121        let (b0, _) = self.split_i16x16(b);
2122        self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0))
2123    }
2124    #[inline(always)]
2125    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2126        let (_, a1) = self.split_i16x16(a);
2127        let (_, b1) = self.split_i16x16(b);
2128        self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1))
2129    }
2130    #[inline(always)]
2131    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2132        let (a0, a1) = self.split_i16x16(a);
2133        let (b0, b1) = self.split_i16x16(b);
2134        self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1))
2135    }
2136    #[inline(always)]
2137    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2138        let (a0, a1) = self.split_i16x16(a);
2139        let (b0, b1) = self.split_i16x16(b);
2140        self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1))
2141    }
2142    #[inline(always)]
2143    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
2144        let (a0, a1) = self.split_mask16x16(a);
2145        let (b0, b1) = self.split_i16x16(b);
2146        let (c0, c1) = self.split_i16x16(c);
2147        self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1))
2148    }
2149    #[inline(always)]
2150    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2151        let (a0, a1) = self.split_i16x16(a);
2152        let (b0, b1) = self.split_i16x16(b);
2153        self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1))
2154    }
2155    #[inline(always)]
2156    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2157        let (a0, a1) = self.split_i16x16(a);
2158        let (b0, b1) = self.split_i16x16(b);
2159        self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1))
2160    }
2161    #[inline(always)]
2162    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
2163        let mut result = [0; 32usize];
2164        result[0..16usize].copy_from_slice(&a.val);
2165        result[16usize..32usize].copy_from_slice(&b.val);
2166        result.simd_into(self)
2167    }
2168    #[inline(always)]
2169    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
2170        let mut b0 = [0; 8usize];
2171        let mut b1 = [0; 8usize];
2172        b0.copy_from_slice(&a.val[0..8usize]);
2173        b1.copy_from_slice(&a.val[8usize..16usize]);
2174        (b0.simd_into(self), b1.simd_into(self))
2175    }
2176    #[inline(always)]
2177    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
2178        let (a0, a1) = self.split_i16x16(a);
2179        self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1))
2180    }
2181    #[inline(always)]
2182    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
2183        let (a0, a1) = self.split_i16x16(a);
2184        self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1))
2185    }
2186    #[inline(always)]
2187    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
2188        let (a0, a1) = self.split_i16x16(a);
2189        self.combine_u32x4(
2190            self.reinterpret_u32_i16x8(a0),
2191            self.reinterpret_u32_i16x8(a1),
2192        )
2193    }
2194    #[inline(always)]
2195    fn splat_u16x16(self, a: u16) -> u16x16<Self> {
2196        let half = self.splat_u16x8(a);
2197        self.combine_u16x8(half, half)
2198    }
2199    #[inline(always)]
2200    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
2201        let (a0, a1) = self.split_u16x16(a);
2202        self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1))
2203    }
2204    #[inline(always)]
2205    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2206        let (a0, a1) = self.split_u16x16(a);
2207        let (b0, b1) = self.split_u16x16(b);
2208        self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1))
2209    }
2210    #[inline(always)]
2211    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2212        let (a0, a1) = self.split_u16x16(a);
2213        let (b0, b1) = self.split_u16x16(b);
2214        self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1))
2215    }
2216    #[inline(always)]
2217    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2218        let (a0, a1) = self.split_u16x16(a);
2219        let (b0, b1) = self.split_u16x16(b);
2220        self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1))
2221    }
2222    #[inline(always)]
2223    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2224        let (a0, a1) = self.split_u16x16(a);
2225        let (b0, b1) = self.split_u16x16(b);
2226        self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1))
2227    }
2228    #[inline(always)]
2229    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2230        let (a0, a1) = self.split_u16x16(a);
2231        let (b0, b1) = self.split_u16x16(b);
2232        self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1))
2233    }
2234    #[inline(always)]
2235    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2236        let (a0, a1) = self.split_u16x16(a);
2237        let (b0, b1) = self.split_u16x16(b);
2238        self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1))
2239    }
2240    #[inline(always)]
2241    fn shr_u16x16(self, a: u16x16<Self>, b: u32) -> u16x16<Self> {
2242        let (a0, a1) = self.split_u16x16(a);
2243        self.combine_u16x8(self.shr_u16x8(a0, b), self.shr_u16x8(a1, b))
2244    }
2245    #[inline(always)]
2246    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2247        let (a0, a1) = self.split_u16x16(a);
2248        let (b0, b1) = self.split_u16x16(b);
2249        self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1))
2250    }
2251    #[inline(always)]
2252    fn shl_u16x16(self, a: u16x16<Self>, b: u32) -> u16x16<Self> {
2253        let (a0, a1) = self.split_u16x16(a);
2254        self.combine_u16x8(self.shl_u16x8(a0, b), self.shl_u16x8(a1, b))
2255    }
2256    #[inline(always)]
2257    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2258        let (a0, a1) = self.split_u16x16(a);
2259        let (b0, b1) = self.split_u16x16(b);
2260        self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1))
2261    }
2262    #[inline(always)]
2263    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2264        let (a0, a1) = self.split_u16x16(a);
2265        let (b0, b1) = self.split_u16x16(b);
2266        self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1))
2267    }
2268    #[inline(always)]
2269    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2270        let (a0, a1) = self.split_u16x16(a);
2271        let (b0, b1) = self.split_u16x16(b);
2272        self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1))
2273    }
2274    #[inline(always)]
2275    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2276        let (a0, a1) = self.split_u16x16(a);
2277        let (b0, b1) = self.split_u16x16(b);
2278        self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1))
2279    }
2280    #[inline(always)]
2281    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2282        let (a0, a1) = self.split_u16x16(a);
2283        let (b0, b1) = self.split_u16x16(b);
2284        self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1))
2285    }
2286    #[inline(always)]
2287    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2288        let (a0, _) = self.split_u16x16(a);
2289        let (b0, _) = self.split_u16x16(b);
2290        self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0))
2291    }
2292    #[inline(always)]
2293    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2294        let (_, a1) = self.split_u16x16(a);
2295        let (_, b1) = self.split_u16x16(b);
2296        self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1))
2297    }
2298    #[inline(always)]
2299    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2300        let (a0, a1) = self.split_u16x16(a);
2301        let (b0, b1) = self.split_u16x16(b);
2302        self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1))
2303    }
2304    #[inline(always)]
2305    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2306        let (a0, a1) = self.split_u16x16(a);
2307        let (b0, b1) = self.split_u16x16(b);
2308        self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1))
2309    }
2310    #[inline(always)]
2311    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
2312        let (a0, a1) = self.split_mask16x16(a);
2313        let (b0, b1) = self.split_u16x16(b);
2314        let (c0, c1) = self.split_u16x16(c);
2315        self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1))
2316    }
2317    #[inline(always)]
2318    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2319        let (a0, a1) = self.split_u16x16(a);
2320        let (b0, b1) = self.split_u16x16(b);
2321        self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1))
2322    }
2323    #[inline(always)]
2324    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2325        let (a0, a1) = self.split_u16x16(a);
2326        let (b0, b1) = self.split_u16x16(b);
2327        self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1))
2328    }
2329    #[inline(always)]
2330    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
2331        let mut result = [0; 32usize];
2332        result[0..16usize].copy_from_slice(&a.val);
2333        result[16usize..32usize].copy_from_slice(&b.val);
2334        result.simd_into(self)
2335    }
2336    #[inline(always)]
2337    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
2338        let mut b0 = [0; 8usize];
2339        let mut b1 = [0; 8usize];
2340        b0.copy_from_slice(&a.val[0..8usize]);
2341        b1.copy_from_slice(&a.val[8usize..16usize]);
2342        (b0.simd_into(self), b1.simd_into(self))
2343    }
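    // Truncating narrow: masking each u16 lane to its low byte first means the
    // saturating `_mm_packus_epi16` can never clamp, so the result is a plain
    // wrapping truncation of all 16 lanes into one u8x16.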
2344    #[inline(always)]
2345    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
2346        let (a, b) = self.split_u16x16(a);
2347        unsafe {
2348            let mask = _mm_set1_epi16(0xFF);
2349            let lo_masked = _mm_and_si128(a.into(), mask);
2350            let hi_masked = _mm_and_si128(b.into(), mask);
2351            let result = _mm_packus_epi16(lo_masked, hi_masked);
2352            result.simd_into(self)
2353        }
2354    }
2355    #[inline(always)]
2356    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
2357        let (a0, a1) = self.split_u16x16(a);
2358        self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1))
2359    }
2360    #[inline(always)]
2361    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
2362        let (a0, a1) = self.split_u16x16(a);
2363        self.combine_u32x4(
2364            self.reinterpret_u32_u16x8(a0),
2365            self.reinterpret_u32_u16x8(a1),
2366        )
2367    }
2368    #[inline(always)]
2369    fn splat_mask16x16(self, a: i16) -> mask16x16<Self> {
2370        let half = self.splat_mask16x8(a);
2371        self.combine_mask16x8(half, half)
2372    }
2373    #[inline(always)]
2374    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
2375        let (a0, a1) = self.split_mask16x16(a);
2376        self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1))
2377    }
2378    #[inline(always)]
2379    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2380        let (a0, a1) = self.split_mask16x16(a);
2381        let (b0, b1) = self.split_mask16x16(b);
2382        self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1))
2383    }
2384    #[inline(always)]
2385    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2386        let (a0, a1) = self.split_mask16x16(a);
2387        let (b0, b1) = self.split_mask16x16(b);
2388        self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1))
2389    }
2390    #[inline(always)]
2391    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2392        let (a0, a1) = self.split_mask16x16(a);
2393        let (b0, b1) = self.split_mask16x16(b);
2394        self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1))
2395    }
2396    #[inline(always)]
2397    fn select_mask16x16(
2398        self,
2399        a: mask16x16<Self>,
2400        b: mask16x16<Self>,
2401        c: mask16x16<Self>,
2402    ) -> mask16x16<Self> {
2403        let (a0, a1) = self.split_mask16x16(a);
2404        let (b0, b1) = self.split_mask16x16(b);
2405        let (c0, c1) = self.split_mask16x16(c);
2406        self.combine_mask16x8(
2407            self.select_mask16x8(a0, b0, c0),
2408            self.select_mask16x8(a1, b1, c1),
2409        )
2410    }
2411    #[inline(always)]
2412    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2413        let (a0, a1) = self.split_mask16x16(a);
2414        let (b0, b1) = self.split_mask16x16(b);
2415        self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1))
2416    }
2417    #[inline(always)]
2418    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
2419        let mut result = [0; 32usize];
2420        result[0..16usize].copy_from_slice(&a.val);
2421        result[16usize..32usize].copy_from_slice(&b.val);
2422        result.simd_into(self)
2423    }
2424    #[inline(always)]
2425    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
2426        let mut b0 = [0; 8usize];
2427        let mut b1 = [0; 8usize];
2428        b0.copy_from_slice(&a.val[0..8usize]);
2429        b1.copy_from_slice(&a.val[8usize..16usize]);
2430        (b0.simd_into(self), b1.simd_into(self))
2431    }
2432    #[inline(always)]
2433    fn splat_i32x8(self, a: i32) -> i32x8<Self> {
2434        let half = self.splat_i32x4(a);
2435        self.combine_i32x4(half, half)
2436    }
2437    #[inline(always)]
2438    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
2439        let (a0, a1) = self.split_i32x8(a);
2440        self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1))
2441    }
2442    #[inline(always)]
2443    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2444        let (a0, a1) = self.split_i32x8(a);
2445        let (b0, b1) = self.split_i32x8(b);
2446        self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1))
2447    }
2448    #[inline(always)]
2449    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2450        let (a0, a1) = self.split_i32x8(a);
2451        let (b0, b1) = self.split_i32x8(b);
2452        self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1))
2453    }
2454    #[inline(always)]
2455    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2456        let (a0, a1) = self.split_i32x8(a);
2457        let (b0, b1) = self.split_i32x8(b);
2458        self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1))
2459    }
2460    #[inline(always)]
2461    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2462        let (a0, a1) = self.split_i32x8(a);
2463        let (b0, b1) = self.split_i32x8(b);
2464        self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1))
2465    }
2466    #[inline(always)]
2467    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2468        let (a0, a1) = self.split_i32x8(a);
2469        let (b0, b1) = self.split_i32x8(b);
2470        self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1))
2471    }
2472    #[inline(always)]
2473    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2474        let (a0, a1) = self.split_i32x8(a);
2475        let (b0, b1) = self.split_i32x8(b);
2476        self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1))
2477    }
2478    #[inline(always)]
2479    fn shr_i32x8(self, a: i32x8<Self>, b: u32) -> i32x8<Self> {
2480        let (a0, a1) = self.split_i32x8(a);
2481        self.combine_i32x4(self.shr_i32x4(a0, b), self.shr_i32x4(a1, b))
2482    }
2483    #[inline(always)]
2484    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2485        let (a0, a1) = self.split_i32x8(a);
2486        let (b0, b1) = self.split_i32x8(b);
2487        self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1))
2488    }
2489    #[inline(always)]
2490    fn shl_i32x8(self, a: i32x8<Self>, b: u32) -> i32x8<Self> {
2491        let (a0, a1) = self.split_i32x8(a);
2492        self.combine_i32x4(self.shl_i32x4(a0, b), self.shl_i32x4(a1, b))
2493    }
2494    #[inline(always)]
2495    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2496        let (a0, a1) = self.split_i32x8(a);
2497        let (b0, b1) = self.split_i32x8(b);
2498        self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1))
2499    }
2500    #[inline(always)]
2501    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2502        let (a0, a1) = self.split_i32x8(a);
2503        let (b0, b1) = self.split_i32x8(b);
2504        self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1))
2505    }
2506    #[inline(always)]
2507    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2508        let (a0, a1) = self.split_i32x8(a);
2509        let (b0, b1) = self.split_i32x8(b);
2510        self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1))
2511    }
2512    #[inline(always)]
2513    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2514        let (a0, a1) = self.split_i32x8(a);
2515        let (b0, b1) = self.split_i32x8(b);
2516        self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1))
2517    }
2518    #[inline(always)]
2519    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2520        let (a0, a1) = self.split_i32x8(a);
2521        let (b0, b1) = self.split_i32x8(b);
2522        self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1))
2523    }
2524    #[inline(always)]
2525    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2526        let (a0, _) = self.split_i32x8(a);
2527        let (b0, _) = self.split_i32x8(b);
2528        self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0))
2529    }
2530    #[inline(always)]
2531    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2532        let (_, a1) = self.split_i32x8(a);
2533        let (_, b1) = self.split_i32x8(b);
2534        self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1))
2535    }
2536    #[inline(always)]
2537    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2538        let (a0, a1) = self.split_i32x8(a);
2539        let (b0, b1) = self.split_i32x8(b);
2540        self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1))
2541    }
2542    #[inline(always)]
2543    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2544        let (a0, a1) = self.split_i32x8(a);
2545        let (b0, b1) = self.split_i32x8(b);
2546        self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1))
2547    }
2548    #[inline(always)]
2549    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
2550        let (a0, a1) = self.split_mask32x8(a);
2551        let (b0, b1) = self.split_i32x8(b);
2552        let (c0, c1) = self.split_i32x8(c);
2553        self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1))
2554    }
2555    #[inline(always)]
2556    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2557        let (a0, a1) = self.split_i32x8(a);
2558        let (b0, b1) = self.split_i32x8(b);
2559        self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1))
2560    }
2561    #[inline(always)]
2562    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2563        let (a0, a1) = self.split_i32x8(a);
2564        let (b0, b1) = self.split_i32x8(b);
2565        self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1))
2566    }
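    // combine/split go through the plain `val` arrays of the vector types rather than
    // through intrinsics: the lanes are copied into fixed-size buffers and converted
    // back with `simd_into`.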
2567    #[inline(always)]
2568    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
2569        let mut result = [0; 16usize];
2570        result[0..8usize].copy_from_slice(&a.val);
2571        result[8usize..16usize].copy_from_slice(&b.val);
2572        result.simd_into(self)
2573    }
2574    #[inline(always)]
2575    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
2576        let mut b0 = [0; 4usize];
2577        let mut b1 = [0; 4usize];
2578        b0.copy_from_slice(&a.val[0..4usize]);
2579        b1.copy_from_slice(&a.val[4usize..8usize]);
2580        (b0.simd_into(self), b1.simd_into(self))
2581    }
2582    #[inline(always)]
2583    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
2584        let (a0, a1) = self.split_i32x8(a);
2585        self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1))
2586    }
2587    #[inline(always)]
2588    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
2589        let (a0, a1) = self.split_i32x8(a);
2590        self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1))
2591    }
2592    #[inline(always)]
2593    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
2594        let (a0, a1) = self.split_i32x8(a);
2595        self.combine_u32x4(
2596            self.reinterpret_u32_i32x4(a0),
2597            self.reinterpret_u32_i32x4(a1),
2598        )
2599    }
2600    #[inline(always)]
2601    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
2602        let (a0, a1) = self.split_i32x8(a);
2603        self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1))
2604    }
2605    #[inline(always)]
2606    fn splat_u32x8(self, a: u32) -> u32x8<Self> {
2607        let half = self.splat_u32x4(a);
2608        self.combine_u32x4(half, half)
2609    }
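    // Illustrative usage sketch, not generated code (assumes `simd` is an `Sse4_2`
    // token obtained by the caller):
    //     let v = simd.splat_u32x8(7);
    //     let w = simd.add_u32x8(v, v);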
2610    #[inline(always)]
2611    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
2612        let (a0, a1) = self.split_u32x8(a);
2613        self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1))
2614    }
2615    #[inline(always)]
2616    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2617        let (a0, a1) = self.split_u32x8(a);
2618        let (b0, b1) = self.split_u32x8(b);
2619        self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1))
2620    }
2621    #[inline(always)]
2622    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2623        let (a0, a1) = self.split_u32x8(a);
2624        let (b0, b1) = self.split_u32x8(b);
2625        self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1))
2626    }
2627    #[inline(always)]
2628    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2629        let (a0, a1) = self.split_u32x8(a);
2630        let (b0, b1) = self.split_u32x8(b);
2631        self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1))
2632    }
2633    #[inline(always)]
2634    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2635        let (a0, a1) = self.split_u32x8(a);
2636        let (b0, b1) = self.split_u32x8(b);
2637        self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1))
2638    }
2639    #[inline(always)]
2640    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2641        let (a0, a1) = self.split_u32x8(a);
2642        let (b0, b1) = self.split_u32x8(b);
2643        self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1))
2644    }
2645    #[inline(always)]
2646    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2647        let (a0, a1) = self.split_u32x8(a);
2648        let (b0, b1) = self.split_u32x8(b);
2649        self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1))
2650    }
2651    #[inline(always)]
2652    fn shr_u32x8(self, a: u32x8<Self>, b: u32) -> u32x8<Self> {
2653        let (a0, a1) = self.split_u32x8(a);
2654        self.combine_u32x4(self.shr_u32x4(a0, b), self.shr_u32x4(a1, b))
2655    }
2656    #[inline(always)]
2657    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2658        let (a0, a1) = self.split_u32x8(a);
2659        let (b0, b1) = self.split_u32x8(b);
2660        self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1))
2661    }
2662    #[inline(always)]
2663    fn shl_u32x8(self, a: u32x8<Self>, b: u32) -> u32x8<Self> {
2664        let (a0, a1) = self.split_u32x8(a);
2665        self.combine_u32x4(self.shl_u32x4(a0, b), self.shl_u32x4(a1, b))
2666    }
2667    #[inline(always)]
2668    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2669        let (a0, a1) = self.split_u32x8(a);
2670        let (b0, b1) = self.split_u32x8(b);
2671        self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1))
2672    }
2673    #[inline(always)]
2674    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2675        let (a0, a1) = self.split_u32x8(a);
2676        let (b0, b1) = self.split_u32x8(b);
2677        self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1))
2678    }
2679    #[inline(always)]
2680    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2681        let (a0, a1) = self.split_u32x8(a);
2682        let (b0, b1) = self.split_u32x8(b);
2683        self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1))
2684    }
2685    #[inline(always)]
2686    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2687        let (a0, a1) = self.split_u32x8(a);
2688        let (b0, b1) = self.split_u32x8(b);
2689        self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1))
2690    }
2691    #[inline(always)]
2692    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2693        let (a0, a1) = self.split_u32x8(a);
2694        let (b0, b1) = self.split_u32x8(b);
2695        self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1))
2696    }
2697    #[inline(always)]
2698    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2699        let (a0, _) = self.split_u32x8(a);
2700        let (b0, _) = self.split_u32x8(b);
2701        self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0))
2702    }
2703    #[inline(always)]
2704    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2705        let (_, a1) = self.split_u32x8(a);
2706        let (_, b1) = self.split_u32x8(b);
2707        self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1))
2708    }
2709    #[inline(always)]
2710    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2711        let (a0, a1) = self.split_u32x8(a);
2712        let (b0, b1) = self.split_u32x8(b);
2713        self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1))
2714    }
2715    #[inline(always)]
2716    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2717        let (a0, a1) = self.split_u32x8(a);
2718        let (b0, b1) = self.split_u32x8(b);
2719        self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1))
2720    }
2721    #[inline(always)]
2722    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
2723        let (a0, a1) = self.split_mask32x8(a);
2724        let (b0, b1) = self.split_u32x8(b);
2725        let (c0, c1) = self.split_u32x8(c);
2726        self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1))
2727    }
2728    #[inline(always)]
2729    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2730        let (a0, a1) = self.split_u32x8(a);
2731        let (b0, b1) = self.split_u32x8(b);
2732        self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1))
2733    }
2734    #[inline(always)]
2735    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2736        let (a0, a1) = self.split_u32x8(a);
2737        let (b0, b1) = self.split_u32x8(b);
2738        self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1))
2739    }
2740    #[inline(always)]
2741    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
2742        let mut result = [0; 16usize];
2743        result[0..8usize].copy_from_slice(&a.val);
2744        result[8usize..16usize].copy_from_slice(&b.val);
2745        result.simd_into(self)
2746    }
2747    #[inline(always)]
2748    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
2749        let mut b0 = [0; 4usize];
2750        let mut b1 = [0; 4usize];
2751        b0.copy_from_slice(&a.val[0..4usize]);
2752        b1.copy_from_slice(&a.val[4usize..8usize]);
2753        (b0.simd_into(self), b1.simd_into(self))
2754    }
2755    #[inline(always)]
2756    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
2757        let (a0, a1) = self.split_u32x8(a);
2758        self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1))
2759    }
2760    #[inline(always)]
2761    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
2762        let (a0, a1) = self.split_u32x8(a);
2763        self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1))
2764    }
2765    #[inline(always)]
2766    fn splat_mask32x8(self, a: i32) -> mask32x8<Self> {
2767        let half = self.splat_mask32x4(a);
2768        self.combine_mask32x4(half, half)
2769    }
2770    #[inline(always)]
2771    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
2772        let (a0, a1) = self.split_mask32x8(a);
2773        self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1))
2774    }
2775    #[inline(always)]
2776    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2777        let (a0, a1) = self.split_mask32x8(a);
2778        let (b0, b1) = self.split_mask32x8(b);
2779        self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1))
2780    }
2781    #[inline(always)]
2782    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2783        let (a0, a1) = self.split_mask32x8(a);
2784        let (b0, b1) = self.split_mask32x8(b);
2785        self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1))
2786    }
2787    #[inline(always)]
2788    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2789        let (a0, a1) = self.split_mask32x8(a);
2790        let (b0, b1) = self.split_mask32x8(b);
2791        self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1))
2792    }
2793    #[inline(always)]
2794    fn select_mask32x8(
2795        self,
2796        a: mask32x8<Self>,
2797        b: mask32x8<Self>,
2798        c: mask32x8<Self>,
2799    ) -> mask32x8<Self> {
2800        let (a0, a1) = self.split_mask32x8(a);
2801        let (b0, b1) = self.split_mask32x8(b);
2802        let (c0, c1) = self.split_mask32x8(c);
2803        self.combine_mask32x4(
2804            self.select_mask32x4(a0, b0, c0),
2805            self.select_mask32x4(a1, b1, c1),
2806        )
2807    }
2808    #[inline(always)]
2809    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2810        let (a0, a1) = self.split_mask32x8(a);
2811        let (b0, b1) = self.split_mask32x8(b);
2812        self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1))
2813    }
2814    #[inline(always)]
2815    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
2816        let mut result = [0; 16usize];
2817        result[0..8usize].copy_from_slice(&a.val);
2818        result[8usize..16usize].copy_from_slice(&b.val);
2819        result.simd_into(self)
2820    }
2821    #[inline(always)]
2822    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
2823        let mut b0 = [0; 4usize];
2824        let mut b1 = [0; 4usize];
2825        b0.copy_from_slice(&a.val[0..4usize]);
2826        b1.copy_from_slice(&a.val[4usize..8usize]);
2827        (b0.simd_into(self), b1.simd_into(self))
2828    }
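    // 256-bit f64 (f64x4) operations follow the same pattern, lowering to pairs of
    // 128-bit f64x2 operations via split_f64x4 / combine_f64x2.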
2829    #[inline(always)]
2830    fn splat_f64x4(self, a: f64) -> f64x4<Self> {
2831        let half = self.splat_f64x2(a);
2832        self.combine_f64x2(half, half)
2833    }
2834    #[inline(always)]
2835    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2836        let (a0, a1) = self.split_f64x4(a);
2837        self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1))
2838    }
2839    #[inline(always)]
2840    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2841        let (a0, a1) = self.split_f64x4(a);
2842        self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1))
2843    }
2844    #[inline(always)]
2845    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2846        let (a0, a1) = self.split_f64x4(a);
2847        self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
2848    }
2849    #[inline(always)]
2850    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2851        let (a0, a1) = self.split_f64x4(a);
2852        let (b0, b1) = self.split_f64x4(b);
2853        self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1))
2854    }
2855    #[inline(always)]
2856    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2857        let (a0, a1) = self.split_f64x4(a);
2858        let (b0, b1) = self.split_f64x4(b);
2859        self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1))
2860    }
2861    #[inline(always)]
2862    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2863        let (a0, a1) = self.split_f64x4(a);
2864        let (b0, b1) = self.split_f64x4(b);
2865        self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1))
2866    }
2867    #[inline(always)]
2868    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2869        let (a0, a1) = self.split_f64x4(a);
2870        let (b0, b1) = self.split_f64x4(b);
2871        self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1))
2872    }
2873    #[inline(always)]
2874    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2875        let (a0, a1) = self.split_f64x4(a);
2876        let (b0, b1) = self.split_f64x4(b);
2877        self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1))
2878    }
2879    #[inline(always)]
2880    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2881        let (a0, a1) = self.split_f64x4(a);
2882        let (b0, b1) = self.split_f64x4(b);
2883        self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1))
2884    }
2885    #[inline(always)]
2886    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2887        let (a0, a1) = self.split_f64x4(a);
2888        let (b0, b1) = self.split_f64x4(b);
2889        self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1))
2890    }
2891    #[inline(always)]
2892    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2893        let (a0, a1) = self.split_f64x4(a);
2894        let (b0, b1) = self.split_f64x4(b);
2895        self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1))
2896    }
2897    #[inline(always)]
2898    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2899        let (a0, a1) = self.split_f64x4(a);
2900        let (b0, b1) = self.split_f64x4(b);
2901        self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1))
2902    }
2903    #[inline(always)]
2904    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2905        let (a0, a1) = self.split_f64x4(a);
2906        let (b0, b1) = self.split_f64x4(b);
2907        self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1))
2908    }
2909    #[inline(always)]
2910    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2911        let (a0, _) = self.split_f64x4(a);
2912        let (b0, _) = self.split_f64x4(b);
2913        self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0))
2914    }
2915    #[inline(always)]
2916    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2917        let (_, a1) = self.split_f64x4(a);
2918        let (_, b1) = self.split_f64x4(b);
2919        self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1))
2920    }
2921    #[inline(always)]
2922    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2923        let (a0, a1) = self.split_f64x4(a);
2924        let (b0, b1) = self.split_f64x4(b);
2925        self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1))
2926    }
2927    #[inline(always)]
2928    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2929        let (a0, a1) = self.split_f64x4(a);
2930        let (b0, b1) = self.split_f64x4(b);
2931        self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1))
2932    }
2933    #[inline(always)]
2934    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2935        let (a0, a1) = self.split_f64x4(a);
2936        let (b0, b1) = self.split_f64x4(b);
2937        self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1))
2938    }
2939    #[inline(always)]
2940    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2941        let (a0, a1) = self.split_f64x4(a);
2942        let (b0, b1) = self.split_f64x4(b);
2943        self.combine_f64x2(
2944            self.max_precise_f64x2(a0, b0),
2945            self.max_precise_f64x2(a1, b1),
2946        )
2947    }
2948    #[inline(always)]
2949    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2950        let (a0, a1) = self.split_f64x4(a);
2951        let (b0, b1) = self.split_f64x4(b);
2952        self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1))
2953    }
2954    #[inline(always)]
2955    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2956        let (a0, a1) = self.split_f64x4(a);
2957        let (b0, b1) = self.split_f64x4(b);
2958        self.combine_f64x2(
2959            self.min_precise_f64x2(a0, b0),
2960            self.min_precise_f64x2(a1, b1),
2961        )
2962    }
2963    #[inline(always)]
2964    fn madd_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
2965        let (a0, a1) = self.split_f64x4(a);
2966        let (b0, b1) = self.split_f64x4(b);
2967        let (c0, c1) = self.split_f64x4(c);
2968        self.combine_f64x2(self.madd_f64x2(a0, b0, c0), self.madd_f64x2(a1, b1, c1))
2969    }
2970    #[inline(always)]
2971    fn msub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
2972        let (a0, a1) = self.split_f64x4(a);
2973        let (b0, b1) = self.split_f64x4(b);
2974        let (c0, c1) = self.split_f64x4(c);
2975        self.combine_f64x2(self.msub_f64x2(a0, b0, c0), self.msub_f64x2(a1, b1, c1))
2976    }
2977    #[inline(always)]
2978    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2979        let (a0, a1) = self.split_f64x4(a);
2980        self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
2981    }
2982    #[inline(always)]
2983    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2984        let (a0, a1) = self.split_f64x4(a);
2985        self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1))
2986    }
2987    #[inline(always)]
2988    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2989        let (a0, a1) = self.split_f64x4(a);
2990        self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1))
2991    }
2992    #[inline(always)]
2993    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
2994        let (a0, a1) = self.split_mask64x4(a);
2995        let (b0, b1) = self.split_f64x4(b);
2996        let (c0, c1) = self.split_f64x4(c);
2997        self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1))
2998    }
2999    #[inline(always)]
3000    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
3001        let mut result = [0.0; 8usize];
3002        result[0..4usize].copy_from_slice(&a.val);
3003        result[4usize..8usize].copy_from_slice(&b.val);
3004        result.simd_into(self)
3005    }
3006    #[inline(always)]
3007    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
3008        let mut b0 = [0.0; 2usize];
3009        let mut b1 = [0.0; 2usize];
3010        b0.copy_from_slice(&a.val[0..2usize]);
3011        b1.copy_from_slice(&a.val[2usize..4usize]);
3012        (b0.simd_into(self), b1.simd_into(self))
3013    }
3014    #[inline(always)]
3015    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
3016        let (a0, a1) = self.split_f64x4(a);
3017        self.combine_f32x4(
3018            self.reinterpret_f32_f64x2(a0),
3019            self.reinterpret_f32_f64x2(a1),
3020        )
3021    }
3022    #[inline(always)]
3023    fn splat_mask64x4(self, a: i64) -> mask64x4<Self> {
3024        let half = self.splat_mask64x2(a);
3025        self.combine_mask64x2(half, half)
3026    }
3027    #[inline(always)]
3028    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
3029        let (a0, a1) = self.split_mask64x4(a);
3030        self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1))
3031    }
3032    #[inline(always)]
3033    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
3034        let (a0, a1) = self.split_mask64x4(a);
3035        let (b0, b1) = self.split_mask64x4(b);
3036        self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1))
3037    }
3038    #[inline(always)]
3039    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
3040        let (a0, a1) = self.split_mask64x4(a);
3041        let (b0, b1) = self.split_mask64x4(b);
3042        self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1))
3043    }
3044    #[inline(always)]
3045    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
3046        let (a0, a1) = self.split_mask64x4(a);
3047        let (b0, b1) = self.split_mask64x4(b);
3048        self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1))
3049    }
3050    #[inline(always)]
3051    fn select_mask64x4(
3052        self,
3053        a: mask64x4<Self>,
3054        b: mask64x4<Self>,
3055        c: mask64x4<Self>,
3056    ) -> mask64x4<Self> {
3057        let (a0, a1) = self.split_mask64x4(a);
3058        let (b0, b1) = self.split_mask64x4(b);
3059        let (c0, c1) = self.split_mask64x4(c);
3060        self.combine_mask64x2(
3061            self.select_mask64x2(a0, b0, c0),
3062            self.select_mask64x2(a1, b1, c1),
3063        )
3064    }
3065    #[inline(always)]
3066    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
3067        let (a0, a1) = self.split_mask64x4(a);
3068        let (b0, b1) = self.split_mask64x4(b);
3069        self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1))
3070    }
3071    #[inline(always)]
3072    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
3073        let mut result = [0; 8usize];
3074        result[0..4usize].copy_from_slice(&a.val);
3075        result[4usize..8usize].copy_from_slice(&b.val);
3076        result.simd_into(self)
3077    }
3078    #[inline(always)]
3079    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
3080        let mut b0 = [0; 2usize];
3081        let mut b1 = [0; 2usize];
3082        b0.copy_from_slice(&a.val[0..2usize]);
3083        b1.copy_from_slice(&a.val[2usize..4usize]);
3084        (b0.simd_into(self), b1.simd_into(self))
3085    }
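    // 512-bit (x16) float operations recurse one level further: each f32x16 op splits
    // into two f32x8 halves, and those x8 ops in turn decompose into the native
    // 128-bit x4 ops above.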
3086    #[inline(always)]
3087    fn splat_f32x16(self, a: f32) -> f32x16<Self> {
3088        let half = self.splat_f32x8(a);
3089        self.combine_f32x8(half, half)
3090    }
3091    #[inline(always)]
3092    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3093        let (a0, a1) = self.split_f32x16(a);
3094        self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
3095    }
3096    #[inline(always)]
3097    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3098        let (a0, a1) = self.split_f32x16(a);
3099        self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
3100    }
3101    #[inline(always)]
3102    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3103        let (a0, a1) = self.split_f32x16(a);
3104        self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
3105    }
3106    #[inline(always)]
3107    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3108        let (a0, a1) = self.split_f32x16(a);
3109        let (b0, b1) = self.split_f32x16(b);
3110        self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
3111    }
3112    #[inline(always)]
3113    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3114        let (a0, a1) = self.split_f32x16(a);
3115        let (b0, b1) = self.split_f32x16(b);
3116        self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
3117    }
3118    #[inline(always)]
3119    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3120        let (a0, a1) = self.split_f32x16(a);
3121        let (b0, b1) = self.split_f32x16(b);
3122        self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
3123    }
3124    #[inline(always)]
3125    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3126        let (a0, a1) = self.split_f32x16(a);
3127        let (b0, b1) = self.split_f32x16(b);
3128        self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
3129    }
3130    #[inline(always)]
3131    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3132        let (a0, a1) = self.split_f32x16(a);
3133        let (b0, b1) = self.split_f32x16(b);
3134        self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
3135    }
3136    #[inline(always)]
3137    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
3138        let (a0, a1) = self.split_f32x16(a);
3139        let (b0, b1) = self.split_f32x16(b);
3140        self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
3141    }
3142    #[inline(always)]
3143    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
3144        let (a0, a1) = self.split_f32x16(a);
3145        let (b0, b1) = self.split_f32x16(b);
3146        self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
3147    }
3148    #[inline(always)]
3149    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
3150        let (a0, a1) = self.split_f32x16(a);
3151        let (b0, b1) = self.split_f32x16(b);
3152        self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
3153    }
3154    #[inline(always)]
3155    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
3156        let (a0, a1) = self.split_f32x16(a);
3157        let (b0, b1) = self.split_f32x16(b);
3158        self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
3159    }
3160    #[inline(always)]
3161    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
3162        let (a0, a1) = self.split_f32x16(a);
3163        let (b0, b1) = self.split_f32x16(b);
3164        self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
3165    }
3166    #[inline(always)]
3167    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3168        let (a0, _) = self.split_f32x16(a);
3169        let (b0, _) = self.split_f32x16(b);
3170        self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
3171    }
3172    #[inline(always)]
3173    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3174        let (_, a1) = self.split_f32x16(a);
3175        let (_, b1) = self.split_f32x16(b);
3176        self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
3177    }
3178    #[inline(always)]
3179    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3180        let (a0, a1) = self.split_f32x16(a);
3181        let (b0, b1) = self.split_f32x16(b);
3182        self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
3183    }
3184    #[inline(always)]
3185    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3186        let (a0, a1) = self.split_f32x16(a);
3187        let (b0, b1) = self.split_f32x16(b);
3188        self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
3189    }
3190    #[inline(always)]
3191    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3192        let (a0, a1) = self.split_f32x16(a);
3193        let (b0, b1) = self.split_f32x16(b);
3194        self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
3195    }
3196    #[inline(always)]
3197    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3198        let (a0, a1) = self.split_f32x16(a);
3199        let (b0, b1) = self.split_f32x16(b);
3200        self.combine_f32x8(
3201            self.max_precise_f32x8(a0, b0),
3202            self.max_precise_f32x8(a1, b1),
3203        )
3204    }
3205    #[inline(always)]
3206    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3207        let (a0, a1) = self.split_f32x16(a);
3208        let (b0, b1) = self.split_f32x16(b);
3209        self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
3210    }
3211    #[inline(always)]
3212    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3213        let (a0, a1) = self.split_f32x16(a);
3214        let (b0, b1) = self.split_f32x16(b);
3215        self.combine_f32x8(
3216            self.min_precise_f32x8(a0, b0),
3217            self.min_precise_f32x8(a1, b1),
3218        )
3219    }
3220    #[inline(always)]
3221    fn madd_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
3222        let (a0, a1) = self.split_f32x16(a);
3223        let (b0, b1) = self.split_f32x16(b);
3224        let (c0, c1) = self.split_f32x16(c);
3225        self.combine_f32x8(self.madd_f32x8(a0, b0, c0), self.madd_f32x8(a1, b1, c1))
3226    }
3227    #[inline(always)]
3228    fn msub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
3229        let (a0, a1) = self.split_f32x16(a);
3230        let (b0, b1) = self.split_f32x16(b);
3231        let (c0, c1) = self.split_f32x16(c);
3232        self.combine_f32x8(self.msub_f32x8(a0, b0, c0), self.msub_f32x8(a1, b1, c1))
3233    }
3234    #[inline(always)]
3235    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3236        let (a0, a1) = self.split_f32x16(a);
3237        self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
3238    }
3239    #[inline(always)]
3240    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3241        let (a0, a1) = self.split_f32x16(a);
3242        self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
3243    }
3244    #[inline(always)]
3245    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3246        let (a0, a1) = self.split_f32x16(a);
3247        self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
3248    }
3249    #[inline(always)]
3250    fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
3251        let (a0, a1) = self.split_mask32x16(a);
3252        let (b0, b1) = self.split_f32x16(b);
3253        let (c0, c1) = self.split_f32x16(c);
3254        self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
3255    }
3256    #[inline(always)]
3257    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
3258        let mut b0 = [0.0; 8usize];
3259        let mut b1 = [0.0; 8usize];
3260        b0.copy_from_slice(&a.val[0..8usize]);
3261        b1.copy_from_slice(&a.val[8usize..16usize]);
3262        (b0.simd_into(self), b1.simd_into(self))
3263    }
3264    #[inline(always)]
3265    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
3266        let (a0, a1) = self.split_f32x16(a);
3267        self.combine_f64x4(
3268            self.reinterpret_f64_f32x8(a0),
3269            self.reinterpret_f64_f32x8(a1),
3270        )
3271    }
3272    #[inline(always)]
3273    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
3274        let (a0, a1) = self.split_f32x16(a);
3275        self.combine_i32x8(
3276            self.reinterpret_i32_f32x8(a0),
3277            self.reinterpret_i32_f32x8(a1),
3278        )
3279    }
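    // The interleaved 128-bit load/store helpers are not specialized for SSE4.2; they
    // delegate to the portable `Fallback` implementation and convert the result back
    // into this level's vector types.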
3280    #[inline(always)]
3281    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
3282        crate::Fallback::new()
3283            .load_interleaved_128_f32x16(src)
3284            .val
3285            .simd_into(self)
3286    }
3287    #[inline(always)]
3288    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
3289        let fb = crate::Fallback::new();
3290        fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest);
3291    }
3292    #[inline(always)]
3293    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
3294        let (a0, a1) = self.split_f32x16(a);
3295        self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
3296    }
3297    #[inline(always)]
3298    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
3299        let (a0, a1) = self.split_f32x16(a);
3300        self.combine_u32x8(
3301            self.reinterpret_u32_f32x8(a0),
3302            self.reinterpret_u32_f32x8(a1),
3303        )
3304    }
3305    #[inline(always)]
3306    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
3307        let (a0, a1) = self.split_f32x16(a);
3308        self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
3309    }
3310    #[inline(always)]
3311    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
3312        let (a0, a1) = self.split_f32x16(a);
3313        self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
3314    }
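    // The 512-bit integer and mask types (i8x64, u8x64, i16x32, ...) are handled the
    // same way: split into two 256-bit halves, which themselves decompose into
    // 128-bit SSE operations.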
3315    #[inline(always)]
3316    fn splat_i8x64(self, a: i8) -> i8x64<Self> {
3317        let half = self.splat_i8x32(a);
3318        self.combine_i8x32(half, half)
3319    }
3320    #[inline(always)]
3321    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
3322        let (a0, a1) = self.split_i8x64(a);
3323        self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
3324    }
3325    #[inline(always)]
3326    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3327        let (a0, a1) = self.split_i8x64(a);
3328        let (b0, b1) = self.split_i8x64(b);
3329        self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
3330    }
3331    #[inline(always)]
3332    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3333        let (a0, a1) = self.split_i8x64(a);
3334        let (b0, b1) = self.split_i8x64(b);
3335        self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
3336    }
3337    #[inline(always)]
3338    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3339        let (a0, a1) = self.split_i8x64(a);
3340        let (b0, b1) = self.split_i8x64(b);
3341        self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
3342    }
3343    #[inline(always)]
3344    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3345        let (a0, a1) = self.split_i8x64(a);
3346        let (b0, b1) = self.split_i8x64(b);
3347        self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
3348    }
3349    #[inline(always)]
3350    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3351        let (a0, a1) = self.split_i8x64(a);
3352        let (b0, b1) = self.split_i8x64(b);
3353        self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
3354    }
3355    #[inline(always)]
3356    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3357        let (a0, a1) = self.split_i8x64(a);
3358        let (b0, b1) = self.split_i8x64(b);
3359        self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
3360    }
3361    #[inline(always)]
3362    fn shr_i8x64(self, a: i8x64<Self>, b: u32) -> i8x64<Self> {
3363        let (a0, a1) = self.split_i8x64(a);
3364        self.combine_i8x32(self.shr_i8x32(a0, b), self.shr_i8x32(a1, b))
3365    }
3366    #[inline(always)]
3367    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3368        let (a0, a1) = self.split_i8x64(a);
3369        let (b0, b1) = self.split_i8x64(b);
3370        self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
3371    }
3372    #[inline(always)]
3373    fn shl_i8x64(self, a: i8x64<Self>, b: u32) -> i8x64<Self> {
3374        let (a0, a1) = self.split_i8x64(a);
3375        self.combine_i8x32(self.shl_i8x32(a0, b), self.shl_i8x32(a1, b))
3376    }
3377    #[inline(always)]
3378    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3379        let (a0, a1) = self.split_i8x64(a);
3380        let (b0, b1) = self.split_i8x64(b);
3381        self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
3382    }
3383    #[inline(always)]
3384    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3385        let (a0, a1) = self.split_i8x64(a);
3386        let (b0, b1) = self.split_i8x64(b);
3387        self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
3388    }
3389    #[inline(always)]
3390    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3391        let (a0, a1) = self.split_i8x64(a);
3392        let (b0, b1) = self.split_i8x64(b);
3393        self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
3394    }
3395    #[inline(always)]
3396    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3397        let (a0, a1) = self.split_i8x64(a);
3398        let (b0, b1) = self.split_i8x64(b);
3399        self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
3400    }
3401    #[inline(always)]
3402    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3403        let (a0, a1) = self.split_i8x64(a);
3404        let (b0, b1) = self.split_i8x64(b);
3405        self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
3406    }
3407    #[inline(always)]
3408    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3409        let (a0, _) = self.split_i8x64(a);
3410        let (b0, _) = self.split_i8x64(b);
3411        self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
3412    }
3413    #[inline(always)]
3414    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3415        let (_, a1) = self.split_i8x64(a);
3416        let (_, b1) = self.split_i8x64(b);
3417        self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
3418    }
3419    #[inline(always)]
3420    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3421        let (a0, a1) = self.split_i8x64(a);
3422        let (b0, b1) = self.split_i8x64(b);
3423        self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
3424    }
3425    #[inline(always)]
3426    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3427        let (a0, a1) = self.split_i8x64(a);
3428        let (b0, b1) = self.split_i8x64(b);
3429        self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
3430    }
3431    #[inline(always)]
3432    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
3433        let (a0, a1) = self.split_mask8x64(a);
3434        let (b0, b1) = self.split_i8x64(b);
3435        let (c0, c1) = self.split_i8x64(c);
3436        self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
3437    }
3438    #[inline(always)]
3439    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3440        let (a0, a1) = self.split_i8x64(a);
3441        let (b0, b1) = self.split_i8x64(b);
3442        self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
3443    }
3444    #[inline(always)]
3445    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3446        let (a0, a1) = self.split_i8x64(a);
3447        let (b0, b1) = self.split_i8x64(b);
3448        self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
3449    }
3450    #[inline(always)]
3451    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
3452        let mut b0 = [0; 32usize];
3453        let mut b1 = [0; 32usize];
3454        b0.copy_from_slice(&a.val[0..32usize]);
3455        b1.copy_from_slice(&a.val[32usize..64usize]);
3456        (b0.simd_into(self), b1.simd_into(self))
3457    }
3458    #[inline(always)]
3459    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
3460        let (a0, a1) = self.split_i8x64(a);
3461        self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
3462    }
3463    #[inline(always)]
3464    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
3465        let (a0, a1) = self.split_i8x64(a);
3466        self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
3467    }
3468    #[inline(always)]
3469    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
3470        let (a0, a1) = self.split_i8x64(a);
3471        self.combine_u32x8(
3472            self.reinterpret_u32_i8x32(a0),
3473            self.reinterpret_u32_i8x32(a1),
3474        )
3475    }
3476    #[inline(always)]
3477    fn splat_u8x64(self, a: u8) -> u8x64<Self> {
3478        let half = self.splat_u8x32(a);
3479        self.combine_u8x32(half, half)
3480    }
3481    #[inline(always)]
3482    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
3483        let (a0, a1) = self.split_u8x64(a);
3484        self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
3485    }
3486    #[inline(always)]
3487    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3488        let (a0, a1) = self.split_u8x64(a);
3489        let (b0, b1) = self.split_u8x64(b);
3490        self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
3491    }
3492    #[inline(always)]
3493    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3494        let (a0, a1) = self.split_u8x64(a);
3495        let (b0, b1) = self.split_u8x64(b);
3496        self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
3497    }
3498    #[inline(always)]
3499    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3500        let (a0, a1) = self.split_u8x64(a);
3501        let (b0, b1) = self.split_u8x64(b);
3502        self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
3503    }
3504    #[inline(always)]
3505    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3506        let (a0, a1) = self.split_u8x64(a);
3507        let (b0, b1) = self.split_u8x64(b);
3508        self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
3509    }
3510    #[inline(always)]
3511    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3512        let (a0, a1) = self.split_u8x64(a);
3513        let (b0, b1) = self.split_u8x64(b);
3514        self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
3515    }
3516    #[inline(always)]
3517    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3518        let (a0, a1) = self.split_u8x64(a);
3519        let (b0, b1) = self.split_u8x64(b);
3520        self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
3521    }
3522    #[inline(always)]
3523    fn shr_u8x64(self, a: u8x64<Self>, b: u32) -> u8x64<Self> {
3524        let (a0, a1) = self.split_u8x64(a);
3525        self.combine_u8x32(self.shr_u8x32(a0, b), self.shr_u8x32(a1, b))
3526    }
3527    #[inline(always)]
3528    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3529        let (a0, a1) = self.split_u8x64(a);
3530        let (b0, b1) = self.split_u8x64(b);
3531        self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
3532    }
3533    #[inline(always)]
3534    fn shl_u8x64(self, a: u8x64<Self>, b: u32) -> u8x64<Self> {
3535        let (a0, a1) = self.split_u8x64(a);
3536        self.combine_u8x32(self.shl_u8x32(a0, b), self.shl_u8x32(a1, b))
3537    }
3538    #[inline(always)]
3539    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3540        let (a0, a1) = self.split_u8x64(a);
3541        let (b0, b1) = self.split_u8x64(b);
3542        self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
3543    }
3544    #[inline(always)]
3545    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3546        let (a0, a1) = self.split_u8x64(a);
3547        let (b0, b1) = self.split_u8x64(b);
3548        self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
3549    }
3550    #[inline(always)]
3551    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3552        let (a0, a1) = self.split_u8x64(a);
3553        let (b0, b1) = self.split_u8x64(b);
3554        self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
3555    }
3556    #[inline(always)]
3557    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3558        let (a0, a1) = self.split_u8x64(a);
3559        let (b0, b1) = self.split_u8x64(b);
3560        self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
3561    }
3562    #[inline(always)]
3563    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3564        let (a0, a1) = self.split_u8x64(a);
3565        let (b0, b1) = self.split_u8x64(b);
3566        self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
3567    }
3568    #[inline(always)]
3569    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3570        let (a0, _) = self.split_u8x64(a);
3571        let (b0, _) = self.split_u8x64(b);
3572        self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
3573    }
3574    #[inline(always)]
3575    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3576        let (_, a1) = self.split_u8x64(a);
3577        let (_, b1) = self.split_u8x64(b);
3578        self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
3579    }
3580    #[inline(always)]
3581    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3582        let (a0, a1) = self.split_u8x64(a);
3583        let (b0, b1) = self.split_u8x64(b);
3584        self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
3585    }
3586    #[inline(always)]
3587    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3588        let (a0, a1) = self.split_u8x64(a);
3589        let (b0, b1) = self.split_u8x64(b);
3590        self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
3591    }
3592    #[inline(always)]
3593    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
3594        let (a0, a1) = self.split_mask8x64(a);
3595        let (b0, b1) = self.split_u8x64(b);
3596        let (c0, c1) = self.split_u8x64(c);
3597        self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
3598    }
3599    #[inline(always)]
3600    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3601        let (a0, a1) = self.split_u8x64(a);
3602        let (b0, b1) = self.split_u8x64(b);
3603        self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
3604    }
3605    #[inline(always)]
3606    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3607        let (a0, a1) = self.split_u8x64(a);
3608        let (b0, b1) = self.split_u8x64(b);
3609        self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
3610    }
3611    #[inline(always)]
3612    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
3613        let mut b0 = [0; 32usize];
3614        let mut b1 = [0; 32usize];
3615        b0.copy_from_slice(&a.val[0..32usize]);
3616        b1.copy_from_slice(&a.val[32usize..64usize]);
3617        (b0.simd_into(self), b1.simd_into(self))
3618    }
3619    #[inline(always)]
3620    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
3621        crate::Fallback::new()
3622            .load_interleaved_128_u8x64(src)
3623            .val
3624            .simd_into(self)
3625    }
3626    #[inline(always)]
3627    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
3628        let fb = crate::Fallback::new();
3629        fb.store_interleaved_128_u8x64(a.val.simd_into(fb), dest);
3630    }
3631    #[inline(always)]
3632    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
3633        let (a0, a1) = self.split_u8x64(a);
3634        self.combine_u32x8(
3635            self.reinterpret_u32_u8x32(a0),
3636            self.reinterpret_u32_u8x32(a1),
3637        )
3638    }
3639    #[inline(always)]
3640    fn splat_mask8x64(self, a: i8) -> mask8x64<Self> {
3641        let half = self.splat_mask8x32(a);
3642        self.combine_mask8x32(half, half)
3643    }
3644    #[inline(always)]
3645    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
3646        let (a0, a1) = self.split_mask8x64(a);
3647        self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
3648    }
3649    #[inline(always)]
3650    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3651        let (a0, a1) = self.split_mask8x64(a);
3652        let (b0, b1) = self.split_mask8x64(b);
3653        self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
3654    }
3655    #[inline(always)]
3656    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3657        let (a0, a1) = self.split_mask8x64(a);
3658        let (b0, b1) = self.split_mask8x64(b);
3659        self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
3660    }
3661    #[inline(always)]
3662    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3663        let (a0, a1) = self.split_mask8x64(a);
3664        let (b0, b1) = self.split_mask8x64(b);
3665        self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
3666    }
3667    #[inline(always)]
3668    fn select_mask8x64(
3669        self,
3670        a: mask8x64<Self>,
3671        b: mask8x64<Self>,
3672        c: mask8x64<Self>,
3673    ) -> mask8x64<Self> {
3674        let (a0, a1) = self.split_mask8x64(a);
3675        let (b0, b1) = self.split_mask8x64(b);
3676        let (c0, c1) = self.split_mask8x64(c);
3677        self.combine_mask8x32(
3678            self.select_mask8x32(a0, b0, c0),
3679            self.select_mask8x32(a1, b1, c1),
3680        )
3681    }
3682    #[inline(always)]
3683    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3684        let (a0, a1) = self.split_mask8x64(a);
3685        let (b0, b1) = self.split_mask8x64(b);
3686        self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
3687    }
3688    #[inline(always)]
3689    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
3690        let mut b0 = [0; 32usize];
3691        let mut b1 = [0; 32usize];
3692        b0.copy_from_slice(&a.val[0..32usize]);
3693        b1.copy_from_slice(&a.val[32usize..64usize]);
3694        (b0.simd_into(self), b1.simd_into(self))
3695    }
3696    #[inline(always)]
3697    fn splat_i16x32(self, a: i16) -> i16x32<Self> {
3698        let half = self.splat_i16x16(a);
3699        self.combine_i16x16(half, half)
3700    }
3701    #[inline(always)]
3702    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
3703        let (a0, a1) = self.split_i16x32(a);
3704        self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
3705    }
3706    #[inline(always)]
3707    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3708        let (a0, a1) = self.split_i16x32(a);
3709        let (b0, b1) = self.split_i16x32(b);
3710        self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
3711    }
3712    #[inline(always)]
3713    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3714        let (a0, a1) = self.split_i16x32(a);
3715        let (b0, b1) = self.split_i16x32(b);
3716        self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
3717    }
3718    #[inline(always)]
3719    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3720        let (a0, a1) = self.split_i16x32(a);
3721        let (b0, b1) = self.split_i16x32(b);
3722        self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
3723    }
3724    #[inline(always)]
3725    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3726        let (a0, a1) = self.split_i16x32(a);
3727        let (b0, b1) = self.split_i16x32(b);
3728        self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
3729    }
3730    #[inline(always)]
3731    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3732        let (a0, a1) = self.split_i16x32(a);
3733        let (b0, b1) = self.split_i16x32(b);
3734        self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
3735    }
3736    #[inline(always)]
3737    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3738        let (a0, a1) = self.split_i16x32(a);
3739        let (b0, b1) = self.split_i16x32(b);
3740        self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
3741    }
3742    #[inline(always)]
3743    fn shr_i16x32(self, a: i16x32<Self>, b: u32) -> i16x32<Self> {
3744        let (a0, a1) = self.split_i16x32(a);
3745        self.combine_i16x16(self.shr_i16x16(a0, b), self.shr_i16x16(a1, b))
3746    }
3747    #[inline(always)]
3748    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3749        let (a0, a1) = self.split_i16x32(a);
3750        let (b0, b1) = self.split_i16x32(b);
3751        self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
3752    }
3753    #[inline(always)]
3754    fn shl_i16x32(self, a: i16x32<Self>, b: u32) -> i16x32<Self> {
3755        let (a0, a1) = self.split_i16x32(a);
3756        self.combine_i16x16(self.shl_i16x16(a0, b), self.shl_i16x16(a1, b))
3757    }
3758    #[inline(always)]
3759    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3760        let (a0, a1) = self.split_i16x32(a);
3761        let (b0, b1) = self.split_i16x32(b);
3762        self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
3763    }
3764    #[inline(always)]
3765    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3766        let (a0, a1) = self.split_i16x32(a);
3767        let (b0, b1) = self.split_i16x32(b);
3768        self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
3769    }
3770    #[inline(always)]
3771    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3772        let (a0, a1) = self.split_i16x32(a);
3773        let (b0, b1) = self.split_i16x32(b);
3774        self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
3775    }
3776    #[inline(always)]
3777    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3778        let (a0, a1) = self.split_i16x32(a);
3779        let (b0, b1) = self.split_i16x32(b);
3780        self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
3781    }
3782    #[inline(always)]
3783    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3784        let (a0, a1) = self.split_i16x32(a);
3785        let (b0, b1) = self.split_i16x32(b);
3786        self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
3787    }
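    // Zip and unzip never need both halves of both inputs: interleaving the low
    // halves of `a` and `b` already fills all 32 output lanes, so `zip_low` uses
    // only (a0, b0) and `zip_high` only (a1, b1). Likewise, `unzip_low` gathers
    // the even lanes of `a` followed by the even lanes of `b`, so each output
    // half depends on a single input.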
3788    #[inline(always)]
3789    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3790        let (a0, _) = self.split_i16x32(a);
3791        let (b0, _) = self.split_i16x32(b);
3792        self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
3793    }
3794    #[inline(always)]
3795    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3796        let (_, a1) = self.split_i16x32(a);
3797        let (_, b1) = self.split_i16x32(b);
3798        self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
3799    }
3800    #[inline(always)]
3801    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3802        let (a0, a1) = self.split_i16x32(a);
3803        let (b0, b1) = self.split_i16x32(b);
3804        self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
3805    }
3806    #[inline(always)]
3807    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3808        let (a0, a1) = self.split_i16x32(a);
3809        let (b0, b1) = self.split_i16x32(b);
3810        self.combine_i16x16(
3811            self.unzip_high_i16x16(a0, a1),
3812            self.unzip_high_i16x16(b0, b1),
3813        )
3814    }
3815    #[inline(always)]
3816    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
3817        let (a0, a1) = self.split_mask16x32(a);
3818        let (b0, b1) = self.split_i16x32(b);
3819        let (c0, c1) = self.split_i16x32(c);
3820        self.combine_i16x16(
3821            self.select_i16x16(a0, b0, c0),
3822            self.select_i16x16(a1, b1, c1),
3823        )
3824    }
3825    #[inline(always)]
3826    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3827        let (a0, a1) = self.split_i16x32(a);
3828        let (b0, b1) = self.split_i16x32(b);
3829        self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
3830    }
3831    #[inline(always)]
3832    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3833        let (a0, a1) = self.split_i16x32(a);
3834        let (b0, b1) = self.split_i16x32(b);
3835        self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
3836    }
3837    #[inline(always)]
3838    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
3839        let mut b0 = [0; 16usize];
3840        let mut b1 = [0; 16usize];
3841        b0.copy_from_slice(&a.val[0..16usize]);
3842        b1.copy_from_slice(&a.val[16usize..32usize]);
3843        (b0.simd_into(self), b1.simd_into(self))
3844    }
3845    #[inline(always)]
3846    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
3847        let (a0, a1) = self.split_i16x32(a);
3848        self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
3849    }
3850    #[inline(always)]
3851    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
3852        let (a0, a1) = self.split_i16x32(a);
3853        self.combine_u8x32(
3854            self.reinterpret_u8_i16x16(a0),
3855            self.reinterpret_u8_i16x16(a1),
3856        )
3857    }
3858    #[inline(always)]
3859    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
3860        let (a0, a1) = self.split_i16x32(a);
3861        self.combine_u32x8(
3862            self.reinterpret_u32_i16x16(a0),
3863            self.reinterpret_u32_i16x16(a1),
3864        )
3865    }
3866    #[inline(always)]
3867    fn splat_u16x32(self, a: u16) -> u16x32<Self> {
3868        let half = self.splat_u16x16(a);
3869        self.combine_u16x16(half, half)
3870    }
3871    #[inline(always)]
3872    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
3873        let (a0, a1) = self.split_u16x32(a);
3874        self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
3875    }
3876    #[inline(always)]
3877    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3878        let (a0, a1) = self.split_u16x32(a);
3879        let (b0, b1) = self.split_u16x32(b);
3880        self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
3881    }
3882    #[inline(always)]
3883    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3884        let (a0, a1) = self.split_u16x32(a);
3885        let (b0, b1) = self.split_u16x32(b);
3886        self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
3887    }
3888    #[inline(always)]
3889    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3890        let (a0, a1) = self.split_u16x32(a);
3891        let (b0, b1) = self.split_u16x32(b);
3892        self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
3893    }
3894    #[inline(always)]
3895    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3896        let (a0, a1) = self.split_u16x32(a);
3897        let (b0, b1) = self.split_u16x32(b);
3898        self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
3899    }
3900    #[inline(always)]
3901    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3902        let (a0, a1) = self.split_u16x32(a);
3903        let (b0, b1) = self.split_u16x32(b);
3904        self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
3905    }
3906    #[inline(always)]
3907    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3908        let (a0, a1) = self.split_u16x32(a);
3909        let (b0, b1) = self.split_u16x32(b);
3910        self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
3911    }
3912    #[inline(always)]
3913    fn shr_u16x32(self, a: u16x32<Self>, b: u32) -> u16x32<Self> {
3914        let (a0, a1) = self.split_u16x32(a);
3915        self.combine_u16x16(self.shr_u16x16(a0, b), self.shr_u16x16(a1, b))
3916    }
3917    #[inline(always)]
3918    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3919        let (a0, a1) = self.split_u16x32(a);
3920        let (b0, b1) = self.split_u16x32(b);
3921        self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
3922    }
3923    #[inline(always)]
3924    fn shl_u16x32(self, a: u16x32<Self>, b: u32) -> u16x32<Self> {
3925        let (a0, a1) = self.split_u16x32(a);
3926        self.combine_u16x16(self.shl_u16x16(a0, b), self.shl_u16x16(a1, b))
3927    }
3928    #[inline(always)]
3929    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3930        let (a0, a1) = self.split_u16x32(a);
3931        let (b0, b1) = self.split_u16x32(b);
3932        self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
3933    }
3934    #[inline(always)]
3935    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3936        let (a0, a1) = self.split_u16x32(a);
3937        let (b0, b1) = self.split_u16x32(b);
3938        self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
3939    }
3940    #[inline(always)]
3941    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3942        let (a0, a1) = self.split_u16x32(a);
3943        let (b0, b1) = self.split_u16x32(b);
3944        self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
3945    }
3946    #[inline(always)]
3947    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3948        let (a0, a1) = self.split_u16x32(a);
3949        let (b0, b1) = self.split_u16x32(b);
3950        self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
3951    }
3952    #[inline(always)]
3953    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3954        let (a0, a1) = self.split_u16x32(a);
3955        let (b0, b1) = self.split_u16x32(b);
3956        self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
3957    }
3958    #[inline(always)]
3959    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3960        let (a0, _) = self.split_u16x32(a);
3961        let (b0, _) = self.split_u16x32(b);
3962        self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
3963    }
3964    #[inline(always)]
3965    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3966        let (_, a1) = self.split_u16x32(a);
3967        let (_, b1) = self.split_u16x32(b);
3968        self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
3969    }
3970    #[inline(always)]
3971    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3972        let (a0, a1) = self.split_u16x32(a);
3973        let (b0, b1) = self.split_u16x32(b);
3974        self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
3975    }
3976    #[inline(always)]
3977    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3978        let (a0, a1) = self.split_u16x32(a);
3979        let (b0, b1) = self.split_u16x32(b);
3980        self.combine_u16x16(
3981            self.unzip_high_u16x16(a0, a1),
3982            self.unzip_high_u16x16(b0, b1),
3983        )
3984    }
3985    #[inline(always)]
3986    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
3987        let (a0, a1) = self.split_mask16x32(a);
3988        let (b0, b1) = self.split_u16x32(b);
3989        let (c0, c1) = self.split_u16x32(c);
3990        self.combine_u16x16(
3991            self.select_u16x16(a0, b0, c0),
3992            self.select_u16x16(a1, b1, c1),
3993        )
3994    }
3995    #[inline(always)]
3996    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3997        let (a0, a1) = self.split_u16x32(a);
3998        let (b0, b1) = self.split_u16x32(b);
3999        self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
4000    }
4001    #[inline(always)]
4002    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
4003        let (a0, a1) = self.split_u16x32(a);
4004        let (b0, b1) = self.split_u16x32(b);
4005        self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
4006    }
4007    #[inline(always)]
4008    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
4009        let mut b0 = [0; 16usize];
4010        let mut b1 = [0; 16usize];
4011        b0.copy_from_slice(&a.val[0..16usize]);
4012        b1.copy_from_slice(&a.val[16usize..32usize]);
4013        (b0.simd_into(self), b1.simd_into(self))
4014    }
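    // The interleaved 128-bit load and store for u16 lanes are routed through the
    // scalar `Fallback` implementation rather than hand-written SSE shuffles; the
    // result is converted back into this level's representation via `simd_into`.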
4015    #[inline(always)]
4016    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
4017        crate::Fallback::new()
4018            .load_interleaved_128_u16x32(src)
4019            .val
4020            .simd_into(self)
4021    }
4022    #[inline(always)]
4023    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) {
4024        let fb = crate::Fallback::new();
4025        fb.store_interleaved_128_u16x32(a.val.simd_into(fb), dest);
4026    }
4027    #[inline(always)]
4028    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
4029        let (a0, a1) = self.split_u16x32(a);
4030        self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1))
4031    }
4032    #[inline(always)]
4033    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
4034        let (a0, a1) = self.split_u16x32(a);
4035        self.combine_u8x32(
4036            self.reinterpret_u8_u16x16(a0),
4037            self.reinterpret_u8_u16x16(a1),
4038        )
4039    }
4040    #[inline(always)]
4041    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
4042        let (a0, a1) = self.split_u16x32(a);
4043        self.combine_u32x8(
4044            self.reinterpret_u32_u16x16(a0),
4045            self.reinterpret_u32_u16x16(a1),
4046        )
4047    }
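    // Mask vectors are carried as signed integer lanes (`splat_mask16x32` takes an
    // `i16`) and use the same split/combine emulation; only bitwise operations,
    // `select`, and lane-wise equality are provided for them.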
4048    #[inline(always)]
4049    fn splat_mask16x32(self, a: i16) -> mask16x32<Self> {
4050        let half = self.splat_mask16x16(a);
4051        self.combine_mask16x16(half, half)
4052    }
4053    #[inline(always)]
4054    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
4055        let (a0, a1) = self.split_mask16x32(a);
4056        self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
4057    }
4058    #[inline(always)]
4059    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
4060        let (a0, a1) = self.split_mask16x32(a);
4061        let (b0, b1) = self.split_mask16x32(b);
4062        self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
4063    }
4064    #[inline(always)]
4065    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
4066        let (a0, a1) = self.split_mask16x32(a);
4067        let (b0, b1) = self.split_mask16x32(b);
4068        self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
4069    }
4070    #[inline(always)]
4071    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
4072        let (a0, a1) = self.split_mask16x32(a);
4073        let (b0, b1) = self.split_mask16x32(b);
4074        self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
4075    }
4076    #[inline(always)]
4077    fn select_mask16x32(
4078        self,
4079        a: mask16x32<Self>,
4080        b: mask16x32<Self>,
4081        c: mask16x32<Self>,
4082    ) -> mask16x32<Self> {
4083        let (a0, a1) = self.split_mask16x32(a);
4084        let (b0, b1) = self.split_mask16x32(b);
4085        let (c0, c1) = self.split_mask16x32(c);
4086        self.combine_mask16x16(
4087            self.select_mask16x16(a0, b0, c0),
4088            self.select_mask16x16(a1, b1, c1),
4089        )
4090    }
4091    #[inline(always)]
4092    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
4093        let (a0, a1) = self.split_mask16x32(a);
4094        let (b0, b1) = self.split_mask16x32(b);
4095        self.combine_mask16x16(
4096            self.simd_eq_mask16x16(a0, b0),
4097            self.simd_eq_mask16x16(a1, b1),
4098        )
4099    }
4100    #[inline(always)]
4101    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
4102        let mut b0 = [0; 16usize];
4103        let mut b1 = [0; 16usize];
4104        b0.copy_from_slice(&a.val[0..16usize]);
4105        b1.copy_from_slice(&a.val[16usize..32usize]);
4106        (b0.simd_into(self), b1.simd_into(self))
4107    }
4108    #[inline(always)]
4109    fn splat_i32x16(self, a: i32) -> i32x16<Self> {
4110        let half = self.splat_i32x8(a);
4111        self.combine_i32x8(half, half)
4112    }
4113    #[inline(always)]
4114    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
4115        let (a0, a1) = self.split_i32x16(a);
4116        self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
4117    }
4118    #[inline(always)]
4119    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4120        let (a0, a1) = self.split_i32x16(a);
4121        let (b0, b1) = self.split_i32x16(b);
4122        self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
4123    }
4124    #[inline(always)]
4125    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4126        let (a0, a1) = self.split_i32x16(a);
4127        let (b0, b1) = self.split_i32x16(b);
4128        self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
4129    }
4130    #[inline(always)]
4131    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4132        let (a0, a1) = self.split_i32x16(a);
4133        let (b0, b1) = self.split_i32x16(b);
4134        self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
4135    }
4136    #[inline(always)]
4137    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4138        let (a0, a1) = self.split_i32x16(a);
4139        let (b0, b1) = self.split_i32x16(b);
4140        self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
4141    }
4142    #[inline(always)]
4143    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4144        let (a0, a1) = self.split_i32x16(a);
4145        let (b0, b1) = self.split_i32x16(b);
4146        self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
4147    }
4148    #[inline(always)]
4149    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4150        let (a0, a1) = self.split_i32x16(a);
4151        let (b0, b1) = self.split_i32x16(b);
4152        self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
4153    }
4154    #[inline(always)]
4155    fn shr_i32x16(self, a: i32x16<Self>, b: u32) -> i32x16<Self> {
4156        let (a0, a1) = self.split_i32x16(a);
4157        self.combine_i32x8(self.shr_i32x8(a0, b), self.shr_i32x8(a1, b))
4158    }
4159    #[inline(always)]
4160    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4161        let (a0, a1) = self.split_i32x16(a);
4162        let (b0, b1) = self.split_i32x16(b);
4163        self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1))
4164    }
4165    #[inline(always)]
4166    fn shl_i32x16(self, a: i32x16<Self>, b: u32) -> i32x16<Self> {
4167        let (a0, a1) = self.split_i32x16(a);
4168        self.combine_i32x8(self.shl_i32x8(a0, b), self.shl_i32x8(a1, b))
4169    }
4170    #[inline(always)]
4171    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
4172        let (a0, a1) = self.split_i32x16(a);
4173        let (b0, b1) = self.split_i32x16(b);
4174        self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
4175    }
4176    #[inline(always)]
4177    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
4178        let (a0, a1) = self.split_i32x16(a);
4179        let (b0, b1) = self.split_i32x16(b);
4180        self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
4181    }
4182    #[inline(always)]
4183    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
4184        let (a0, a1) = self.split_i32x16(a);
4185        let (b0, b1) = self.split_i32x16(b);
4186        self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
4187    }
4188    #[inline(always)]
4189    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
4190        let (a0, a1) = self.split_i32x16(a);
4191        let (b0, b1) = self.split_i32x16(b);
4192        self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
4193    }
4194    #[inline(always)]
4195    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
4196        let (a0, a1) = self.split_i32x16(a);
4197        let (b0, b1) = self.split_i32x16(b);
4198        self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
4199    }
4200    #[inline(always)]
4201    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4202        let (a0, _) = self.split_i32x16(a);
4203        let (b0, _) = self.split_i32x16(b);
4204        self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
4205    }
4206    #[inline(always)]
4207    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4208        let (_, a1) = self.split_i32x16(a);
4209        let (_, b1) = self.split_i32x16(b);
4210        self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
4211    }
4212    #[inline(always)]
4213    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4214        let (a0, a1) = self.split_i32x16(a);
4215        let (b0, b1) = self.split_i32x16(b);
4216        self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
4217    }
4218    #[inline(always)]
4219    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4220        let (a0, a1) = self.split_i32x16(a);
4221        let (b0, b1) = self.split_i32x16(b);
4222        self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
4223    }
4224    #[inline(always)]
4225    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
4226        let (a0, a1) = self.split_mask32x16(a);
4227        let (b0, b1) = self.split_i32x16(b);
4228        let (c0, c1) = self.split_i32x16(c);
4229        self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
4230    }
4231    #[inline(always)]
4232    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4233        let (a0, a1) = self.split_i32x16(a);
4234        let (b0, b1) = self.split_i32x16(b);
4235        self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
4236    }
4237    #[inline(always)]
4238    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4239        let (a0, a1) = self.split_i32x16(a);
4240        let (b0, b1) = self.split_i32x16(b);
4241        self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
4242    }
4243    #[inline(always)]
4244    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
4245        let mut b0 = [0; 8usize];
4246        let mut b1 = [0; 8usize];
4247        b0.copy_from_slice(&a.val[0..8usize]);
4248        b1.copy_from_slice(&a.val[8usize..16usize]);
4249        (b0.simd_into(self), b1.simd_into(self))
4250    }
4251    #[inline(always)]
4252    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
4253        let (a0, a1) = self.split_i32x16(a);
4254        self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1))
4255    }
4256    #[inline(always)]
4257    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
4258        let (a0, a1) = self.split_i32x16(a);
4259        self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
4260    }
4261    #[inline(always)]
4262    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
4263        let (a0, a1) = self.split_i32x16(a);
4264        self.combine_u32x8(
4265            self.reinterpret_u32_i32x8(a0),
4266            self.reinterpret_u32_i32x8(a1),
4267        )
4268    }
4269    #[inline(always)]
4270    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
4271        let (a0, a1) = self.split_i32x16(a);
4272        self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
4273    }
4274    #[inline(always)]
4275    fn splat_u32x16(self, a: u32) -> u32x16<Self> {
4276        let half = self.splat_u32x8(a);
4277        self.combine_u32x8(half, half)
4278    }
4279    #[inline(always)]
4280    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
4281        let (a0, a1) = self.split_u32x16(a);
4282        self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
4283    }
4284    #[inline(always)]
4285    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4286        let (a0, a1) = self.split_u32x16(a);
4287        let (b0, b1) = self.split_u32x16(b);
4288        self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
4289    }
4290    #[inline(always)]
4291    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4292        let (a0, a1) = self.split_u32x16(a);
4293        let (b0, b1) = self.split_u32x16(b);
4294        self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
4295    }
4296    #[inline(always)]
4297    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4298        let (a0, a1) = self.split_u32x16(a);
4299        let (b0, b1) = self.split_u32x16(b);
4300        self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
4301    }
4302    #[inline(always)]
4303    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4304        let (a0, a1) = self.split_u32x16(a);
4305        let (b0, b1) = self.split_u32x16(b);
4306        self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
4307    }
4308    #[inline(always)]
4309    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4310        let (a0, a1) = self.split_u32x16(a);
4311        let (b0, b1) = self.split_u32x16(b);
4312        self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
4313    }
4314    #[inline(always)]
4315    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4316        let (a0, a1) = self.split_u32x16(a);
4317        let (b0, b1) = self.split_u32x16(b);
4318        self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
4319    }
4320    #[inline(always)]
4321    fn shr_u32x16(self, a: u32x16<Self>, b: u32) -> u32x16<Self> {
4322        let (a0, a1) = self.split_u32x16(a);
4323        self.combine_u32x8(self.shr_u32x8(a0, b), self.shr_u32x8(a1, b))
4324    }
4325    #[inline(always)]
4326    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4327        let (a0, a1) = self.split_u32x16(a);
4328        let (b0, b1) = self.split_u32x16(b);
4329        self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
4330    }
4331    #[inline(always)]
4332    fn shl_u32x16(self, a: u32x16<Self>, b: u32) -> u32x16<Self> {
4333        let (a0, a1) = self.split_u32x16(a);
4334        self.combine_u32x8(self.shl_u32x8(a0, b), self.shl_u32x8(a1, b))
4335    }
4336    #[inline(always)]
4337    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
4338        let (a0, a1) = self.split_u32x16(a);
4339        let (b0, b1) = self.split_u32x16(b);
4340        self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
4341    }
4342    #[inline(always)]
4343    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
4344        let (a0, a1) = self.split_u32x16(a);
4345        let (b0, b1) = self.split_u32x16(b);
4346        self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
4347    }
4348    #[inline(always)]
4349    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
4350        let (a0, a1) = self.split_u32x16(a);
4351        let (b0, b1) = self.split_u32x16(b);
4352        self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
4353    }
4354    #[inline(always)]
4355    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
4356        let (a0, a1) = self.split_u32x16(a);
4357        let (b0, b1) = self.split_u32x16(b);
4358        self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
4359    }
4360    #[inline(always)]
4361    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
4362        let (a0, a1) = self.split_u32x16(a);
4363        let (b0, b1) = self.split_u32x16(b);
4364        self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
4365    }
4366    #[inline(always)]
4367    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4368        let (a0, _) = self.split_u32x16(a);
4369        let (b0, _) = self.split_u32x16(b);
4370        self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
4371    }
4372    #[inline(always)]
4373    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4374        let (_, a1) = self.split_u32x16(a);
4375        let (_, b1) = self.split_u32x16(b);
4376        self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
4377    }
4378    #[inline(always)]
4379    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4380        let (a0, a1) = self.split_u32x16(a);
4381        let (b0, b1) = self.split_u32x16(b);
4382        self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
4383    }
4384    #[inline(always)]
4385    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4386        let (a0, a1) = self.split_u32x16(a);
4387        let (b0, b1) = self.split_u32x16(b);
4388        self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
4389    }
4390    #[inline(always)]
4391    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
4392        let (a0, a1) = self.split_mask32x16(a);
4393        let (b0, b1) = self.split_u32x16(b);
4394        let (c0, c1) = self.split_u32x16(c);
4395        self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
4396    }
4397    #[inline(always)]
4398    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4399        let (a0, a1) = self.split_u32x16(a);
4400        let (b0, b1) = self.split_u32x16(b);
4401        self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
4402    }
4403    #[inline(always)]
4404    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4405        let (a0, a1) = self.split_u32x16(a);
4406        let (b0, b1) = self.split_u32x16(b);
4407        self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
4408    }
4409    #[inline(always)]
4410    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
4411        let mut b0 = [0; 8usize];
4412        let mut b1 = [0; 8usize];
4413        b0.copy_from_slice(&a.val[0..8usize]);
4414        b1.copy_from_slice(&a.val[8usize..16usize]);
4415        (b0.simd_into(self), b1.simd_into(self))
4416    }
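    // De-interleave four 128-bit groups of u32 lanes with a classic 4x4 transpose:
    // a round of 32-bit unpacks followed by 64-bit unpacks gathers element 0 of
    // every group into `out0`, element 1 into `out1`, and so on.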
4417    #[inline(always)]
4418    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
4419        unsafe {
4420            let v0 = _mm_loadu_si128(src.as_ptr().add(0) as *const __m128i);
4421            let v1 = _mm_loadu_si128(src.as_ptr().add(4) as *const __m128i);
4422            let v2 = _mm_loadu_si128(src.as_ptr().add(8) as *const __m128i);
4423            let v3 = _mm_loadu_si128(src.as_ptr().add(12) as *const __m128i);
4424            let tmp0 = _mm_unpacklo_epi32(v0, v1);
4425            let tmp1 = _mm_unpackhi_epi32(v0, v1);
4426            let tmp2 = _mm_unpacklo_epi32(v2, v3);
4427            let tmp3 = _mm_unpackhi_epi32(v2, v3);
4428            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
4429            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
4430            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
4431            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
4432            self.combine_u32x8(
4433                self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)),
4434                self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)),
4435            )
4436        }
4437    }
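    // Unlike the load above, the interleaved store falls back to the scalar
    // `Fallback` implementation.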
4438    #[inline(always)]
4439    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) {
4440        let fb = crate::Fallback::new();
4441        fb.store_interleaved_128_u32x16(a.val.simd_into(fb), dest);
4442    }
4443    #[inline(always)]
4444    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
4445        let (a0, a1) = self.split_u32x16(a);
4446        self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
4447    }
4448    #[inline(always)]
4449    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
4450        let (a0, a1) = self.split_u32x16(a);
4451        self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
4452    }
4453    #[inline(always)]
4454    fn splat_mask32x16(self, a: i32) -> mask32x16<Self> {
4455        let half = self.splat_mask32x8(a);
4456        self.combine_mask32x8(half, half)
4457    }
4458    #[inline(always)]
4459    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
4460        let (a0, a1) = self.split_mask32x16(a);
4461        self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
4462    }
4463    #[inline(always)]
4464    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
4465        let (a0, a1) = self.split_mask32x16(a);
4466        let (b0, b1) = self.split_mask32x16(b);
4467        self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
4468    }
4469    #[inline(always)]
4470    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
4471        let (a0, a1) = self.split_mask32x16(a);
4472        let (b0, b1) = self.split_mask32x16(b);
4473        self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
4474    }
4475    #[inline(always)]
4476    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
4477        let (a0, a1) = self.split_mask32x16(a);
4478        let (b0, b1) = self.split_mask32x16(b);
4479        self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
4480    }
4481    #[inline(always)]
4482    fn select_mask32x16(
4483        self,
4484        a: mask32x16<Self>,
4485        b: mask32x16<Self>,
4486        c: mask32x16<Self>,
4487    ) -> mask32x16<Self> {
4488        let (a0, a1) = self.split_mask32x16(a);
4489        let (b0, b1) = self.split_mask32x16(b);
4490        let (c0, c1) = self.split_mask32x16(c);
4491        self.combine_mask32x8(
4492            self.select_mask32x8(a0, b0, c0),
4493            self.select_mask32x8(a1, b1, c1),
4494        )
4495    }
4496    #[inline(always)]
4497    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
4498        let (a0, a1) = self.split_mask32x16(a);
4499        let (b0, b1) = self.split_mask32x16(b);
4500        self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
4501    }
4502    #[inline(always)]
4503    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
4504        let mut b0 = [0; 8usize];
4505        let mut b1 = [0; 8usize];
4506        b0.copy_from_slice(&a.val[0..8usize]);
4507        b1.copy_from_slice(&a.val[8usize..16usize]);
4508        (b0.simd_into(self), b1.simd_into(self))
4509    }
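    // f64x8 follows the same halving scheme; max/min, their `_precise` variants,
    // and the madd/msub multiply-add helpers are all forwarded per half, so their
    // NaN handling matches the underlying f64x4 implementations.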
4510    #[inline(always)]
4511    fn splat_f64x8(self, a: f64) -> f64x8<Self> {
4512        let half = self.splat_f64x4(a);
4513        self.combine_f64x4(half, half)
4514    }
4515    #[inline(always)]
4516    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
4517        let (a0, a1) = self.split_f64x8(a);
4518        self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
4519    }
4520    #[inline(always)]
4521    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
4522        let (a0, a1) = self.split_f64x8(a);
4523        self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
4524    }
4525    #[inline(always)]
4526    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
4527        let (a0, a1) = self.split_f64x8(a);
4528        self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
4529    }
4530    #[inline(always)]
4531    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
4532        let (a0, a1) = self.split_f64x8(a);
4533        let (b0, b1) = self.split_f64x8(b);
4534        self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
4535    }
4536    #[inline(always)]
4537    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
4538        let (a0, a1) = self.split_f64x8(a);
4539        let (b0, b1) = self.split_f64x8(b);
4540        self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
4541    }
4542    #[inline(always)]
4543    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
4544        let (a0, a1) = self.split_f64x8(a);
4545        let (b0, b1) = self.split_f64x8(b);
4546        self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
4547    }
4548    #[inline(always)]
4549    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
4550        let (a0, a1) = self.split_f64x8(a);
4551        let (b0, b1) = self.split_f64x8(b);
4552        self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
4553    }
4554    #[inline(always)]
4555    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
4556        let (a0, a1) = self.split_f64x8(a);
4557        let (b0, b1) = self.split_f64x8(b);
4558        self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
4559    }
4560    #[inline(always)]
4561    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
4562        let (a0, a1) = self.split_f64x8(a);
4563        let (b0, b1) = self.split_f64x8(b);
4564        self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
4565    }
4566    #[inline(always)]
4567    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
4568        let (a0, a1) = self.split_f64x8(a);
4569        let (b0, b1) = self.split_f64x8(b);
4570        self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
4571    }
4572    #[inline(always)]
4573    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
4574        let (a0, a1) = self.split_f64x8(a);
4575        let (b0, b1) = self.split_f64x8(b);
4576        self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
4577    }
4578    #[inline(always)]
4579    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
4580        let (a0, a1) = self.split_f64x8(a);
4581        let (b0, b1) = self.split_f64x8(b);
4582        self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
4583    }
4584    #[inline(always)]
4585    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
4586        let (a0, a1) = self.split_f64x8(a);
4587        let (b0, b1) = self.split_f64x8(b);
4588        self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
4589    }
4590    #[inline(always)]
4591    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
4592        let (a0, _) = self.split_f64x8(a);
4593        let (b0, _) = self.split_f64x8(b);
4594        self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
4595    }
4596    #[inline(always)]
4597    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
4598        let (_, a1) = self.split_f64x8(a);
4599        let (_, b1) = self.split_f64x8(b);
4600        self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
4601    }
4602    #[inline(always)]
4603    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
4604        let (a0, a1) = self.split_f64x8(a);
4605        let (b0, b1) = self.split_f64x8(b);
4606        self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
4607    }
4608    #[inline(always)]
4609    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
4610        let (a0, a1) = self.split_f64x8(a);
4611        let (b0, b1) = self.split_f64x8(b);
4612        self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
4613    }
4614    #[inline(always)]
4615    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
4616        let (a0, a1) = self.split_f64x8(a);
4617        let (b0, b1) = self.split_f64x8(b);
4618        self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
4619    }
4620    #[inline(always)]
4621    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
4622        let (a0, a1) = self.split_f64x8(a);
4623        let (b0, b1) = self.split_f64x8(b);
4624        self.combine_f64x4(
4625            self.max_precise_f64x4(a0, b0),
4626            self.max_precise_f64x4(a1, b1),
4627        )
4628    }
4629    #[inline(always)]
4630    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
4631        let (a0, a1) = self.split_f64x8(a);
4632        let (b0, b1) = self.split_f64x8(b);
4633        self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
4634    }
4635    #[inline(always)]
4636    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
4637        let (a0, a1) = self.split_f64x8(a);
4638        let (b0, b1) = self.split_f64x8(b);
4639        self.combine_f64x4(
4640            self.min_precise_f64x4(a0, b0),
4641            self.min_precise_f64x4(a1, b1),
4642        )
4643    }
4644    #[inline(always)]
4645    fn madd_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
4646        let (a0, a1) = self.split_f64x8(a);
4647        let (b0, b1) = self.split_f64x8(b);
4648        let (c0, c1) = self.split_f64x8(c);
4649        self.combine_f64x4(self.madd_f64x4(a0, b0, c0), self.madd_f64x4(a1, b1, c1))
4650    }
4651    #[inline(always)]
4652    fn msub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
4653        let (a0, a1) = self.split_f64x8(a);
4654        let (b0, b1) = self.split_f64x8(b);
4655        let (c0, c1) = self.split_f64x8(c);
4656        self.combine_f64x4(self.msub_f64x4(a0, b0, c0), self.msub_f64x4(a1, b1, c1))
4657    }
4658    #[inline(always)]
4659    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
4660        let (a0, a1) = self.split_f64x8(a);
4661        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
4662    }
4663    #[inline(always)]
4664    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
4665        let (a0, a1) = self.split_f64x8(a);
4666        self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
4667    }
4668    #[inline(always)]
4669    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
4670        let (a0, a1) = self.split_f64x8(a);
4671        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
4672    }
4673    #[inline(always)]
4674    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
4675        let (a0, a1) = self.split_mask64x8(a);
4676        let (b0, b1) = self.split_f64x8(b);
4677        let (c0, c1) = self.split_f64x8(c);
4678        self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
4679    }
4680    #[inline(always)]
4681    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
4682        let mut b0 = [0.0; 4usize];
4683        let mut b1 = [0.0; 4usize];
4684        b0.copy_from_slice(&a.val[0..4usize]);
4685        b1.copy_from_slice(&a.val[4usize..8usize]);
4686        (b0.simd_into(self), b1.simd_into(self))
4687    }
4688    #[inline(always)]
4689    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
4690        let (a0, a1) = self.split_f64x8(a);
4691        self.combine_f32x8(
4692            self.reinterpret_f32_f64x4(a0),
4693            self.reinterpret_f32_f64x4(a1),
4694        )
4695    }
4696    #[inline(always)]
4697    fn splat_mask64x8(self, a: i64) -> mask64x8<Self> {
4698        let half = self.splat_mask64x4(a);
4699        self.combine_mask64x4(half, half)
4700    }
4701    #[inline(always)]
4702    fn not_mask64x8(self, a: mask64x8<Self>) -> mask64x8<Self> {
4703        let (a0, a1) = self.split_mask64x8(a);
4704        self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1))
4705    }
4706    #[inline(always)]
4707    fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
4708        let (a0, a1) = self.split_mask64x8(a);
4709        let (b0, b1) = self.split_mask64x8(b);
4710        self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1))
4711    }
4712    #[inline(always)]
4713    fn or_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
4714        let (a0, a1) = self.split_mask64x8(a);
4715        let (b0, b1) = self.split_mask64x8(b);
4716        self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, b1))
4717    }
4718    #[inline(always)]
4719    fn xor_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
4720        let (a0, a1) = self.split_mask64x8(a);
4721        let (b0, b1) = self.split_mask64x8(b);
4722        self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, b1))
4723    }
4724    #[inline(always)]
4725    fn select_mask64x8(
4726        self,
4727        a: mask64x8<Self>,
4728        b: mask64x8<Self>,
4729        c: mask64x8<Self>,
4730    ) -> mask64x8<Self> {
4731        let (a0, a1) = self.split_mask64x8(a);
4732        let (b0, b1) = self.split_mask64x8(b);
4733        let (c0, c1) = self.split_mask64x8(c);
4734        self.combine_mask64x4(
4735            self.select_mask64x4(a0, b0, c0),
4736            self.select_mask64x4(a1, b1, c1),
4737        )
4738    }
4739    #[inline(always)]
4740    fn simd_eq_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
4741        let (a0, a1) = self.split_mask64x8(a);
4742        let (b0, b1) = self.split_mask64x8(b);
4743        self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1))
4744    }
4745    #[inline(always)]
4746    fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) {
4747        let mut b0 = [0; 4usize];
4748        let mut b1 = [0; 4usize];
4749        b0.copy_from_slice(&a.val[0..4usize]);
4750        b1.copy_from_slice(&a.val[4usize..8usize]);
4751        (b0.simd_into(self), b1.simd_into(self))
4752    }
4753}
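// The remaining impls provide zero-copy conversions between this crate's portable
// 128-bit vector types and the raw x86 intrinsic types: `SimdFrom` wraps an
// `__m128`/`__m128i`/`__m128d` together with a SIMD token, and `From` recovers the
// raw register value, both via `core::mem::transmute` of the lane array.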
4754impl<S: Simd> SimdFrom<__m128, S> for f32x4<S> {
4755    #[inline(always)]
4756    fn simd_from(arch: __m128, simd: S) -> Self {
4757        Self {
4758            val: unsafe { core::mem::transmute(arch) },
4759            simd,
4760        }
4761    }
4762}
4763impl<S: Simd> From<f32x4<S>> for __m128 {
4764    #[inline(always)]
4765    fn from(value: f32x4<S>) -> Self {
4766        unsafe { core::mem::transmute(value.val) }
4767    }
4768}
4769impl<S: Simd> SimdFrom<__m128i, S> for i8x16<S> {
4770    #[inline(always)]
4771    fn simd_from(arch: __m128i, simd: S) -> Self {
4772        Self {
4773            val: unsafe { core::mem::transmute(arch) },
4774            simd,
4775        }
4776    }
4777}
4778impl<S: Simd> From<i8x16<S>> for __m128i {
4779    #[inline(always)]
4780    fn from(value: i8x16<S>) -> Self {
4781        unsafe { core::mem::transmute(value.val) }
4782    }
4783}
4784impl<S: Simd> SimdFrom<__m128i, S> for u8x16<S> {
4785    #[inline(always)]
4786    fn simd_from(arch: __m128i, simd: S) -> Self {
4787        Self {
4788            val: unsafe { core::mem::transmute(arch) },
4789            simd,
4790        }
4791    }
4792}
4793impl<S: Simd> From<u8x16<S>> for __m128i {
4794    #[inline(always)]
4795    fn from(value: u8x16<S>) -> Self {
4796        unsafe { core::mem::transmute(value.val) }
4797    }
4798}
4799impl<S: Simd> SimdFrom<__m128i, S> for mask8x16<S> {
4800    #[inline(always)]
4801    fn simd_from(arch: __m128i, simd: S) -> Self {
4802        Self {
4803            val: unsafe { core::mem::transmute(arch) },
4804            simd,
4805        }
4806    }
4807}
4808impl<S: Simd> From<mask8x16<S>> for __m128i {
4809    #[inline(always)]
4810    fn from(value: mask8x16<S>) -> Self {
4811        unsafe { core::mem::transmute(value.val) }
4812    }
4813}
4814impl<S: Simd> SimdFrom<__m128i, S> for i16x8<S> {
4815    #[inline(always)]
4816    fn simd_from(arch: __m128i, simd: S) -> Self {
4817        Self {
4818            val: unsafe { core::mem::transmute(arch) },
4819            simd,
4820        }
4821    }
4822}
4823impl<S: Simd> From<i16x8<S>> for __m128i {
4824    #[inline(always)]
4825    fn from(value: i16x8<S>) -> Self {
4826        unsafe { core::mem::transmute(value.val) }
4827    }
4828}
4829impl<S: Simd> SimdFrom<__m128i, S> for u16x8<S> {
4830    #[inline(always)]
4831    fn simd_from(arch: __m128i, simd: S) -> Self {
4832        Self {
4833            val: unsafe { core::mem::transmute(arch) },
4834            simd,
4835        }
4836    }
4837}
4838impl<S: Simd> From<u16x8<S>> for __m128i {
4839    #[inline(always)]
4840    fn from(value: u16x8<S>) -> Self {
4841        unsafe { core::mem::transmute(value.val) }
4842    }
4843}
4844impl<S: Simd> SimdFrom<__m128i, S> for mask16x8<S> {
4845    #[inline(always)]
4846    fn simd_from(arch: __m128i, simd: S) -> Self {
4847        Self {
4848            val: unsafe { core::mem::transmute(arch) },
4849            simd,
4850        }
4851    }
4852}
4853impl<S: Simd> From<mask16x8<S>> for __m128i {
4854    #[inline(always)]
4855    fn from(value: mask16x8<S>) -> Self {
4856        unsafe { core::mem::transmute(value.val) }
4857    }
4858}
4859impl<S: Simd> SimdFrom<__m128i, S> for i32x4<S> {
4860    #[inline(always)]
4861    fn simd_from(arch: __m128i, simd: S) -> Self {
4862        Self {
4863            val: unsafe { core::mem::transmute(arch) },
4864            simd,
4865        }
4866    }
4867}
4868impl<S: Simd> From<i32x4<S>> for __m128i {
4869    #[inline(always)]
4870    fn from(value: i32x4<S>) -> Self {
4871        unsafe { core::mem::transmute(value.val) }
4872    }
4873}
4874impl<S: Simd> SimdFrom<__m128i, S> for u32x4<S> {
4875    #[inline(always)]
4876    fn simd_from(arch: __m128i, simd: S) -> Self {
4877        Self {
4878            val: unsafe { core::mem::transmute(arch) },
4879            simd,
4880        }
4881    }
4882}
4883impl<S: Simd> From<u32x4<S>> for __m128i {
4884    #[inline(always)]
4885    fn from(value: u32x4<S>) -> Self {
4886        unsafe { core::mem::transmute(value.val) }
4887    }
4888}
4889impl<S: Simd> SimdFrom<__m128i, S> for mask32x4<S> {
4890    #[inline(always)]
4891    fn simd_from(arch: __m128i, simd: S) -> Self {
4892        Self {
4893            val: unsafe { core::mem::transmute(arch) },
4894            simd,
4895        }
4896    }
4897}
4898impl<S: Simd> From<mask32x4<S>> for __m128i {
4899    #[inline(always)]
4900    fn from(value: mask32x4<S>) -> Self {
4901        unsafe { core::mem::transmute(value.val) }
4902    }
4903}
4904impl<S: Simd> SimdFrom<__m128d, S> for f64x2<S> {
4905    #[inline(always)]
4906    fn simd_from(arch: __m128d, simd: S) -> Self {
4907        Self {
4908            val: unsafe { core::mem::transmute(arch) },
4909            simd,
4910        }
4911    }
4912}
4913impl<S: Simd> From<f64x2<S>> for __m128d {
4914    #[inline(always)]
4915    fn from(value: f64x2<S>) -> Self {
4916        unsafe { core::mem::transmute(value.val) }
4917    }
4918}
4919impl<S: Simd> SimdFrom<__m128i, S> for mask64x2<S> {
4920    #[inline(always)]
4921    fn simd_from(arch: __m128i, simd: S) -> Self {
4922        Self {
4923            val: unsafe { core::mem::transmute(arch) },
4924            simd,
4925        }
4926    }
4927}
4928impl<S: Simd> From<mask64x2<S>> for __m128i {
4929    #[inline(always)]
4930    fn from(value: mask64x2<S>) -> Self {
4931        unsafe { core::mem::transmute(value.val) }
4932    }
4933}
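// Illustrative sketch of how the `Sse4_2` token defined above might be obtained
// and used with the emulated wide operations from this file. The helper function
// below is hypothetical and not part of the generated trait implementation; it
// only calls APIs that appear in this file.
#[allow(dead_code)]
unsafe fn sse4_2_usage_sketch() -> f64x8<Sse4_2> {
    // Safety: the caller must guarantee that SSE4.2 is available, the same
    // contract documented on `Sse4_2::new_unchecked`.
    let simd = unsafe { Sse4_2::new_unchecked() };
    let a = simd.splat_f64x8(1.5);
    let b = simd.splat_f64x8(2.5);
    // add_f64x8 splits each operand into f64x4 halves and recombines the sums.
    simd.add_f64x8(a, b)
}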