1#![expect(
7 unused_variables,
8 clippy::todo,
9 reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
10)]
11use crate::{Level, Simd, SimdFrom, SimdInto, seal::Seal};
12use crate::{
13 f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
14 i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
15 mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
16 u32x4, u32x8, u32x16,
17};
18#[cfg(target_arch = "x86")]
19use core::arch::x86::*;
20#[cfg(target_arch = "x86_64")]
21use core::arch::x86_64::*;
22use core::ops::*;
#[doc = r#" The SIMD token for the level with the "AVX2" and "FMA" target features."#]
24#[derive(Clone, Copy, Debug)]
25pub struct Avx2 {
26 pub avx2: crate::core_arch::x86::Avx2,
27}
28impl Avx2 {
29 #[doc = r" Create a SIMD token."]
30 #[doc = r""]
31 #[doc = r" # Safety"]
32 #[doc = r""]
#[doc = r" The AVX2 and FMA CPU features must be available."]
34 #[inline]
35 pub const unsafe fn new_unchecked() -> Self {
36 Avx2 {
37 avx2: unsafe { crate::core_arch::x86::Avx2::new_unchecked() },
38 }
39 }
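    // Illustrative sketch (not part of this crate's API): callers are expected
    // to verify the required CPU features before constructing the token, for
    // example with runtime detection on a std build:
    //
    //     let avx2 = if std::is_x86_feature_detected!("avx2")
    //         && std::is_x86_feature_detected!("fma")
    //     {
    //         // Safety: both required features were just confirmed.
    //         Some(unsafe { Avx2::new_unchecked() })
    //     } else {
    //         None
    //     };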
40}
41impl Seal for Avx2 {}
42impl Simd for Avx2 {
43 type f32s = f32x4<Self>;
44 type u8s = u8x16<Self>;
45 type i8s = i8x16<Self>;
46 type u16s = u16x8<Self>;
47 type i16s = i16x8<Self>;
48 type u32s = u32x4<Self>;
49 type i32s = i32x4<Self>;
50 type mask8s = mask8x16<Self>;
51 type mask16s = mask16x8<Self>;
52 type mask32s = mask32x4<Self>;
53 #[inline(always)]
54 fn level(self) -> Level {
55 Level::Avx2(self)
56 }
57 #[inline]
58 fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R {
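        // Route the call through an inner `#[target_feature]` function so the
        // compiler may assume AVX2 and FMA for everything inlined into it,
        // including `f`. Holding `self` proves those features are available,
        // which is what makes the `unsafe` call below sound.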
59 #[target_feature(enable = "avx2,fma")]
60 #[inline]
61 unsafe fn vectorize_avx2<F: FnOnce() -> R, R>(f: F) -> R {
62 f()
63 }
64 unsafe { vectorize_avx2(f) }
65 }
66 #[inline(always)]
67 fn splat_f32x4(self, val: f32) -> f32x4<Self> {
68 unsafe { _mm_set1_ps(val).simd_into(self) }
69 }
70 #[inline(always)]
71 fn abs_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
72 unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) }
73 }
74 #[inline(always)]
75 fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
76 unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) }
77 }
78 #[inline(always)]
79 fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
80 unsafe { _mm_sqrt_ps(a.into()).simd_into(self) }
81 }
82 #[inline(always)]
83 fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
84 unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) }
85 }
86 #[inline(always)]
87 fn sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
88 unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) }
89 }
90 #[inline(always)]
91 fn mul_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
92 unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) }
93 }
94 #[inline(always)]
95 fn div_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
96 unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) }
97 }
98 #[inline(always)]
99 fn copysign_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
100 unsafe {
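            // `-0.0` has only the sign bit set, so the blend below takes the
            // sign from `b` and the magnitude from `a`.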
101 let mask = _mm_set1_ps(-0.0);
102 _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self)
103 }
104 }
105 #[inline(always)]
106 fn simd_eq_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
107 unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) }
108 }
109 #[inline(always)]
110 fn simd_lt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
111 unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) }
112 }
113 #[inline(always)]
114 fn simd_le_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
115 unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) }
116 }
117 #[inline(always)]
118 fn simd_ge_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
119 unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) }
120 }
121 #[inline(always)]
122 fn simd_gt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
123 unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) }
124 }
125 #[inline(always)]
126 fn zip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
127 unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) }
128 }
129 #[inline(always)]
130 fn zip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
131 unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) }
132 }
133 #[inline(always)]
134 fn unzip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
135 unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) }
136 }
137 #[inline(always)]
138 fn unzip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
139 unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) }
140 }
141 #[inline(always)]
142 fn max_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
143 unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
144 }
145 #[inline(always)]
146 fn max_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
147 unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
148 }
149 #[inline(always)]
150 fn min_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
151 unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
152 }
153 #[inline(always)]
154 fn min_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
155 unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
156 }
157 #[inline(always)]
158 fn madd_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
159 unsafe { _mm_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) }
160 }
161 #[inline(always)]
162 fn msub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
163 unsafe { _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) }
164 }
165 #[inline(always)]
166 fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
167 unsafe { _mm_floor_ps(a.into()).simd_into(self) }
168 }
169 #[inline(always)]
170 fn fract_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
171 a - a.trunc()
172 }
173 #[inline(always)]
174 fn trunc_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
175 unsafe { _mm_round_ps(a.into(), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC).simd_into(self) }
176 }
177 #[inline(always)]
178 fn select_f32x4(self, a: mask32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
179 unsafe {
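            // Mask lanes are all-ones or all-zeros, so a bitwise blend
            // `(mask & b) | (!mask & c)` selects between `b` and `c` per lane.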
180 let mask = _mm_castsi128_ps(a.into());
181 _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, c.into())).simd_into(self)
182 }
183 }
184 #[inline(always)]
185 fn combine_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x8<Self> {
186 let mut result = [0.0; 8usize];
187 result[0..4usize].copy_from_slice(&a.val);
188 result[4usize..8usize].copy_from_slice(&b.val);
189 result.simd_into(self)
190 }
191 #[inline(always)]
192 fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> {
193 f64x2 {
194 val: bytemuck::cast(a.val),
195 simd: a.simd,
196 }
197 }
198 #[inline(always)]
199 fn reinterpret_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
200 i32x4 {
201 val: bytemuck::cast(a.val),
202 simd: a.simd,
203 }
204 }
205 #[inline(always)]
206 fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
207 u8x16 {
208 val: bytemuck::cast(a.val),
209 simd: a.simd,
210 }
211 }
212 #[inline(always)]
213 fn reinterpret_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
214 u32x4 {
215 val: bytemuck::cast(a.val),
216 simd: a.simd,
217 }
218 }
219 #[inline(always)]
220 fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
221 unsafe {
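            // There is no direct f32 -> u32 conversion here; negative inputs
            // are clamped to zero and the value goes through the signed
            // conversion, so inputs of 2^31 or more are not handled exactly.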
222 _mm_cvtps_epi32(_mm_max_ps(_mm_floor_ps(a.into()), _mm_set1_ps(0.0))).simd_into(self)
223 }
224 }
225 #[inline(always)]
226 fn cvt_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
227 unsafe { _mm_cvtps_epi32(a.trunc().into()).simd_into(self) }
228 }
229 #[inline(always)]
230 fn splat_i8x16(self, val: i8) -> i8x16<Self> {
231 unsafe { _mm_set1_epi8(val).simd_into(self) }
232 }
233 #[inline(always)]
234 fn not_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
235 a ^ !0
236 }
237 #[inline(always)]
238 fn add_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
239 unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
240 }
241 #[inline(always)]
242 fn sub_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
243 unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
244 }
245 #[inline(always)]
246 fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
247 todo!()
248 }
249 #[inline(always)]
250 fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
251 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
252 }
253 #[inline(always)]
254 fn or_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
255 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
256 }
257 #[inline(always)]
258 fn xor_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
259 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
260 }
261 #[inline(always)]
262 fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
263 unsafe {
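            // x86 has no 8-bit shifts: sign-extend each byte to 16 bits,
            // shift arithmetically, then narrow back with a signed pack.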
264 let val = a.into();
265 let shift_count = _mm_cvtsi32_si128(shift as i32);
266 let lo_16 = _mm_unpacklo_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128()));
267 let hi_16 = _mm_unpackhi_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128()));
268 let lo_shifted = _mm_sra_epi16(lo_16, shift_count);
269 let hi_shifted = _mm_sra_epi16(hi_16, shift_count);
270 _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
271 }
272 }
273 #[inline(always)]
274 fn shrv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
275 core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self)
276 }
277 #[inline(always)]
278 fn shl_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
279 unsafe {
280 let val = a.into();
281 let shift_count = _mm_cvtsi32_si128(shift as i32);
282 let lo_16 = _mm_unpacklo_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128()));
283 let hi_16 = _mm_unpackhi_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128()));
284 let lo_shifted = _mm_sll_epi16(lo_16, shift_count);
285 let hi_shifted = _mm_sll_epi16(hi_16, shift_count);
286 _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
287 }
288 }
289 #[inline(always)]
290 fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
291 unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
292 }
293 #[inline(always)]
294 fn simd_lt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
295 unsafe { _mm_cmplt_epi8(a.into(), b.into()).simd_into(self) }
296 }
297 #[inline(always)]
298 fn simd_le_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
299 unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) }
300 }
301 #[inline(always)]
302 fn simd_ge_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
303 unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) }
304 }
305 #[inline(always)]
306 fn simd_gt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
307 unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) }
308 }
309 #[inline(always)]
310 fn zip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
311 unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
312 }
313 #[inline(always)]
314 fn zip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
315 unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
316 }
317 #[inline(always)]
318 fn unzip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
319 unsafe {
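            // Gather the even-indexed bytes of each input into the low half
            // with a byte shuffle, then merge the two 64-bit halves.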
320 let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14);
321 let t1 = _mm_shuffle_epi8(a.into(), mask);
322 let t2 = _mm_shuffle_epi8(b.into(), mask);
323 _mm_unpacklo_epi64(t1, t2).simd_into(self)
324 }
325 }
326 #[inline(always)]
327 fn unzip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
328 unsafe {
329 let mask = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15);
330 let t1 = _mm_shuffle_epi8(a.into(), mask);
331 let t2 = _mm_shuffle_epi8(b.into(), mask);
332 _mm_unpacklo_epi64(t1, t2).simd_into(self)
333 }
334 }
335 #[inline(always)]
336 fn select_i8x16(self, a: mask8x16<Self>, b: i8x16<Self>, c: i8x16<Self>) -> i8x16<Self> {
337 unsafe {
338 _mm_or_si128(
339 _mm_and_si128(a.into(), b.into()),
340 _mm_andnot_si128(a.into(), c.into()),
341 )
342 .simd_into(self)
343 }
344 }
345 #[inline(always)]
346 fn min_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
347 unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) }
348 }
349 #[inline(always)]
350 fn max_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
351 unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) }
352 }
353 #[inline(always)]
354 fn combine_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x32<Self> {
355 let mut result = [0; 32usize];
356 result[0..16usize].copy_from_slice(&a.val);
357 result[16usize..32usize].copy_from_slice(&b.val);
358 result.simd_into(self)
359 }
360 #[inline(always)]
361 fn neg_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
362 unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) }
363 }
364 #[inline(always)]
365 fn reinterpret_u8_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
366 u8x16 {
367 val: bytemuck::cast(a.val),
368 simd: a.simd,
369 }
370 }
371 #[inline(always)]
372 fn reinterpret_u32_i8x16(self, a: i8x16<Self>) -> u32x4<Self> {
373 u32x4 {
374 val: bytemuck::cast(a.val),
375 simd: a.simd,
376 }
377 }
378 #[inline(always)]
379 fn splat_u8x16(self, val: u8) -> u8x16<Self> {
380 unsafe { _mm_set1_epi8(val as _).simd_into(self) }
381 }
382 #[inline(always)]
383 fn not_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
384 a ^ !0
385 }
386 #[inline(always)]
387 fn add_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
388 unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
389 }
390 #[inline(always)]
391 fn sub_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
392 unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
393 }
394 #[inline(always)]
395 fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
396 todo!()
397 }
398 #[inline(always)]
399 fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
400 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
401 }
402 #[inline(always)]
403 fn or_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
404 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
405 }
406 #[inline(always)]
407 fn xor_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
408 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
409 }
410 #[inline(always)]
411 fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
412 unsafe {
413 let val = a.into();
414 let shift_count = _mm_cvtsi32_si128(shift as i32);
415 let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
416 let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
417 let lo_shifted = _mm_srl_epi16(lo_16, shift_count);
418 let hi_shifted = _mm_srl_epi16(hi_16, shift_count);
419 _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
420 }
421 }
422 #[inline(always)]
423 fn shrv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
424 core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self)
425 }
426 #[inline(always)]
427 fn shl_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
428 unsafe {
429 let val = a.into();
430 let shift_count = _mm_cvtsi32_si128(shift as i32);
431 let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
432 let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
433 let lo_shifted = _mm_sll_epi16(lo_16, shift_count);
434 let hi_shifted = _mm_sll_epi16(hi_16, shift_count);
435 _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
436 }
437 }
438 #[inline(always)]
    fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
        // Equality is the same for signed and unsigned lanes, so the raw
        // bytes can be compared directly.
        unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
    }
447 #[inline(always)]
448 fn simd_lt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
449 unsafe {
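            // SSE/AVX2 only provide signed byte compares; flipping the sign
            // bit maps unsigned order onto signed order.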
450 let sign_bit = _mm_set1_epi8(0x80u8 as _);
451 let a_signed = _mm_xor_si128(a.into(), sign_bit);
452 let b_signed = _mm_xor_si128(b.into(), sign_bit);
453 _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self)
454 }
455 }
456 #[inline(always)]
457 fn simd_le_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
458 unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) }
459 }
460 #[inline(always)]
461 fn simd_ge_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
462 unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) }
463 }
464 #[inline(always)]
465 fn simd_gt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
466 unsafe {
467 let sign_bit = _mm_set1_epi8(0x80u8 as _);
468 let a_signed = _mm_xor_si128(a.into(), sign_bit);
469 let b_signed = _mm_xor_si128(b.into(), sign_bit);
470 _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self)
471 }
472 }
473 #[inline(always)]
474 fn zip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
475 unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
476 }
477 #[inline(always)]
478 fn zip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
479 unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
480 }
481 #[inline(always)]
482 fn unzip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
483 unsafe {
484 let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14);
485 let t1 = _mm_shuffle_epi8(a.into(), mask);
486 let t2 = _mm_shuffle_epi8(b.into(), mask);
487 _mm_unpacklo_epi64(t1, t2).simd_into(self)
488 }
489 }
490 #[inline(always)]
491 fn unzip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
492 unsafe {
493 let mask = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15);
494 let t1 = _mm_shuffle_epi8(a.into(), mask);
495 let t2 = _mm_shuffle_epi8(b.into(), mask);
496 _mm_unpacklo_epi64(t1, t2).simd_into(self)
497 }
498 }
499 #[inline(always)]
500 fn select_u8x16(self, a: mask8x16<Self>, b: u8x16<Self>, c: u8x16<Self>) -> u8x16<Self> {
501 unsafe {
502 _mm_or_si128(
503 _mm_and_si128(a.into(), b.into()),
504 _mm_andnot_si128(a.into(), c.into()),
505 )
506 .simd_into(self)
507 }
508 }
509 #[inline(always)]
510 fn min_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
511 unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) }
512 }
513 #[inline(always)]
514 fn max_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
515 unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) }
516 }
517 #[inline(always)]
518 fn combine_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x32<Self> {
519 let mut result = [0; 32usize];
520 result[0..16usize].copy_from_slice(&a.val);
521 result[16usize..32usize].copy_from_slice(&b.val);
522 result.simd_into(self)
523 }
524 #[inline(always)]
    fn widen_u8x16(self, a: u8x16<Self>) -> u16x16<Self> {
        unsafe {
            let raw = a.into();
            // Zero-extend the low and high eight bytes separately, then
            // recombine them in lane order.
            let low = _mm_cvtepu8_epi16(raw).simd_into(self);
            let high = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(raw)).simd_into(self);
            self.combine_u16x8(low, high)
        }
    }
533 #[inline(always)]
534 fn reinterpret_u32_u8x16(self, a: u8x16<Self>) -> u32x4<Self> {
535 u32x4 {
536 val: bytemuck::cast(a.val),
537 simd: a.simd,
538 }
539 }
540 #[inline(always)]
541 fn splat_mask8x16(self, val: i8) -> mask8x16<Self> {
542 unsafe { _mm_set1_epi8(val).simd_into(self) }
543 }
544 #[inline(always)]
545 fn not_mask8x16(self, a: mask8x16<Self>) -> mask8x16<Self> {
546 a ^ !0
547 }
548 #[inline(always)]
549 fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
550 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
551 }
552 #[inline(always)]
553 fn or_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
554 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
555 }
556 #[inline(always)]
557 fn xor_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
558 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
559 }
560 #[inline(always)]
561 fn select_mask8x16(
562 self,
563 a: mask8x16<Self>,
564 b: mask8x16<Self>,
565 c: mask8x16<Self>,
566 ) -> mask8x16<Self> {
567 unsafe {
568 _mm_or_si128(
569 _mm_and_si128(a.into(), b.into()),
570 _mm_andnot_si128(a.into(), c.into()),
571 )
572 .simd_into(self)
573 }
574 }
575 #[inline(always)]
576 fn simd_eq_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
577 unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
578 }
579 #[inline(always)]
580 fn combine_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x32<Self> {
581 let mut result = [0; 32usize];
582 result[0..16usize].copy_from_slice(&a.val);
583 result[16usize..32usize].copy_from_slice(&b.val);
584 result.simd_into(self)
585 }
586 #[inline(always)]
587 fn splat_i16x8(self, val: i16) -> i16x8<Self> {
588 unsafe { _mm_set1_epi16(val).simd_into(self) }
589 }
590 #[inline(always)]
591 fn not_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
592 a ^ !0
593 }
594 #[inline(always)]
595 fn add_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
596 unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
597 }
598 #[inline(always)]
599 fn sub_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
600 unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
601 }
602 #[inline(always)]
603 fn mul_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
604 unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
605 }
606 #[inline(always)]
607 fn and_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
608 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
609 }
610 #[inline(always)]
611 fn or_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
612 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
613 }
614 #[inline(always)]
615 fn xor_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
616 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
617 }
618 #[inline(always)]
619 fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
620 unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
621 }
622 #[inline(always)]
623 fn shrv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
624 core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self)
625 }
626 #[inline(always)]
627 fn shl_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
628 unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
629 }
630 #[inline(always)]
631 fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
632 unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
633 }
634 #[inline(always)]
635 fn simd_lt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
636 unsafe { _mm_cmplt_epi16(a.into(), b.into()).simd_into(self) }
637 }
638 #[inline(always)]
639 fn simd_le_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
640 unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) }
641 }
642 #[inline(always)]
643 fn simd_ge_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
644 unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) }
645 }
646 #[inline(always)]
647 fn simd_gt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
648 unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) }
649 }
650 #[inline(always)]
651 fn zip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
652 unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
653 }
654 #[inline(always)]
655 fn zip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
656 unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
657 }
658 #[inline(always)]
659 fn unzip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
660 unsafe {
661 let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13);
662 let t1 = _mm_shuffle_epi8(a.into(), mask);
663 let t2 = _mm_shuffle_epi8(b.into(), mask);
664 _mm_unpacklo_epi64(t1, t2).simd_into(self)
665 }
666 }
667 #[inline(always)]
668 fn unzip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
669 unsafe {
670 let mask = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
671 let t1 = _mm_shuffle_epi8(a.into(), mask);
672 let t2 = _mm_shuffle_epi8(b.into(), mask);
673 _mm_unpacklo_epi64(t1, t2).simd_into(self)
674 }
675 }
676 #[inline(always)]
677 fn select_i16x8(self, a: mask16x8<Self>, b: i16x8<Self>, c: i16x8<Self>) -> i16x8<Self> {
678 unsafe {
679 _mm_or_si128(
680 _mm_and_si128(a.into(), b.into()),
681 _mm_andnot_si128(a.into(), c.into()),
682 )
683 .simd_into(self)
684 }
685 }
686 #[inline(always)]
687 fn min_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
688 unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) }
689 }
690 #[inline(always)]
691 fn max_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
692 unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) }
693 }
694 #[inline(always)]
695 fn combine_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x16<Self> {
696 let mut result = [0; 16usize];
697 result[0..8usize].copy_from_slice(&a.val);
698 result[8usize..16usize].copy_from_slice(&b.val);
699 result.simd_into(self)
700 }
701 #[inline(always)]
702 fn neg_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
703 unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) }
704 }
705 #[inline(always)]
706 fn reinterpret_u8_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
707 u8x16 {
708 val: bytemuck::cast(a.val),
709 simd: a.simd,
710 }
711 }
712 #[inline(always)]
713 fn reinterpret_u32_i16x8(self, a: i16x8<Self>) -> u32x4<Self> {
714 u32x4 {
715 val: bytemuck::cast(a.val),
716 simd: a.simd,
717 }
718 }
719 #[inline(always)]
720 fn splat_u16x8(self, val: u16) -> u16x8<Self> {
721 unsafe { _mm_set1_epi16(val as _).simd_into(self) }
722 }
723 #[inline(always)]
724 fn not_u16x8(self, a: u16x8<Self>) -> u16x8<Self> {
725 a ^ !0
726 }
727 #[inline(always)]
728 fn add_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
729 unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
730 }
731 #[inline(always)]
732 fn sub_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
733 unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
734 }
735 #[inline(always)]
736 fn mul_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
737 unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
738 }
739 #[inline(always)]
740 fn and_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
741 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
742 }
743 #[inline(always)]
744 fn or_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
745 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
746 }
747 #[inline(always)]
748 fn xor_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
749 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
750 }
751 #[inline(always)]
752 fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
753 unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
754 }
755 #[inline(always)]
756 fn shrv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
757 core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self)
758 }
759 #[inline(always)]
760 fn shl_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
761 unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
762 }
763 #[inline(always)]
    fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
        // Equality is the same for signed and unsigned lanes, so the raw
        // words can be compared directly.
        unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
    }
772 #[inline(always)]
773 fn simd_lt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
774 unsafe {
775 let sign_bit = _mm_set1_epi16(0x8000u16 as _);
776 let a_signed = _mm_xor_si128(a.into(), sign_bit);
777 let b_signed = _mm_xor_si128(b.into(), sign_bit);
778 _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self)
779 }
780 }
781 #[inline(always)]
782 fn simd_le_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
783 unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) }
784 }
785 #[inline(always)]
786 fn simd_ge_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
787 unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) }
788 }
789 #[inline(always)]
790 fn simd_gt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
791 unsafe {
792 let sign_bit = _mm_set1_epi16(0x8000u16 as _);
793 let a_signed = _mm_xor_si128(a.into(), sign_bit);
794 let b_signed = _mm_xor_si128(b.into(), sign_bit);
795 _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self)
796 }
797 }
798 #[inline(always)]
799 fn zip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
800 unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
801 }
802 #[inline(always)]
803 fn zip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
804 unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
805 }
806 #[inline(always)]
807 fn unzip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
808 unsafe {
809 let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13);
810 let t1 = _mm_shuffle_epi8(a.into(), mask);
811 let t2 = _mm_shuffle_epi8(b.into(), mask);
812 _mm_unpacklo_epi64(t1, t2).simd_into(self)
813 }
814 }
815 #[inline(always)]
816 fn unzip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
817 unsafe {
818 let mask = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
819 let t1 = _mm_shuffle_epi8(a.into(), mask);
820 let t2 = _mm_shuffle_epi8(b.into(), mask);
821 _mm_unpacklo_epi64(t1, t2).simd_into(self)
822 }
823 }
824 #[inline(always)]
825 fn select_u16x8(self, a: mask16x8<Self>, b: u16x8<Self>, c: u16x8<Self>) -> u16x8<Self> {
826 unsafe {
827 _mm_or_si128(
828 _mm_and_si128(a.into(), b.into()),
829 _mm_andnot_si128(a.into(), c.into()),
830 )
831 .simd_into(self)
832 }
833 }
834 #[inline(always)]
835 fn min_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
836 unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) }
837 }
838 #[inline(always)]
839 fn max_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
840 unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) }
841 }
842 #[inline(always)]
843 fn combine_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x16<Self> {
844 let mut result = [0; 16usize];
845 result[0..8usize].copy_from_slice(&a.val);
846 result[8usize..16usize].copy_from_slice(&b.val);
847 result.simd_into(self)
848 }
849 #[inline(always)]
850 fn reinterpret_u8_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
851 u8x16 {
852 val: bytemuck::cast(a.val),
853 simd: a.simd,
854 }
855 }
856 #[inline(always)]
857 fn reinterpret_u32_u16x8(self, a: u16x8<Self>) -> u32x4<Self> {
858 u32x4 {
859 val: bytemuck::cast(a.val),
860 simd: a.simd,
861 }
862 }
863 #[inline(always)]
864 fn splat_mask16x8(self, val: i16) -> mask16x8<Self> {
865 unsafe { _mm_set1_epi16(val).simd_into(self) }
866 }
867 #[inline(always)]
868 fn not_mask16x8(self, a: mask16x8<Self>) -> mask16x8<Self> {
869 a ^ !0
870 }
871 #[inline(always)]
872 fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
873 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
874 }
875 #[inline(always)]
876 fn or_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
877 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
878 }
879 #[inline(always)]
880 fn xor_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
881 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
882 }
883 #[inline(always)]
884 fn select_mask16x8(
885 self,
886 a: mask16x8<Self>,
887 b: mask16x8<Self>,
888 c: mask16x8<Self>,
889 ) -> mask16x8<Self> {
890 unsafe {
891 _mm_or_si128(
892 _mm_and_si128(a.into(), b.into()),
893 _mm_andnot_si128(a.into(), c.into()),
894 )
895 .simd_into(self)
896 }
897 }
898 #[inline(always)]
899 fn simd_eq_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
900 unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
901 }
902 #[inline(always)]
903 fn combine_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x16<Self> {
904 let mut result = [0; 16usize];
905 result[0..8usize].copy_from_slice(&a.val);
906 result[8usize..16usize].copy_from_slice(&b.val);
907 result.simd_into(self)
908 }
909 #[inline(always)]
910 fn splat_i32x4(self, val: i32) -> i32x4<Self> {
911 unsafe { _mm_set1_epi32(val).simd_into(self) }
912 }
913 #[inline(always)]
914 fn not_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
915 a ^ !0
916 }
917 #[inline(always)]
918 fn add_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
919 unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
920 }
921 #[inline(always)]
922 fn sub_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
923 unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
924 }
925 #[inline(always)]
926 fn mul_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
927 unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
928 }
929 #[inline(always)]
930 fn and_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
931 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
932 }
933 #[inline(always)]
934 fn or_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
935 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
936 }
937 #[inline(always)]
938 fn xor_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
939 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
940 }
941 #[inline(always)]
942 fn shr_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
943 unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
944 }
945 #[inline(always)]
946 fn shrv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
947 unsafe { _mm_srav_epi32(a.into(), b.into()).simd_into(self) }
948 }
949 #[inline(always)]
950 fn shl_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
951 unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
952 }
953 #[inline(always)]
954 fn simd_eq_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
955 unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
956 }
957 #[inline(always)]
958 fn simd_lt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
959 unsafe { _mm_cmplt_epi32(a.into(), b.into()).simd_into(self) }
960 }
961 #[inline(always)]
962 fn simd_le_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
963 unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) }
964 }
965 #[inline(always)]
966 fn simd_ge_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
967 unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) }
968 }
969 #[inline(always)]
970 fn simd_gt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
971 unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) }
972 }
973 #[inline(always)]
974 fn zip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
975 unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
976 }
977 #[inline(always)]
978 fn zip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
979 unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
980 }
981 #[inline(always)]
982 fn unzip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
983 unsafe {
984 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
985 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
986 _mm_unpacklo_epi64(t1, t2).simd_into(self)
987 }
988 }
989 #[inline(always)]
990 fn unzip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
991 unsafe {
992 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
993 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
994 _mm_unpackhi_epi64(t1, t2).simd_into(self)
995 }
996 }
997 #[inline(always)]
998 fn select_i32x4(self, a: mask32x4<Self>, b: i32x4<Self>, c: i32x4<Self>) -> i32x4<Self> {
999 unsafe {
1000 _mm_or_si128(
1001 _mm_and_si128(a.into(), b.into()),
1002 _mm_andnot_si128(a.into(), c.into()),
1003 )
1004 .simd_into(self)
1005 }
1006 }
1007 #[inline(always)]
1008 fn min_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1009 unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) }
1010 }
1011 #[inline(always)]
1012 fn max_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1013 unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) }
1014 }
1015 #[inline(always)]
1016 fn combine_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x8<Self> {
1017 let mut result = [0; 8usize];
1018 result[0..4usize].copy_from_slice(&a.val);
1019 result[4usize..8usize].copy_from_slice(&b.val);
1020 result.simd_into(self)
1021 }
1022 #[inline(always)]
1023 fn neg_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
1024 unsafe { _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(self) }
1025 }
1026 #[inline(always)]
1027 fn reinterpret_u8_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
1028 u8x16 {
1029 val: bytemuck::cast(a.val),
1030 simd: a.simd,
1031 }
1032 }
1033 #[inline(always)]
1034 fn reinterpret_u32_i32x4(self, a: i32x4<Self>) -> u32x4<Self> {
1035 u32x4 {
1036 val: bytemuck::cast(a.val),
1037 simd: a.simd,
1038 }
1039 }
1040 #[inline(always)]
1041 fn cvt_f32_i32x4(self, a: i32x4<Self>) -> f32x4<Self> {
1042 unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) }
1043 }
1044 #[inline(always)]
1045 fn splat_u32x4(self, val: u32) -> u32x4<Self> {
1046 unsafe { _mm_set1_epi32(val as _).simd_into(self) }
1047 }
1048 #[inline(always)]
1049 fn not_u32x4(self, a: u32x4<Self>) -> u32x4<Self> {
1050 a ^ !0
1051 }
1052 #[inline(always)]
1053 fn add_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1054 unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
1055 }
1056 #[inline(always)]
1057 fn sub_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1058 unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
1059 }
1060 #[inline(always)]
1061 fn mul_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1062 unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
1063 }
1064 #[inline(always)]
1065 fn and_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1066 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1067 }
1068 #[inline(always)]
1069 fn or_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1070 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1071 }
1072 #[inline(always)]
1073 fn xor_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1074 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1075 }
1076 #[inline(always)]
1077 fn shr_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
1078 unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
1079 }
1080 #[inline(always)]
1081 fn shrv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1082 unsafe { _mm_srlv_epi32(a.into(), b.into()).simd_into(self) }
1083 }
1084 #[inline(always)]
1085 fn shl_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
1086 unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
1087 }
1088 #[inline(always)]
    fn simd_eq_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
        // Equality is the same for signed and unsigned lanes, so the raw
        // dwords can be compared directly.
        unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
    }
1097 #[inline(always)]
1098 fn simd_lt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1099 unsafe {
1100 let sign_bit = _mm_set1_epi32(0x80000000u32 as _);
1101 let a_signed = _mm_xor_si128(a.into(), sign_bit);
1102 let b_signed = _mm_xor_si128(b.into(), sign_bit);
1103 _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self)
1104 }
1105 }
1106 #[inline(always)]
1107 fn simd_le_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1108 unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1109 }
1110 #[inline(always)]
1111 fn simd_ge_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1112 unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1113 }
1114 #[inline(always)]
1115 fn simd_gt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1116 unsafe {
1117 let sign_bit = _mm_set1_epi32(0x80000000u32 as _);
1118 let a_signed = _mm_xor_si128(a.into(), sign_bit);
1119 let b_signed = _mm_xor_si128(b.into(), sign_bit);
1120 _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self)
1121 }
1122 }
1123 #[inline(always)]
1124 fn zip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1125 unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
1126 }
1127 #[inline(always)]
1128 fn zip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1129 unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
1130 }
1131 #[inline(always)]
1132 fn unzip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1133 unsafe {
1134 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1135 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1136 _mm_unpacklo_epi64(t1, t2).simd_into(self)
1137 }
1138 }
1139 #[inline(always)]
1140 fn unzip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1141 unsafe {
1142 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1143 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1144 _mm_unpackhi_epi64(t1, t2).simd_into(self)
1145 }
1146 }
1147 #[inline(always)]
1148 fn select_u32x4(self, a: mask32x4<Self>, b: u32x4<Self>, c: u32x4<Self>) -> u32x4<Self> {
1149 unsafe {
1150 _mm_or_si128(
1151 _mm_and_si128(a.into(), b.into()),
1152 _mm_andnot_si128(a.into(), c.into()),
1153 )
1154 .simd_into(self)
1155 }
1156 }
1157 #[inline(always)]
1158 fn min_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1159 unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) }
1160 }
1161 #[inline(always)]
1162 fn max_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1163 unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) }
1164 }
1165 #[inline(always)]
1166 fn combine_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x8<Self> {
1167 let mut result = [0; 8usize];
1168 result[0..4usize].copy_from_slice(&a.val);
1169 result[4usize..8usize].copy_from_slice(&b.val);
1170 result.simd_into(self)
1171 }
1172 #[inline(always)]
1173 fn reinterpret_u8_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
1174 u8x16 {
1175 val: bytemuck::cast(a.val),
1176 simd: a.simd,
1177 }
1178 }
1179 #[inline(always)]
1180 fn cvt_f32_u32x4(self, a: u32x4<Self>) -> f32x4<Self> {
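        // Goes through the signed i32 -> f32 conversion, so inputs of 2^31 or
        // more are not converted correctly on this path.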
1181 unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) }
1182 }
1183 #[inline(always)]
1184 fn splat_mask32x4(self, val: i32) -> mask32x4<Self> {
1185 unsafe { _mm_set1_epi32(val).simd_into(self) }
1186 }
1187 #[inline(always)]
1188 fn not_mask32x4(self, a: mask32x4<Self>) -> mask32x4<Self> {
1189 a ^ !0
1190 }
1191 #[inline(always)]
1192 fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1193 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1194 }
1195 #[inline(always)]
1196 fn or_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1197 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1198 }
1199 #[inline(always)]
1200 fn xor_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1201 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1202 }
1203 #[inline(always)]
1204 fn select_mask32x4(
1205 self,
1206 a: mask32x4<Self>,
1207 b: mask32x4<Self>,
1208 c: mask32x4<Self>,
1209 ) -> mask32x4<Self> {
1210 unsafe {
1211 _mm_or_si128(
1212 _mm_and_si128(a.into(), b.into()),
1213 _mm_andnot_si128(a.into(), c.into()),
1214 )
1215 .simd_into(self)
1216 }
1217 }
1218 #[inline(always)]
1219 fn simd_eq_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1220 unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
1221 }
1222 #[inline(always)]
1223 fn combine_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x8<Self> {
1224 let mut result = [0; 8usize];
1225 result[0..4usize].copy_from_slice(&a.val);
1226 result[4usize..8usize].copy_from_slice(&b.val);
1227 result.simd_into(self)
1228 }
1229 #[inline(always)]
1230 fn splat_f64x2(self, val: f64) -> f64x2<Self> {
1231 unsafe { _mm_set1_pd(val).simd_into(self) }
1232 }
1233 #[inline(always)]
1234 fn abs_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1235 unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) }
1236 }
1237 #[inline(always)]
1238 fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1239 unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) }
1240 }
1241 #[inline(always)]
1242 fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1243 unsafe { _mm_sqrt_pd(a.into()).simd_into(self) }
1244 }
1245 #[inline(always)]
1246 fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1247 unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) }
1248 }
1249 #[inline(always)]
1250 fn sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1251 unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) }
1252 }
1253 #[inline(always)]
1254 fn mul_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1255 unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) }
1256 }
1257 #[inline(always)]
1258 fn div_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1259 unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) }
1260 }
1261 #[inline(always)]
1262 fn copysign_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1263 unsafe {
1264 let mask = _mm_set1_pd(-0.0);
1265 _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self)
1266 }
1267 }
1268 #[inline(always)]
1269 fn simd_eq_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1270 unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) }
1271 }
1272 #[inline(always)]
1273 fn simd_lt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1274 unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) }
1275 }
1276 #[inline(always)]
1277 fn simd_le_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1278 unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) }
1279 }
1280 #[inline(always)]
1281 fn simd_ge_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1282 unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) }
1283 }
1284 #[inline(always)]
1285 fn simd_gt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1286 unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) }
1287 }
1288 #[inline(always)]
1289 fn zip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1290 unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) }
1291 }
1292 #[inline(always)]
1293 fn zip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1294 unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) }
1295 }
1296 #[inline(always)]
1297 fn unzip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1298 unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) }
1299 }
1300 #[inline(always)]
1301 fn unzip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1302 unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) }
1303 }
1304 #[inline(always)]
1305 fn max_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1306 unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) }
1307 }
1308 #[inline(always)]
1309 fn max_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1310 unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) }
1311 }
1312 #[inline(always)]
1313 fn min_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1314 unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) }
1315 }
1316 #[inline(always)]
1317 fn min_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1318 unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) }
1319 }
1320 #[inline(always)]
1321 fn madd_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
1322 unsafe { _mm_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) }
1323 }
1324 #[inline(always)]
1325 fn msub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
1326 unsafe { _mm_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) }
1327 }
1328 #[inline(always)]
1329 fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1330 unsafe { _mm_floor_pd(a.into()).simd_into(self) }
1331 }
1332 #[inline(always)]
1333 fn fract_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1334 a - a.trunc()
1335 }
1336 #[inline(always)]
1337 fn trunc_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1338 unsafe { _mm_round_pd(a.into(), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC).simd_into(self) }
1339 }
1340 #[inline(always)]
1341 fn select_f64x2(self, a: mask64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
1342 unsafe {
1343 let mask = _mm_castsi128_pd(a.into());
1344 _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, c.into())).simd_into(self)
1345 }
1346 }
1347 #[inline(always)]
1348 fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self> {
1349 let mut result = [0.0; 4usize];
1350 result[0..2usize].copy_from_slice(&a.val);
1351 result[2usize..4usize].copy_from_slice(&b.val);
1352 result.simd_into(self)
1353 }
1354 #[inline(always)]
1355 fn reinterpret_f32_f64x2(self, a: f64x2<Self>) -> f32x4<Self> {
1356 f32x4 {
1357 val: bytemuck::cast(a.val),
1358 simd: a.simd,
1359 }
1360 }
1361 #[inline(always)]
1362 fn splat_mask64x2(self, val: i64) -> mask64x2<Self> {
1363 unsafe { _mm_set1_epi64x(val).simd_into(self) }
1364 }
1365 #[inline(always)]
1366 fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
1367 a ^ !0
1368 }
1369 #[inline(always)]
1370 fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1371 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1372 }
1373 #[inline(always)]
1374 fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1375 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1376 }
1377 #[inline(always)]
1378 fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1379 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1380 }
1381 #[inline(always)]
1382 fn select_mask64x2(
1383 self,
1384 a: mask64x2<Self>,
1385 b: mask64x2<Self>,
1386 c: mask64x2<Self>,
1387 ) -> mask64x2<Self> {
1388 unsafe {
1389 _mm_or_si128(
1390 _mm_and_si128(a.into(), b.into()),
1391 _mm_andnot_si128(a.into(), c.into()),
1392 )
1393 .simd_into(self)
1394 }
1395 }
1396 #[inline(always)]
1397 fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1398 unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) }
1399 }
1400 #[inline(always)]
1401 fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
1402 let mut result = [0; 4usize];
1403 result[0..2usize].copy_from_slice(&a.val);
1404 result[2usize..4usize].copy_from_slice(&b.val);
1405 result.simd_into(self)
1406 }
1407 #[inline(always)]
1408 fn splat_f32x8(self, a: f32) -> f32x8<Self> {
1409 let half = self.splat_f32x4(a);
1410 self.combine_f32x4(half, half)
1411 }
1412 #[inline(always)]
1413 fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1414 let (a0, a1) = self.split_f32x8(a);
1415 self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1))
1416 }
1417 #[inline(always)]
1418 fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1419 let (a0, a1) = self.split_f32x8(a);
1420 self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1))
1421 }
1422 #[inline(always)]
1423 fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1424 let (a0, a1) = self.split_f32x8(a);
1425 self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
1426 }
1427 #[inline(always)]
1428 fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1429 let (a0, a1) = self.split_f32x8(a);
1430 let (b0, b1) = self.split_f32x8(b);
1431 self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1))
1432 }
1433 #[inline(always)]
1434 fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1435 let (a0, a1) = self.split_f32x8(a);
1436 let (b0, b1) = self.split_f32x8(b);
1437 self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1))
1438 }
1439 #[inline(always)]
1440 fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1441 let (a0, a1) = self.split_f32x8(a);
1442 let (b0, b1) = self.split_f32x8(b);
1443 self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1))
1444 }
1445 #[inline(always)]
1446 fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1447 let (a0, a1) = self.split_f32x8(a);
1448 let (b0, b1) = self.split_f32x8(b);
1449 self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1))
1450 }
1451 #[inline(always)]
1452 fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1453 let (a0, a1) = self.split_f32x8(a);
1454 let (b0, b1) = self.split_f32x8(b);
1455 self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1))
1456 }
1457 #[inline(always)]
1458 fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1459 let (a0, a1) = self.split_f32x8(a);
1460 let (b0, b1) = self.split_f32x8(b);
1461 self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1))
1462 }
1463 #[inline(always)]
1464 fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1465 let (a0, a1) = self.split_f32x8(a);
1466 let (b0, b1) = self.split_f32x8(b);
1467 self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1))
1468 }
1469 #[inline(always)]
1470 fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1471 let (a0, a1) = self.split_f32x8(a);
1472 let (b0, b1) = self.split_f32x8(b);
1473 self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1))
1474 }
1475 #[inline(always)]
1476 fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1477 let (a0, a1) = self.split_f32x8(a);
1478 let (b0, b1) = self.split_f32x8(b);
1479 self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1))
1480 }
1481 #[inline(always)]
1482 fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1483 let (a0, a1) = self.split_f32x8(a);
1484 let (b0, b1) = self.split_f32x8(b);
1485 self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1))
1486 }
1487 #[inline(always)]
1488 fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1489 let (a0, _) = self.split_f32x8(a);
1490 let (b0, _) = self.split_f32x8(b);
1491 self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0))
1492 }
1493 #[inline(always)]
1494 fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1495 let (_, a1) = self.split_f32x8(a);
1496 let (_, b1) = self.split_f32x8(b);
1497 self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1))
1498 }
1499 #[inline(always)]
1500 fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1501 let (a0, a1) = self.split_f32x8(a);
1502 let (b0, b1) = self.split_f32x8(b);
1503 self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1))
1504 }
1505 #[inline(always)]
1506 fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1507 let (a0, a1) = self.split_f32x8(a);
1508 let (b0, b1) = self.split_f32x8(b);
1509 self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1))
1510 }
1511 #[inline(always)]
1512 fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1513 let (a0, a1) = self.split_f32x8(a);
1514 let (b0, b1) = self.split_f32x8(b);
1515 self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1))
1516 }
1517 #[inline(always)]
1518 fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1519 let (a0, a1) = self.split_f32x8(a);
1520 let (b0, b1) = self.split_f32x8(b);
1521 self.combine_f32x4(
1522 self.max_precise_f32x4(a0, b0),
1523 self.max_precise_f32x4(a1, b1),
1524 )
1525 }
1526 #[inline(always)]
1527 fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1528 let (a0, a1) = self.split_f32x8(a);
1529 let (b0, b1) = self.split_f32x8(b);
1530 self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1))
1531 }
1532 #[inline(always)]
1533 fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1534 let (a0, a1) = self.split_f32x8(a);
1535 let (b0, b1) = self.split_f32x8(b);
1536 self.combine_f32x4(
1537 self.min_precise_f32x4(a0, b0),
1538 self.min_precise_f32x4(a1, b1),
1539 )
1540 }
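// madd and msub are the fused multiply-add / multiply-subtract forms, again
// delegated to the two 128-bit halves.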
1541 #[inline(always)]
1542 fn madd_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
1543 let (a0, a1) = self.split_f32x8(a);
1544 let (b0, b1) = self.split_f32x8(b);
1545 let (c0, c1) = self.split_f32x8(c);
1546 self.combine_f32x4(self.madd_f32x4(a0, b0, c0), self.madd_f32x4(a1, b1, c1))
1547 }
1548 #[inline(always)]
1549 fn msub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
1550 let (a0, a1) = self.split_f32x8(a);
1551 let (b0, b1) = self.split_f32x8(b);
1552 let (c0, c1) = self.split_f32x8(c);
1553 self.combine_f32x4(self.msub_f32x4(a0, b0, c0), self.msub_f32x4(a1, b1, c1))
1554 }
1555 #[inline(always)]
1556 fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1557 let (a0, a1) = self.split_f32x8(a);
1558 self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
1559 }
1560 #[inline(always)]
1561 fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1562 let (a0, a1) = self.split_f32x8(a);
1563 self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1))
1564 }
1565 #[inline(always)]
1566 fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1567 let (a0, a1) = self.split_f32x8(a);
1568 self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1))
1569 }
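// select chooses each lane from `b` or `c` according to the corresponding
// mask lane, handled per 128-bit half.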
1570 #[inline(always)]
1571 fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
1572 let (a0, a1) = self.split_mask32x8(a);
1573 let (b0, b1) = self.split_f32x8(b);
1574 let (c0, c1) = self.split_f32x8(c);
1575 self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1))
1576 }
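// combine and split cross vector widths by copying lanes through a temporary
// scalar array rather than using an intrinsic.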
1577 #[inline(always)]
1578 fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
1579 let mut result = [0.0; 16usize];
1580 result[0..8usize].copy_from_slice(&a.val);
1581 result[8usize..16usize].copy_from_slice(&b.val);
1582 result.simd_into(self)
1583 }
1584 #[inline(always)]
1585 fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
1586 let mut b0 = [0.0; 4usize];
1587 let mut b1 = [0.0; 4usize];
1588 b0.copy_from_slice(&a.val[0..4usize]);
1589 b1.copy_from_slice(&a.val[4usize..8usize]);
1590 (b0.simd_into(self), b1.simd_into(self))
1591 }
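// The reinterpret_* methods bitcast the lanes to another element type while
// preserving the underlying bytes, one 128-bit half at a time.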
1592 #[inline(always)]
1593 fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
1594 let (a0, a1) = self.split_f32x8(a);
1595 self.combine_f64x2(
1596 self.reinterpret_f64_f32x4(a0),
1597 self.reinterpret_f64_f32x4(a1),
1598 )
1599 }
1600 #[inline(always)]
1601 fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
1602 let (a0, a1) = self.split_f32x8(a);
1603 self.combine_i32x4(
1604 self.reinterpret_i32_f32x4(a0),
1605 self.reinterpret_i32_f32x4(a1),
1606 )
1607 }
1608 #[inline(always)]
1609 fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
1610 let (a0, a1) = self.split_f32x8(a);
1611 self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1))
1612 }
1613 #[inline(always)]
1614 fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
1615 let (a0, a1) = self.split_f32x8(a);
1616 self.combine_u32x4(
1617 self.reinterpret_u32_f32x4(a0),
1618 self.reinterpret_u32_f32x4(a1),
1619 )
1620 }
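// cvt_u32/cvt_i32 convert each f32 lane to the integer type, one 128-bit
// half at a time.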
1621 #[inline(always)]
1622 fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
1623 let (a0, a1) = self.split_f32x8(a);
1624 self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1))
1625 }
1626 #[inline(always)]
1627 fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
1628 let (a0, a1) = self.split_f32x8(a);
1629 self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1))
1630 }
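// splat broadcasts the scalar into one 16-lane half and duplicates that half
// into both halves of the 32-lane vector.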
1631 #[inline(always)]
1632 fn splat_i8x32(self, a: i8) -> i8x32<Self> {
1633 let half = self.splat_i8x16(a);
1634 self.combine_i8x16(half, half)
1635 }
1636 #[inline(always)]
1637 fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
1638 let (a0, a1) = self.split_i8x32(a);
1639 self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1))
1640 }
1641 #[inline(always)]
1642 fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1643 let (a0, a1) = self.split_i8x32(a);
1644 let (b0, b1) = self.split_i8x32(b);
1645 self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1))
1646 }
1647 #[inline(always)]
1648 fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1649 let (a0, a1) = self.split_i8x32(a);
1650 let (b0, b1) = self.split_i8x32(b);
1651 self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1))
1652 }
1653 #[inline(always)]
1654 fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1655 let (a0, a1) = self.split_i8x32(a);
1656 let (b0, b1) = self.split_i8x32(b);
1657 self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1))
1658 }
1659 #[inline(always)]
1660 fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1661 let (a0, a1) = self.split_i8x32(a);
1662 let (b0, b1) = self.split_i8x32(b);
1663 self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1))
1664 }
1665 #[inline(always)]
1666 fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1667 let (a0, a1) = self.split_i8x32(a);
1668 let (b0, b1) = self.split_i8x32(b);
1669 self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1))
1670 }
1671 #[inline(always)]
1672 fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1673 let (a0, a1) = self.split_i8x32(a);
1674 let (b0, b1) = self.split_i8x32(b);
1675 self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1))
1676 }
1677 #[inline(always)]
1678 fn shr_i8x32(self, a: i8x32<Self>, b: u32) -> i8x32<Self> {
1679 let (a0, a1) = self.split_i8x32(a);
1680 self.combine_i8x16(self.shr_i8x16(a0, b), self.shr_i8x16(a1, b))
1681 }
1682 #[inline(always)]
1683 fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1684 let (a0, a1) = self.split_i8x32(a);
1685 let (b0, b1) = self.split_i8x32(b);
1686 self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1))
1687 }
1688 #[inline(always)]
1689 fn shl_i8x32(self, a: i8x32<Self>, b: u32) -> i8x32<Self> {
1690 let (a0, a1) = self.split_i8x32(a);
1691 self.combine_i8x16(self.shl_i8x16(a0, b), self.shl_i8x16(a1, b))
1692 }
1693 #[inline(always)]
1694 fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1695 let (a0, a1) = self.split_i8x32(a);
1696 let (b0, b1) = self.split_i8x32(b);
1697 self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1))
1698 }
1699 #[inline(always)]
1700 fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1701 let (a0, a1) = self.split_i8x32(a);
1702 let (b0, b1) = self.split_i8x32(b);
1703 self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1))
1704 }
1705 #[inline(always)]
1706 fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1707 let (a0, a1) = self.split_i8x32(a);
1708 let (b0, b1) = self.split_i8x32(b);
1709 self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1))
1710 }
1711 #[inline(always)]
1712 fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1713 let (a0, a1) = self.split_i8x32(a);
1714 let (b0, b1) = self.split_i8x32(b);
1715 self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1))
1716 }
1717 #[inline(always)]
1718 fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1719 let (a0, a1) = self.split_i8x32(a);
1720 let (b0, b1) = self.split_i8x32(b);
1721 self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1))
1722 }
1723 #[inline(always)]
1724 fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1725 let (a0, _) = self.split_i8x32(a);
1726 let (b0, _) = self.split_i8x32(b);
1727 self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0))
1728 }
1729 #[inline(always)]
1730 fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1731 let (_, a1) = self.split_i8x32(a);
1732 let (_, b1) = self.split_i8x32(b);
1733 self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1))
1734 }
1735 #[inline(always)]
1736 fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1737 let (a0, a1) = self.split_i8x32(a);
1738 let (b0, b1) = self.split_i8x32(b);
1739 self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1))
1740 }
1741 #[inline(always)]
1742 fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1743 let (a0, a1) = self.split_i8x32(a);
1744 let (b0, b1) = self.split_i8x32(b);
1745 self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1))
1746 }
1747 #[inline(always)]
1748 fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
1749 let (a0, a1) = self.split_mask8x32(a);
1750 let (b0, b1) = self.split_i8x32(b);
1751 let (c0, c1) = self.split_i8x32(c);
1752 self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1))
1753 }
1754 #[inline(always)]
1755 fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1756 let (a0, a1) = self.split_i8x32(a);
1757 let (b0, b1) = self.split_i8x32(b);
1758 self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1))
1759 }
1760 #[inline(always)]
1761 fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1762 let (a0, a1) = self.split_i8x32(a);
1763 let (b0, b1) = self.split_i8x32(b);
1764 self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1))
1765 }
1766 #[inline(always)]
1767 fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
1768 let mut result = [0; 64usize];
1769 result[0..32usize].copy_from_slice(&a.val);
1770 result[32usize..64usize].copy_from_slice(&b.val);
1771 result.simd_into(self)
1772 }
1773 #[inline(always)]
1774 fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
1775 let mut b0 = [0; 16usize];
1776 let mut b1 = [0; 16usize];
1777 b0.copy_from_slice(&a.val[0..16usize]);
1778 b1.copy_from_slice(&a.val[16usize..32usize]);
1779 (b0.simd_into(self), b1.simd_into(self))
1780 }
1781 #[inline(always)]
1782 fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
1783 let (a0, a1) = self.split_i8x32(a);
1784 self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1))
1785 }
1786 #[inline(always)]
1787 fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
1788 let (a0, a1) = self.split_i8x32(a);
1789 self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1))
1790 }
1791 #[inline(always)]
1792 fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
1793 let (a0, a1) = self.split_i8x32(a);
1794 self.combine_u32x4(
1795 self.reinterpret_u32_i8x16(a0),
1796 self.reinterpret_u32_i8x16(a1),
1797 )
1798 }
1799 #[inline(always)]
1800 fn splat_u8x32(self, a: u8) -> u8x32<Self> {
1801 let half = self.splat_u8x16(a);
1802 self.combine_u8x16(half, half)
1803 }
1804 #[inline(always)]
1805 fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
1806 let (a0, a1) = self.split_u8x32(a);
1807 self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1))
1808 }
1809 #[inline(always)]
1810 fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1811 let (a0, a1) = self.split_u8x32(a);
1812 let (b0, b1) = self.split_u8x32(b);
1813 self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1))
1814 }
1815 #[inline(always)]
1816 fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1817 let (a0, a1) = self.split_u8x32(a);
1818 let (b0, b1) = self.split_u8x32(b);
1819 self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1))
1820 }
1821 #[inline(always)]
1822 fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1823 let (a0, a1) = self.split_u8x32(a);
1824 let (b0, b1) = self.split_u8x32(b);
1825 self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1))
1826 }
1827 #[inline(always)]
1828 fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1829 let (a0, a1) = self.split_u8x32(a);
1830 let (b0, b1) = self.split_u8x32(b);
1831 self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1))
1832 }
1833 #[inline(always)]
1834 fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1835 let (a0, a1) = self.split_u8x32(a);
1836 let (b0, b1) = self.split_u8x32(b);
1837 self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1))
1838 }
1839 #[inline(always)]
1840 fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1841 let (a0, a1) = self.split_u8x32(a);
1842 let (b0, b1) = self.split_u8x32(b);
1843 self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1))
1844 }
1845 #[inline(always)]
1846 fn shr_u8x32(self, a: u8x32<Self>, b: u32) -> u8x32<Self> {
1847 let (a0, a1) = self.split_u8x32(a);
1848 self.combine_u8x16(self.shr_u8x16(a0, b), self.shr_u8x16(a1, b))
1849 }
1850 #[inline(always)]
1851 fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1852 let (a0, a1) = self.split_u8x32(a);
1853 let (b0, b1) = self.split_u8x32(b);
1854 self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1))
1855 }
1856 #[inline(always)]
1857 fn shl_u8x32(self, a: u8x32<Self>, b: u32) -> u8x32<Self> {
1858 let (a0, a1) = self.split_u8x32(a);
1859 self.combine_u8x16(self.shl_u8x16(a0, b), self.shl_u8x16(a1, b))
1860 }
1861 #[inline(always)]
1862 fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1863 let (a0, a1) = self.split_u8x32(a);
1864 let (b0, b1) = self.split_u8x32(b);
1865 self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1))
1866 }
1867 #[inline(always)]
1868 fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1869 let (a0, a1) = self.split_u8x32(a);
1870 let (b0, b1) = self.split_u8x32(b);
1871 self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1))
1872 }
1873 #[inline(always)]
1874 fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1875 let (a0, a1) = self.split_u8x32(a);
1876 let (b0, b1) = self.split_u8x32(b);
1877 self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1))
1878 }
1879 #[inline(always)]
1880 fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1881 let (a0, a1) = self.split_u8x32(a);
1882 let (b0, b1) = self.split_u8x32(b);
1883 self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1))
1884 }
1885 #[inline(always)]
1886 fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1887 let (a0, a1) = self.split_u8x32(a);
1888 let (b0, b1) = self.split_u8x32(b);
1889 self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1))
1890 }
1891 #[inline(always)]
1892 fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1893 let (a0, _) = self.split_u8x32(a);
1894 let (b0, _) = self.split_u8x32(b);
1895 self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0))
1896 }
1897 #[inline(always)]
1898 fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1899 let (_, a1) = self.split_u8x32(a);
1900 let (_, b1) = self.split_u8x32(b);
1901 self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1))
1902 }
1903 #[inline(always)]
1904 fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1905 let (a0, a1) = self.split_u8x32(a);
1906 let (b0, b1) = self.split_u8x32(b);
1907 self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1))
1908 }
1909 #[inline(always)]
1910 fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1911 let (a0, a1) = self.split_u8x32(a);
1912 let (b0, b1) = self.split_u8x32(b);
1913 self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1))
1914 }
1915 #[inline(always)]
1916 fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
1917 let (a0, a1) = self.split_mask8x32(a);
1918 let (b0, b1) = self.split_u8x32(b);
1919 let (c0, c1) = self.split_u8x32(c);
1920 self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1))
1921 }
1922 #[inline(always)]
1923 fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1924 let (a0, a1) = self.split_u8x32(a);
1925 let (b0, b1) = self.split_u8x32(b);
1926 self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1))
1927 }
1928 #[inline(always)]
1929 fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1930 let (a0, a1) = self.split_u8x32(a);
1931 let (b0, b1) = self.split_u8x32(b);
1932 self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1))
1933 }
1934 #[inline(always)]
1935 fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
1936 let mut result = [0; 64usize];
1937 result[0..32usize].copy_from_slice(&a.val);
1938 result[32usize..64usize].copy_from_slice(&b.val);
1939 result.simd_into(self)
1940 }
1941 #[inline(always)]
1942 fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
1943 let mut b0 = [0; 16usize];
1944 let mut b1 = [0; 16usize];
1945 b0.copy_from_slice(&a.val[0..16usize]);
1946 b1.copy_from_slice(&a.val[16usize..32usize]);
1947 (b0.simd_into(self), b1.simd_into(self))
1948 }
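// widen zero-extends each u8 lane to u16; the two 16-lane halves are widened
// separately and combined into a u16x32.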
1949 #[inline(always)]
1950 fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
1951 let (a0, a1) = self.split_u8x32(a);
1952 self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1))
1953 }
1954 #[inline(always)]
1955 fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
1956 let (a0, a1) = self.split_u8x32(a);
1957 self.combine_u32x4(
1958 self.reinterpret_u32_u8x16(a0),
1959 self.reinterpret_u32_u8x16(a1),
1960 )
1961 }
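// The mask8x32 operations reuse the same split/combine pattern on 16-lane
// mask halves.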
1962 #[inline(always)]
1963 fn splat_mask8x32(self, a: i8) -> mask8x32<Self> {
1964 let half = self.splat_mask8x16(a);
1965 self.combine_mask8x16(half, half)
1966 }
1967 #[inline(always)]
1968 fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
1969 let (a0, a1) = self.split_mask8x32(a);
1970 self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1))
1971 }
1972 #[inline(always)]
1973 fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
1974 let (a0, a1) = self.split_mask8x32(a);
1975 let (b0, b1) = self.split_mask8x32(b);
1976 self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1))
1977 }
1978 #[inline(always)]
1979 fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
1980 let (a0, a1) = self.split_mask8x32(a);
1981 let (b0, b1) = self.split_mask8x32(b);
1982 self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1))
1983 }
1984 #[inline(always)]
1985 fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
1986 let (a0, a1) = self.split_mask8x32(a);
1987 let (b0, b1) = self.split_mask8x32(b);
1988 self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1))
1989 }
1990 #[inline(always)]
1991 fn select_mask8x32(
1992 self,
1993 a: mask8x32<Self>,
1994 b: mask8x32<Self>,
1995 c: mask8x32<Self>,
1996 ) -> mask8x32<Self> {
1997 let (a0, a1) = self.split_mask8x32(a);
1998 let (b0, b1) = self.split_mask8x32(b);
1999 let (c0, c1) = self.split_mask8x32(c);
2000 self.combine_mask8x16(
2001 self.select_mask8x16(a0, b0, c0),
2002 self.select_mask8x16(a1, b1, c1),
2003 )
2004 }
2005 #[inline(always)]
2006 fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
2007 let (a0, a1) = self.split_mask8x32(a);
2008 let (b0, b1) = self.split_mask8x32(b);
2009 self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1))
2010 }
2011 #[inline(always)]
2012 fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
2013 let mut result = [0; 64usize];
2014 result[0..32usize].copy_from_slice(&a.val);
2015 result[32usize..64usize].copy_from_slice(&b.val);
2016 result.simd_into(self)
2017 }
2018 #[inline(always)]
2019 fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
2020 let mut b0 = [0; 16usize];
2021 let mut b1 = [0; 16usize];
2022 b0.copy_from_slice(&a.val[0..16usize]);
2023 b1.copy_from_slice(&a.val[16usize..32usize]);
2024 (b0.simd_into(self), b1.simd_into(self))
2025 }
2026 #[inline(always)]
2027 fn splat_i16x16(self, a: i16) -> i16x16<Self> {
2028 let half = self.splat_i16x8(a);
2029 self.combine_i16x8(half, half)
2030 }
2031 #[inline(always)]
2032 fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
2033 let (a0, a1) = self.split_i16x16(a);
2034 self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1))
2035 }
2036 #[inline(always)]
2037 fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2038 let (a0, a1) = self.split_i16x16(a);
2039 let (b0, b1) = self.split_i16x16(b);
2040 self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1))
2041 }
2042 #[inline(always)]
2043 fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2044 let (a0, a1) = self.split_i16x16(a);
2045 let (b0, b1) = self.split_i16x16(b);
2046 self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1))
2047 }
2048 #[inline(always)]
2049 fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2050 let (a0, a1) = self.split_i16x16(a);
2051 let (b0, b1) = self.split_i16x16(b);
2052 self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1))
2053 }
2054 #[inline(always)]
2055 fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2056 let (a0, a1) = self.split_i16x16(a);
2057 let (b0, b1) = self.split_i16x16(b);
2058 self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1))
2059 }
2060 #[inline(always)]
2061 fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2062 let (a0, a1) = self.split_i16x16(a);
2063 let (b0, b1) = self.split_i16x16(b);
2064 self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1))
2065 }
2066 #[inline(always)]
2067 fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2068 let (a0, a1) = self.split_i16x16(a);
2069 let (b0, b1) = self.split_i16x16(b);
2070 self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1))
2071 }
2072 #[inline(always)]
2073 fn shr_i16x16(self, a: i16x16<Self>, b: u32) -> i16x16<Self> {
2074 let (a0, a1) = self.split_i16x16(a);
2075 self.combine_i16x8(self.shr_i16x8(a0, b), self.shr_i16x8(a1, b))
2076 }
2077 #[inline(always)]
2078 fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2079 let (a0, a1) = self.split_i16x16(a);
2080 let (b0, b1) = self.split_i16x16(b);
2081 self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1))
2082 }
2083 #[inline(always)]
2084 fn shl_i16x16(self, a: i16x16<Self>, b: u32) -> i16x16<Self> {
2085 let (a0, a1) = self.split_i16x16(a);
2086 self.combine_i16x8(self.shl_i16x8(a0, b), self.shl_i16x8(a1, b))
2087 }
2088 #[inline(always)]
2089 fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
2090 let (a0, a1) = self.split_i16x16(a);
2091 let (b0, b1) = self.split_i16x16(b);
2092 self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1))
2093 }
2094 #[inline(always)]
2095 fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
2096 let (a0, a1) = self.split_i16x16(a);
2097 let (b0, b1) = self.split_i16x16(b);
2098 self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1))
2099 }
2100 #[inline(always)]
2101 fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
2102 let (a0, a1) = self.split_i16x16(a);
2103 let (b0, b1) = self.split_i16x16(b);
2104 self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1))
2105 }
2106 #[inline(always)]
2107 fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
2108 let (a0, a1) = self.split_i16x16(a);
2109 let (b0, b1) = self.split_i16x16(b);
2110 self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1))
2111 }
2112 #[inline(always)]
2113 fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
2114 let (a0, a1) = self.split_i16x16(a);
2115 let (b0, b1) = self.split_i16x16(b);
2116 self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1))
2117 }
2118 #[inline(always)]
2119 fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2120 let (a0, _) = self.split_i16x16(a);
2121 let (b0, _) = self.split_i16x16(b);
2122 self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0))
2123 }
2124 #[inline(always)]
2125 fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2126 let (_, a1) = self.split_i16x16(a);
2127 let (_, b1) = self.split_i16x16(b);
2128 self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1))
2129 }
2130 #[inline(always)]
2131 fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2132 let (a0, a1) = self.split_i16x16(a);
2133 let (b0, b1) = self.split_i16x16(b);
2134 self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1))
2135 }
2136 #[inline(always)]
2137 fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2138 let (a0, a1) = self.split_i16x16(a);
2139 let (b0, b1) = self.split_i16x16(b);
2140 self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1))
2141 }
2142 #[inline(always)]
2143 fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
2144 let (a0, a1) = self.split_mask16x16(a);
2145 let (b0, b1) = self.split_i16x16(b);
2146 let (c0, c1) = self.split_i16x16(c);
2147 self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1))
2148 }
2149 #[inline(always)]
2150 fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2151 let (a0, a1) = self.split_i16x16(a);
2152 let (b0, b1) = self.split_i16x16(b);
2153 self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1))
2154 }
2155 #[inline(always)]
2156 fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2157 let (a0, a1) = self.split_i16x16(a);
2158 let (b0, b1) = self.split_i16x16(b);
2159 self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1))
2160 }
2161 #[inline(always)]
2162 fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
2163 let mut result = [0; 32usize];
2164 result[0..16usize].copy_from_slice(&a.val);
2165 result[16usize..32usize].copy_from_slice(&b.val);
2166 result.simd_into(self)
2167 }
2168 #[inline(always)]
2169 fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
2170 let mut b0 = [0; 8usize];
2171 let mut b1 = [0; 8usize];
2172 b0.copy_from_slice(&a.val[0..8usize]);
2173 b1.copy_from_slice(&a.val[8usize..16usize]);
2174 (b0.simd_into(self), b1.simd_into(self))
2175 }
2176 #[inline(always)]
2177 fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
2178 let (a0, a1) = self.split_i16x16(a);
2179 self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1))
2180 }
2181 #[inline(always)]
2182 fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
2183 let (a0, a1) = self.split_i16x16(a);
2184 self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1))
2185 }
2186 #[inline(always)]
2187 fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
2188 let (a0, a1) = self.split_i16x16(a);
2189 self.combine_u32x4(
2190 self.reinterpret_u32_i16x8(a0),
2191 self.reinterpret_u32_i16x8(a1),
2192 )
2193 }
2194 #[inline(always)]
2195 fn splat_u16x16(self, a: u16) -> u16x16<Self> {
2196 let half = self.splat_u16x8(a);
2197 self.combine_u16x8(half, half)
2198 }
2199 #[inline(always)]
2200 fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
2201 let (a0, a1) = self.split_u16x16(a);
2202 self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1))
2203 }
2204 #[inline(always)]
2205 fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2206 let (a0, a1) = self.split_u16x16(a);
2207 let (b0, b1) = self.split_u16x16(b);
2208 self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1))
2209 }
2210 #[inline(always)]
2211 fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2212 let (a0, a1) = self.split_u16x16(a);
2213 let (b0, b1) = self.split_u16x16(b);
2214 self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1))
2215 }
2216 #[inline(always)]
2217 fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2218 let (a0, a1) = self.split_u16x16(a);
2219 let (b0, b1) = self.split_u16x16(b);
2220 self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1))
2221 }
2222 #[inline(always)]
2223 fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2224 let (a0, a1) = self.split_u16x16(a);
2225 let (b0, b1) = self.split_u16x16(b);
2226 self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1))
2227 }
2228 #[inline(always)]
2229 fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2230 let (a0, a1) = self.split_u16x16(a);
2231 let (b0, b1) = self.split_u16x16(b);
2232 self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1))
2233 }
2234 #[inline(always)]
2235 fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2236 let (a0, a1) = self.split_u16x16(a);
2237 let (b0, b1) = self.split_u16x16(b);
2238 self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1))
2239 }
2240 #[inline(always)]
2241 fn shr_u16x16(self, a: u16x16<Self>, b: u32) -> u16x16<Self> {
2242 let (a0, a1) = self.split_u16x16(a);
2243 self.combine_u16x8(self.shr_u16x8(a0, b), self.shr_u16x8(a1, b))
2244 }
2245 #[inline(always)]
2246 fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2247 let (a0, a1) = self.split_u16x16(a);
2248 let (b0, b1) = self.split_u16x16(b);
2249 self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1))
2250 }
2251 #[inline(always)]
2252 fn shl_u16x16(self, a: u16x16<Self>, b: u32) -> u16x16<Self> {
2253 let (a0, a1) = self.split_u16x16(a);
2254 self.combine_u16x8(self.shl_u16x8(a0, b), self.shl_u16x8(a1, b))
2255 }
2256 #[inline(always)]
2257 fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2258 let (a0, a1) = self.split_u16x16(a);
2259 let (b0, b1) = self.split_u16x16(b);
2260 self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1))
2261 }
2262 #[inline(always)]
2263 fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2264 let (a0, a1) = self.split_u16x16(a);
2265 let (b0, b1) = self.split_u16x16(b);
2266 self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1))
2267 }
2268 #[inline(always)]
2269 fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2270 let (a0, a1) = self.split_u16x16(a);
2271 let (b0, b1) = self.split_u16x16(b);
2272 self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1))
2273 }
2274 #[inline(always)]
2275 fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2276 let (a0, a1) = self.split_u16x16(a);
2277 let (b0, b1) = self.split_u16x16(b);
2278 self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1))
2279 }
2280 #[inline(always)]
2281 fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2282 let (a0, a1) = self.split_u16x16(a);
2283 let (b0, b1) = self.split_u16x16(b);
2284 self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1))
2285 }
2286 #[inline(always)]
2287 fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2288 let (a0, _) = self.split_u16x16(a);
2289 let (b0, _) = self.split_u16x16(b);
2290 self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0))
2291 }
2292 #[inline(always)]
2293 fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2294 let (_, a1) = self.split_u16x16(a);
2295 let (_, b1) = self.split_u16x16(b);
2296 self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1))
2297 }
2298 #[inline(always)]
2299 fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2300 let (a0, a1) = self.split_u16x16(a);
2301 let (b0, b1) = self.split_u16x16(b);
2302 self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1))
2303 }
2304 #[inline(always)]
2305 fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2306 let (a0, a1) = self.split_u16x16(a);
2307 let (b0, b1) = self.split_u16x16(b);
2308 self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1))
2309 }
2310 #[inline(always)]
2311 fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
2312 let (a0, a1) = self.split_mask16x16(a);
2313 let (b0, b1) = self.split_u16x16(b);
2314 let (c0, c1) = self.split_u16x16(c);
2315 self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1))
2316 }
2317 #[inline(always)]
2318 fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2319 let (a0, a1) = self.split_u16x16(a);
2320 let (b0, b1) = self.split_u16x16(b);
2321 self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1))
2322 }
2323 #[inline(always)]
2324 fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2325 let (a0, a1) = self.split_u16x16(a);
2326 let (b0, b1) = self.split_u16x16(b);
2327 self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1))
2328 }
2329 #[inline(always)]
2330 fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
2331 let mut result = [0; 32usize];
2332 result[0..16usize].copy_from_slice(&a.val);
2333 result[16usize..32usize].copy_from_slice(&b.val);
2334 result.simd_into(self)
2335 }
2336 #[inline(always)]
2337 fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
2338 let mut b0 = [0; 8usize];
2339 let mut b1 = [0; 8usize];
2340 b0.copy_from_slice(&a.val[0..8usize]);
2341 b1.copy_from_slice(&a.val[8usize..16usize]);
2342 (b0.simd_into(self), b1.simd_into(self))
2343 }
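// narrow_u16x16 truncates each 16-bit lane to its low byte: both halves are
// masked with 0x00FF first, so `_mm_packus_epi16` never saturates and simply
// packs the low bytes of all 16 lanes into one u8x16.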
2344 #[inline(always)]
2345 fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
2346 let (a, b) = self.split_u16x16(a);
2347 unsafe {
2348 let mask = _mm_set1_epi16(0xFF);
2349 let lo_masked = _mm_and_si128(a.into(), mask);
2350 let hi_masked = _mm_and_si128(b.into(), mask);
2351 let result = _mm_packus_epi16(lo_masked, hi_masked);
2352 result.simd_into(self)
2353 }
2354 }
2355 #[inline(always)]
2356 fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
2357 let (a0, a1) = self.split_u16x16(a);
2358 self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1))
2359 }
2360 #[inline(always)]
2361 fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
2362 let (a0, a1) = self.split_u16x16(a);
2363 self.combine_u32x4(
2364 self.reinterpret_u32_u16x8(a0),
2365 self.reinterpret_u32_u16x8(a1),
2366 )
2367 }
2368 #[inline(always)]
2369 fn splat_mask16x16(self, a: i16) -> mask16x16<Self> {
2370 let half = self.splat_mask16x8(a);
2371 self.combine_mask16x8(half, half)
2372 }
2373 #[inline(always)]
2374 fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
2375 let (a0, a1) = self.split_mask16x16(a);
2376 self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1))
2377 }
2378 #[inline(always)]
2379 fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2380 let (a0, a1) = self.split_mask16x16(a);
2381 let (b0, b1) = self.split_mask16x16(b);
2382 self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1))
2383 }
2384 #[inline(always)]
2385 fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2386 let (a0, a1) = self.split_mask16x16(a);
2387 let (b0, b1) = self.split_mask16x16(b);
2388 self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1))
2389 }
2390 #[inline(always)]
2391 fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2392 let (a0, a1) = self.split_mask16x16(a);
2393 let (b0, b1) = self.split_mask16x16(b);
2394 self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1))
2395 }
2396 #[inline(always)]
2397 fn select_mask16x16(
2398 self,
2399 a: mask16x16<Self>,
2400 b: mask16x16<Self>,
2401 c: mask16x16<Self>,
2402 ) -> mask16x16<Self> {
2403 let (a0, a1) = self.split_mask16x16(a);
2404 let (b0, b1) = self.split_mask16x16(b);
2405 let (c0, c1) = self.split_mask16x16(c);
2406 self.combine_mask16x8(
2407 self.select_mask16x8(a0, b0, c0),
2408 self.select_mask16x8(a1, b1, c1),
2409 )
2410 }
2411 #[inline(always)]
2412 fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2413 let (a0, a1) = self.split_mask16x16(a);
2414 let (b0, b1) = self.split_mask16x16(b);
2415 self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1))
2416 }
2417 #[inline(always)]
2418 fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
2419 let mut result = [0; 32usize];
2420 result[0..16usize].copy_from_slice(&a.val);
2421 result[16usize..32usize].copy_from_slice(&b.val);
2422 result.simd_into(self)
2423 }
2424 #[inline(always)]
2425 fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
2426 let mut b0 = [0; 8usize];
2427 let mut b1 = [0; 8usize];
2428 b0.copy_from_slice(&a.val[0..8usize]);
2429 b1.copy_from_slice(&a.val[8usize..16usize]);
2430 (b0.simd_into(self), b1.simd_into(self))
2431 }
2432 #[inline(always)]
2433 fn splat_i32x8(self, a: i32) -> i32x8<Self> {
2434 let half = self.splat_i32x4(a);
2435 self.combine_i32x4(half, half)
2436 }
2437 #[inline(always)]
2438 fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
2439 let (a0, a1) = self.split_i32x8(a);
2440 self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1))
2441 }
2442 #[inline(always)]
2443 fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2444 let (a0, a1) = self.split_i32x8(a);
2445 let (b0, b1) = self.split_i32x8(b);
2446 self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1))
2447 }
2448 #[inline(always)]
2449 fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2450 let (a0, a1) = self.split_i32x8(a);
2451 let (b0, b1) = self.split_i32x8(b);
2452 self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1))
2453 }
2454 #[inline(always)]
2455 fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2456 let (a0, a1) = self.split_i32x8(a);
2457 let (b0, b1) = self.split_i32x8(b);
2458 self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1))
2459 }
2460 #[inline(always)]
2461 fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2462 let (a0, a1) = self.split_i32x8(a);
2463 let (b0, b1) = self.split_i32x8(b);
2464 self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1))
2465 }
2466 #[inline(always)]
2467 fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2468 let (a0, a1) = self.split_i32x8(a);
2469 let (b0, b1) = self.split_i32x8(b);
2470 self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1))
2471 }
2472 #[inline(always)]
2473 fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2474 let (a0, a1) = self.split_i32x8(a);
2475 let (b0, b1) = self.split_i32x8(b);
2476 self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1))
2477 }
2478 #[inline(always)]
2479 fn shr_i32x8(self, a: i32x8<Self>, b: u32) -> i32x8<Self> {
2480 let (a0, a1) = self.split_i32x8(a);
2481 self.combine_i32x4(self.shr_i32x4(a0, b), self.shr_i32x4(a1, b))
2482 }
2483 #[inline(always)]
2484 fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2485 let (a0, a1) = self.split_i32x8(a);
2486 let (b0, b1) = self.split_i32x8(b);
2487 self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1))
2488 }
2489 #[inline(always)]
2490 fn shl_i32x8(self, a: i32x8<Self>, b: u32) -> i32x8<Self> {
2491 let (a0, a1) = self.split_i32x8(a);
2492 self.combine_i32x4(self.shl_i32x4(a0, b), self.shl_i32x4(a1, b))
2493 }
2494 #[inline(always)]
2495 fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2496 let (a0, a1) = self.split_i32x8(a);
2497 let (b0, b1) = self.split_i32x8(b);
2498 self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1))
2499 }
2500 #[inline(always)]
2501 fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2502 let (a0, a1) = self.split_i32x8(a);
2503 let (b0, b1) = self.split_i32x8(b);
2504 self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1))
2505 }
2506 #[inline(always)]
2507 fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2508 let (a0, a1) = self.split_i32x8(a);
2509 let (b0, b1) = self.split_i32x8(b);
2510 self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1))
2511 }
2512 #[inline(always)]
2513 fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2514 let (a0, a1) = self.split_i32x8(a);
2515 let (b0, b1) = self.split_i32x8(b);
2516 self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1))
2517 }
2518 #[inline(always)]
2519 fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2520 let (a0, a1) = self.split_i32x8(a);
2521 let (b0, b1) = self.split_i32x8(b);
2522 self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1))
2523 }
2524 #[inline(always)]
2525 fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2526 let (a0, _) = self.split_i32x8(a);
2527 let (b0, _) = self.split_i32x8(b);
2528 self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0))
2529 }
2530 #[inline(always)]
2531 fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2532 let (_, a1) = self.split_i32x8(a);
2533 let (_, b1) = self.split_i32x8(b);
2534 self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1))
2535 }
2536 #[inline(always)]
2537 fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2538 let (a0, a1) = self.split_i32x8(a);
2539 let (b0, b1) = self.split_i32x8(b);
2540 self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1))
2541 }
2542 #[inline(always)]
2543 fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2544 let (a0, a1) = self.split_i32x8(a);
2545 let (b0, b1) = self.split_i32x8(b);
2546 self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1))
2547 }
2548 #[inline(always)]
2549 fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
2550 let (a0, a1) = self.split_mask32x8(a);
2551 let (b0, b1) = self.split_i32x8(b);
2552 let (c0, c1) = self.split_i32x8(c);
2553 self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1))
2554 }
2555 #[inline(always)]
2556 fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2557 let (a0, a1) = self.split_i32x8(a);
2558 let (b0, b1) = self.split_i32x8(b);
2559 self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1))
2560 }
2561 #[inline(always)]
2562 fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2563 let (a0, a1) = self.split_i32x8(a);
2564 let (b0, b1) = self.split_i32x8(b);
2565 self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1))
2566 }
2567 #[inline(always)]
2568 fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
2569 let mut result = [0; 16usize];
2570 result[0..8usize].copy_from_slice(&a.val);
2571 result[8usize..16usize].copy_from_slice(&b.val);
2572 result.simd_into(self)
2573 }
2574 #[inline(always)]
2575 fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
2576 let mut b0 = [0; 4usize];
2577 let mut b1 = [0; 4usize];
2578 b0.copy_from_slice(&a.val[0..4usize]);
2579 b1.copy_from_slice(&a.val[4usize..8usize]);
2580 (b0.simd_into(self), b1.simd_into(self))
2581 }
2582 #[inline(always)]
2583 fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
2584 let (a0, a1) = self.split_i32x8(a);
2585 self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1))
2586 }
2587 #[inline(always)]
2588 fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
2589 let (a0, a1) = self.split_i32x8(a);
2590 self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1))
2591 }
2592 #[inline(always)]
2593 fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
2594 let (a0, a1) = self.split_i32x8(a);
2595 self.combine_u32x4(
2596 self.reinterpret_u32_i32x4(a0),
2597 self.reinterpret_u32_i32x4(a1),
2598 )
2599 }
2600 #[inline(always)]
2601 fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
2602 let (a0, a1) = self.split_i32x8(a);
2603 self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1))
2604 }
2605 #[inline(always)]
2606 fn splat_u32x8(self, a: u32) -> u32x8<Self> {
2607 let half = self.splat_u32x4(a);
2608 self.combine_u32x4(half, half)
2609 }
2610 #[inline(always)]
2611 fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
2612 let (a0, a1) = self.split_u32x8(a);
2613 self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1))
2614 }
2615 #[inline(always)]
2616 fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2617 let (a0, a1) = self.split_u32x8(a);
2618 let (b0, b1) = self.split_u32x8(b);
2619 self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1))
2620 }
2621 #[inline(always)]
2622 fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2623 let (a0, a1) = self.split_u32x8(a);
2624 let (b0, b1) = self.split_u32x8(b);
2625 self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1))
2626 }
2627 #[inline(always)]
2628 fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2629 let (a0, a1) = self.split_u32x8(a);
2630 let (b0, b1) = self.split_u32x8(b);
2631 self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1))
2632 }
2633 #[inline(always)]
2634 fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2635 let (a0, a1) = self.split_u32x8(a);
2636 let (b0, b1) = self.split_u32x8(b);
2637 self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1))
2638 }
2639 #[inline(always)]
2640 fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2641 let (a0, a1) = self.split_u32x8(a);
2642 let (b0, b1) = self.split_u32x8(b);
2643 self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1))
2644 }
2645 #[inline(always)]
2646 fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2647 let (a0, a1) = self.split_u32x8(a);
2648 let (b0, b1) = self.split_u32x8(b);
2649 self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1))
2650 }
2651 #[inline(always)]
2652 fn shr_u32x8(self, a: u32x8<Self>, b: u32) -> u32x8<Self> {
2653 let (a0, a1) = self.split_u32x8(a);
2654 self.combine_u32x4(self.shr_u32x4(a0, b), self.shr_u32x4(a1, b))
2655 }
2656 #[inline(always)]
2657 fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2658 let (a0, a1) = self.split_u32x8(a);
2659 let (b0, b1) = self.split_u32x8(b);
2660 self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1))
2661 }
2662 #[inline(always)]
2663 fn shl_u32x8(self, a: u32x8<Self>, b: u32) -> u32x8<Self> {
2664 let (a0, a1) = self.split_u32x8(a);
2665 self.combine_u32x4(self.shl_u32x4(a0, b), self.shl_u32x4(a1, b))
2666 }
2667 #[inline(always)]
2668 fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2669 let (a0, a1) = self.split_u32x8(a);
2670 let (b0, b1) = self.split_u32x8(b);
2671 self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1))
2672 }
2673 #[inline(always)]
2674 fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2675 let (a0, a1) = self.split_u32x8(a);
2676 let (b0, b1) = self.split_u32x8(b);
2677 self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1))
2678 }
2679 #[inline(always)]
2680 fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2681 let (a0, a1) = self.split_u32x8(a);
2682 let (b0, b1) = self.split_u32x8(b);
2683 self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1))
2684 }
2685 #[inline(always)]
2686 fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2687 let (a0, a1) = self.split_u32x8(a);
2688 let (b0, b1) = self.split_u32x8(b);
2689 self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1))
2690 }
2691 #[inline(always)]
2692 fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2693 let (a0, a1) = self.split_u32x8(a);
2694 let (b0, b1) = self.split_u32x8(b);
2695 self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1))
2696 }
2697 #[inline(always)]
2698 fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2699 let (a0, _) = self.split_u32x8(a);
2700 let (b0, _) = self.split_u32x8(b);
2701 self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0))
2702 }
2703 #[inline(always)]
2704 fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2705 let (_, a1) = self.split_u32x8(a);
2706 let (_, b1) = self.split_u32x8(b);
2707 self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1))
2708 }
2709 #[inline(always)]
2710 fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2711 let (a0, a1) = self.split_u32x8(a);
2712 let (b0, b1) = self.split_u32x8(b);
2713 self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1))
2714 }
2715 #[inline(always)]
2716 fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2717 let (a0, a1) = self.split_u32x8(a);
2718 let (b0, b1) = self.split_u32x8(b);
2719 self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1))
2720 }
2721 #[inline(always)]
2722 fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
2723 let (a0, a1) = self.split_mask32x8(a);
2724 let (b0, b1) = self.split_u32x8(b);
2725 let (c0, c1) = self.split_u32x8(c);
2726 self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1))
2727 }
2728 #[inline(always)]
2729 fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2730 let (a0, a1) = self.split_u32x8(a);
2731 let (b0, b1) = self.split_u32x8(b);
2732 self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1))
2733 }
2734 #[inline(always)]
2735 fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2736 let (a0, a1) = self.split_u32x8(a);
2737 let (b0, b1) = self.split_u32x8(b);
2738 self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1))
2739 }
2740 #[inline(always)]
2741 fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
2742 let mut result = [0; 16usize];
2743 result[0..8usize].copy_from_slice(&a.val);
2744 result[8usize..16usize].copy_from_slice(&b.val);
2745 result.simd_into(self)
2746 }
2747 #[inline(always)]
2748 fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
2749 let mut b0 = [0; 4usize];
2750 let mut b1 = [0; 4usize];
2751 b0.copy_from_slice(&a.val[0..4usize]);
2752 b1.copy_from_slice(&a.val[4usize..8usize]);
2753 (b0.simd_into(self), b1.simd_into(self))
2754 }
2755 #[inline(always)]
2756 fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
2757 let (a0, a1) = self.split_u32x8(a);
2758 self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1))
2759 }
2760 #[inline(always)]
2761 fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
2762 let (a0, a1) = self.split_u32x8(a);
2763 self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1))
2764 }
2765 #[inline(always)]
2766 fn splat_mask32x8(self, a: i32) -> mask32x8<Self> {
2767 let half = self.splat_mask32x4(a);
2768 self.combine_mask32x4(half, half)
2769 }
2770 #[inline(always)]
2771 fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
2772 let (a0, a1) = self.split_mask32x8(a);
2773 self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1))
2774 }
2775 #[inline(always)]
2776 fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2777 let (a0, a1) = self.split_mask32x8(a);
2778 let (b0, b1) = self.split_mask32x8(b);
2779 self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1))
2780 }
2781 #[inline(always)]
2782 fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2783 let (a0, a1) = self.split_mask32x8(a);
2784 let (b0, b1) = self.split_mask32x8(b);
2785 self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1))
2786 }
2787 #[inline(always)]
2788 fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2789 let (a0, a1) = self.split_mask32x8(a);
2790 let (b0, b1) = self.split_mask32x8(b);
2791 self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1))
2792 }
2793 #[inline(always)]
2794 fn select_mask32x8(
2795 self,
2796 a: mask32x8<Self>,
2797 b: mask32x8<Self>,
2798 c: mask32x8<Self>,
2799 ) -> mask32x8<Self> {
2800 let (a0, a1) = self.split_mask32x8(a);
2801 let (b0, b1) = self.split_mask32x8(b);
2802 let (c0, c1) = self.split_mask32x8(c);
2803 self.combine_mask32x4(
2804 self.select_mask32x4(a0, b0, c0),
2805 self.select_mask32x4(a1, b1, c1),
2806 )
2807 }
2808 #[inline(always)]
2809 fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2810 let (a0, a1) = self.split_mask32x8(a);
2811 let (b0, b1) = self.split_mask32x8(b);
2812 self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1))
2813 }
2814 #[inline(always)]
2815 fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
2816 let mut result = [0; 16usize];
2817 result[0..8usize].copy_from_slice(&a.val);
2818 result[8usize..16usize].copy_from_slice(&b.val);
2819 result.simd_into(self)
2820 }
2821 #[inline(always)]
2822 fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
2823 let mut b0 = [0; 4usize];
2824 let mut b1 = [0; 4usize];
2825 b0.copy_from_slice(&a.val[0..4usize]);
2826 b1.copy_from_slice(&a.val[4usize..8usize]);
2827 (b0.simd_into(self), b1.simd_into(self))
2828 }
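// The f64x4 operations follow the same scheme as f32x8: split into two
// 128-bit f64x2 halves, apply the f64x2 operation, and recombine.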
2829 #[inline(always)]
2830 fn splat_f64x4(self, a: f64) -> f64x4<Self> {
2831 let half = self.splat_f64x2(a);
2832 self.combine_f64x2(half, half)
2833 }
2834 #[inline(always)]
2835 fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2836 let (a0, a1) = self.split_f64x4(a);
2837 self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1))
2838 }
2839 #[inline(always)]
2840 fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2841 let (a0, a1) = self.split_f64x4(a);
2842 self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1))
2843 }
2844 #[inline(always)]
2845 fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2846 let (a0, a1) = self.split_f64x4(a);
2847 self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
2848 }
2849 #[inline(always)]
2850 fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2851 let (a0, a1) = self.split_f64x4(a);
2852 let (b0, b1) = self.split_f64x4(b);
2853 self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1))
2854 }
2855 #[inline(always)]
2856 fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2857 let (a0, a1) = self.split_f64x4(a);
2858 let (b0, b1) = self.split_f64x4(b);
2859 self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1))
2860 }
2861 #[inline(always)]
2862 fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2863 let (a0, a1) = self.split_f64x4(a);
2864 let (b0, b1) = self.split_f64x4(b);
2865 self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1))
2866 }
2867 #[inline(always)]
2868 fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2869 let (a0, a1) = self.split_f64x4(a);
2870 let (b0, b1) = self.split_f64x4(b);
2871 self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1))
2872 }
2873 #[inline(always)]
2874 fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2875 let (a0, a1) = self.split_f64x4(a);
2876 let (b0, b1) = self.split_f64x4(b);
2877 self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1))
2878 }
2879 #[inline(always)]
2880 fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2881 let (a0, a1) = self.split_f64x4(a);
2882 let (b0, b1) = self.split_f64x4(b);
2883 self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1))
2884 }
2885 #[inline(always)]
2886 fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2887 let (a0, a1) = self.split_f64x4(a);
2888 let (b0, b1) = self.split_f64x4(b);
2889 self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1))
2890 }
2891 #[inline(always)]
2892 fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2893 let (a0, a1) = self.split_f64x4(a);
2894 let (b0, b1) = self.split_f64x4(b);
2895 self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1))
2896 }
2897 #[inline(always)]
2898 fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2899 let (a0, a1) = self.split_f64x4(a);
2900 let (b0, b1) = self.split_f64x4(b);
2901 self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1))
2902 }
2903 #[inline(always)]
2904 fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2905 let (a0, a1) = self.split_f64x4(a);
2906 let (b0, b1) = self.split_f64x4(b);
2907 self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1))
2908 }
2909 #[inline(always)]
2910 fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2911 let (a0, _) = self.split_f64x4(a);
2912 let (b0, _) = self.split_f64x4(b);
2913 self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0))
2914 }
2915 #[inline(always)]
2916 fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2917 let (_, a1) = self.split_f64x4(a);
2918 let (_, b1) = self.split_f64x4(b);
2919 self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1))
2920 }
2921 #[inline(always)]
2922 fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2923 let (a0, a1) = self.split_f64x4(a);
2924 let (b0, b1) = self.split_f64x4(b);
2925 self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1))
2926 }
2927 #[inline(always)]
2928 fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2929 let (a0, a1) = self.split_f64x4(a);
2930 let (b0, b1) = self.split_f64x4(b);
2931 self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1))
2932 }
2933 #[inline(always)]
2934 fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2935 let (a0, a1) = self.split_f64x4(a);
2936 let (b0, b1) = self.split_f64x4(b);
2937 self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1))
2938 }
2939 #[inline(always)]
2940 fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2941 let (a0, a1) = self.split_f64x4(a);
2942 let (b0, b1) = self.split_f64x4(b);
2943 self.combine_f64x2(
2944 self.max_precise_f64x2(a0, b0),
2945 self.max_precise_f64x2(a1, b1),
2946 )
2947 }
2948 #[inline(always)]
2949 fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2950 let (a0, a1) = self.split_f64x4(a);
2951 let (b0, b1) = self.split_f64x4(b);
2952 self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1))
2953 }
2954 #[inline(always)]
2955 fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2956 let (a0, a1) = self.split_f64x4(a);
2957 let (b0, b1) = self.split_f64x4(b);
2958 self.combine_f64x2(
2959 self.min_precise_f64x2(a0, b0),
2960 self.min_precise_f64x2(a1, b1),
2961 )
2962 }
2963 #[inline(always)]
2964 fn madd_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
2965 let (a0, a1) = self.split_f64x4(a);
2966 let (b0, b1) = self.split_f64x4(b);
2967 let (c0, c1) = self.split_f64x4(c);
2968 self.combine_f64x2(self.madd_f64x2(a0, b0, c0), self.madd_f64x2(a1, b1, c1))
2969 }
2970 #[inline(always)]
2971 fn msub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
2972 let (a0, a1) = self.split_f64x4(a);
2973 let (b0, b1) = self.split_f64x4(b);
2974 let (c0, c1) = self.split_f64x4(c);
2975 self.combine_f64x2(self.msub_f64x2(a0, b0, c0), self.msub_f64x2(a1, b1, c1))
2976 }
2977 #[inline(always)]
2978 fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2979 let (a0, a1) = self.split_f64x4(a);
2980 self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
2981 }
2982 #[inline(always)]
2983 fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2984 let (a0, a1) = self.split_f64x4(a);
2985 self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1))
2986 }
2987 #[inline(always)]
2988 fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2989 let (a0, a1) = self.split_f64x4(a);
2990 self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1))
2991 }
2992 #[inline(always)]
2993 fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
2994 let (a0, a1) = self.split_mask64x4(a);
2995 let (b0, b1) = self.split_f64x4(b);
2996 let (c0, c1) = self.split_f64x4(c);
2997 self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1))
2998 }
2999 #[inline(always)]
3000 fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
3001 let mut result = [0.0; 8usize];
3002 result[0..4usize].copy_from_slice(&a.val);
3003 result[4usize..8usize].copy_from_slice(&b.val);
3004 result.simd_into(self)
3005 }
3006 #[inline(always)]
3007 fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
3008 let mut b0 = [0.0; 2usize];
3009 let mut b1 = [0.0; 2usize];
3010 b0.copy_from_slice(&a.val[0..2usize]);
3011 b1.copy_from_slice(&a.val[2usize..4usize]);
3012 (b0.simd_into(self), b1.simd_into(self))
3013 }
3014 #[inline(always)]
3015 fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
3016 let (a0, a1) = self.split_f64x4(a);
3017 self.combine_f32x4(
3018 self.reinterpret_f32_f64x2(a0),
3019 self.reinterpret_f32_f64x2(a1),
3020 )
3021 }
3022 #[inline(always)]
3023 fn splat_mask64x4(self, a: i64) -> mask64x4<Self> {
3024 let half = self.splat_mask64x2(a);
3025 self.combine_mask64x2(half, half)
3026 }
3027 #[inline(always)]
3028 fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
3029 let (a0, a1) = self.split_mask64x4(a);
3030 self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1))
3031 }
3032 #[inline(always)]
3033 fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
3034 let (a0, a1) = self.split_mask64x4(a);
3035 let (b0, b1) = self.split_mask64x4(b);
3036 self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1))
3037 }
3038 #[inline(always)]
3039 fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
3040 let (a0, a1) = self.split_mask64x4(a);
3041 let (b0, b1) = self.split_mask64x4(b);
3042 self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1))
3043 }
3044 #[inline(always)]
3045 fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
3046 let (a0, a1) = self.split_mask64x4(a);
3047 let (b0, b1) = self.split_mask64x4(b);
3048 self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1))
3049 }
3050 #[inline(always)]
3051 fn select_mask64x4(
3052 self,
3053 a: mask64x4<Self>,
3054 b: mask64x4<Self>,
3055 c: mask64x4<Self>,
3056 ) -> mask64x4<Self> {
3057 let (a0, a1) = self.split_mask64x4(a);
3058 let (b0, b1) = self.split_mask64x4(b);
3059 let (c0, c1) = self.split_mask64x4(c);
3060 self.combine_mask64x2(
3061 self.select_mask64x2(a0, b0, c0),
3062 self.select_mask64x2(a1, b1, c1),
3063 )
3064 }
3065 #[inline(always)]
3066 fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
3067 let (a0, a1) = self.split_mask64x4(a);
3068 let (b0, b1) = self.split_mask64x4(b);
3069 self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1))
3070 }
3071 #[inline(always)]
3072 fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
3073 let mut result = [0; 8usize];
3074 result[0..4usize].copy_from_slice(&a.val);
3075 result[4usize..8usize].copy_from_slice(&b.val);
3076 result.simd_into(self)
3077 }
3078 #[inline(always)]
3079 fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
3080 let mut b0 = [0; 2usize];
3081 let mut b1 = [0; 2usize];
3082 b0.copy_from_slice(&a.val[0..2usize]);
3083 b1.copy_from_slice(&a.val[2usize..4usize]);
3084 (b0.simd_into(self), b1.simd_into(self))
3085 }
3086 #[inline(always)]
3087 fn splat_f32x16(self, a: f32) -> f32x16<Self> {
3088 let half = self.splat_f32x8(a);
3089 self.combine_f32x8(half, half)
3090 }
3091 #[inline(always)]
3092 fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3093 let (a0, a1) = self.split_f32x16(a);
3094 self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
3095 }
3096 #[inline(always)]
3097 fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3098 let (a0, a1) = self.split_f32x16(a);
3099 self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
3100 }
3101 #[inline(always)]
3102 fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3103 let (a0, a1) = self.split_f32x16(a);
3104 self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
3105 }
3106 #[inline(always)]
3107 fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3108 let (a0, a1) = self.split_f32x16(a);
3109 let (b0, b1) = self.split_f32x16(b);
3110 self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
3111 }
3112 #[inline(always)]
3113 fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3114 let (a0, a1) = self.split_f32x16(a);
3115 let (b0, b1) = self.split_f32x16(b);
3116 self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
3117 }
3118 #[inline(always)]
3119 fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3120 let (a0, a1) = self.split_f32x16(a);
3121 let (b0, b1) = self.split_f32x16(b);
3122 self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
3123 }
3124 #[inline(always)]
3125 fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3126 let (a0, a1) = self.split_f32x16(a);
3127 let (b0, b1) = self.split_f32x16(b);
3128 self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
3129 }
3130 #[inline(always)]
3131 fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3132 let (a0, a1) = self.split_f32x16(a);
3133 let (b0, b1) = self.split_f32x16(b);
3134 self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
3135 }
3136 #[inline(always)]
3137 fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
3138 let (a0, a1) = self.split_f32x16(a);
3139 let (b0, b1) = self.split_f32x16(b);
3140 self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
3141 }
3142 #[inline(always)]
3143 fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
3144 let (a0, a1) = self.split_f32x16(a);
3145 let (b0, b1) = self.split_f32x16(b);
3146 self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
3147 }
3148 #[inline(always)]
3149 fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
3150 let (a0, a1) = self.split_f32x16(a);
3151 let (b0, b1) = self.split_f32x16(b);
3152 self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
3153 }
3154 #[inline(always)]
3155 fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
3156 let (a0, a1) = self.split_f32x16(a);
3157 let (b0, b1) = self.split_f32x16(b);
3158 self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
3159 }
3160 #[inline(always)]
3161 fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
3162 let (a0, a1) = self.split_f32x16(a);
3163 let (b0, b1) = self.split_f32x16(b);
3164 self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
3165 }
3166 #[inline(always)]
3167 fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3168 let (a0, _) = self.split_f32x16(a);
3169 let (b0, _) = self.split_f32x16(b);
3170 self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
3171 }
3172 #[inline(always)]
3173 fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3174 let (_, a1) = self.split_f32x16(a);
3175 let (_, b1) = self.split_f32x16(b);
3176 self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
3177 }
3178 #[inline(always)]
3179 fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3180 let (a0, a1) = self.split_f32x16(a);
3181 let (b0, b1) = self.split_f32x16(b);
3182 self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
3183 }
3184 #[inline(always)]
3185 fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3186 let (a0, a1) = self.split_f32x16(a);
3187 let (b0, b1) = self.split_f32x16(b);
3188 self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
3189 }
3190 #[inline(always)]
3191 fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3192 let (a0, a1) = self.split_f32x16(a);
3193 let (b0, b1) = self.split_f32x16(b);
3194 self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
3195 }
3196 #[inline(always)]
3197 fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3198 let (a0, a1) = self.split_f32x16(a);
3199 let (b0, b1) = self.split_f32x16(b);
3200 self.combine_f32x8(
3201 self.max_precise_f32x8(a0, b0),
3202 self.max_precise_f32x8(a1, b1),
3203 )
3204 }
3205 #[inline(always)]
3206 fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3207 let (a0, a1) = self.split_f32x16(a);
3208 let (b0, b1) = self.split_f32x16(b);
3209 self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
3210 }
3211 #[inline(always)]
3212 fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3213 let (a0, a1) = self.split_f32x16(a);
3214 let (b0, b1) = self.split_f32x16(b);
3215 self.combine_f32x8(
3216 self.min_precise_f32x8(a0, b0),
3217 self.min_precise_f32x8(a1, b1),
3218 )
3219 }
3220 #[inline(always)]
3221 fn madd_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
3222 let (a0, a1) = self.split_f32x16(a);
3223 let (b0, b1) = self.split_f32x16(b);
3224 let (c0, c1) = self.split_f32x16(c);
3225 self.combine_f32x8(self.madd_f32x8(a0, b0, c0), self.madd_f32x8(a1, b1, c1))
3226 }
3227 #[inline(always)]
3228 fn msub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
3229 let (a0, a1) = self.split_f32x16(a);
3230 let (b0, b1) = self.split_f32x16(b);
3231 let (c0, c1) = self.split_f32x16(c);
3232 self.combine_f32x8(self.msub_f32x8(a0, b0, c0), self.msub_f32x8(a1, b1, c1))
3233 }
3234 #[inline(always)]
3235 fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3236 let (a0, a1) = self.split_f32x16(a);
3237 self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
3238 }
3239 #[inline(always)]
3240 fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3241 let (a0, a1) = self.split_f32x16(a);
3242 self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
3243 }
3244 #[inline(always)]
3245 fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3246 let (a0, a1) = self.split_f32x16(a);
3247 self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
3248 }
3249 #[inline(always)]
3250 fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
3251 let (a0, a1) = self.split_mask32x16(a);
3252 let (b0, b1) = self.split_f32x16(b);
3253 let (c0, c1) = self.split_f32x16(c);
3254 self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
3255 }
3256 #[inline(always)]
3257 fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
3258 let mut b0 = [0.0; 8usize];
3259 let mut b1 = [0.0; 8usize];
3260 b0.copy_from_slice(&a.val[0..8usize]);
3261 b1.copy_from_slice(&a.val[8usize..16usize]);
3262 (b0.simd_into(self), b1.simd_into(self))
3263 }
3264 #[inline(always)]
3265 fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
3266 let (a0, a1) = self.split_f32x16(a);
3267 self.combine_f64x4(
3268 self.reinterpret_f64_f32x8(a0),
3269 self.reinterpret_f64_f32x8(a1),
3270 )
3271 }
3272 #[inline(always)]
3273 fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
3274 let (a0, a1) = self.split_f32x16(a);
3275 self.combine_i32x8(
3276 self.reinterpret_i32_f32x8(a0),
3277 self.reinterpret_i32_f32x8(a1),
3278 )
3279 }
3280 #[inline(always)]
3281 fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
3282 crate::Fallback::new()
3283 .load_interleaved_128_f32x16(src)
3284 .val
3285 .simd_into(self)
3286 }
3287 #[inline(always)]
3288 fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
3289 let fb = crate::Fallback::new();
3290 fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest);
3291 }
3292 #[inline(always)]
3293 fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
3294 let (a0, a1) = self.split_f32x16(a);
3295 self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
3296 }
3297 #[inline(always)]
3298 fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
3299 let (a0, a1) = self.split_f32x16(a);
3300 self.combine_u32x8(
3301 self.reinterpret_u32_f32x8(a0),
3302 self.reinterpret_u32_f32x8(a1),
3303 )
3304 }
3305 #[inline(always)]
3306 fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
3307 let (a0, a1) = self.split_f32x16(a);
3308 self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
3309 }
3310 #[inline(always)]
3311 fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
3312 let (a0, a1) = self.split_f32x16(a);
3313 self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
3314 }
3315 #[inline(always)]
3316 fn splat_i8x64(self, a: i8) -> i8x64<Self> {
3317 let half = self.splat_i8x32(a);
3318 self.combine_i8x32(half, half)
3319 }
3320 #[inline(always)]
3321 fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
3322 let (a0, a1) = self.split_i8x64(a);
3323 self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
3324 }
3325 #[inline(always)]
3326 fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3327 let (a0, a1) = self.split_i8x64(a);
3328 let (b0, b1) = self.split_i8x64(b);
3329 self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
3330 }
3331 #[inline(always)]
3332 fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3333 let (a0, a1) = self.split_i8x64(a);
3334 let (b0, b1) = self.split_i8x64(b);
3335 self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
3336 }
3337 #[inline(always)]
3338 fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3339 let (a0, a1) = self.split_i8x64(a);
3340 let (b0, b1) = self.split_i8x64(b);
3341 self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
3342 }
3343 #[inline(always)]
3344 fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3345 let (a0, a1) = self.split_i8x64(a);
3346 let (b0, b1) = self.split_i8x64(b);
3347 self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
3348 }
3349 #[inline(always)]
3350 fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3351 let (a0, a1) = self.split_i8x64(a);
3352 let (b0, b1) = self.split_i8x64(b);
3353 self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
3354 }
3355 #[inline(always)]
3356 fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3357 let (a0, a1) = self.split_i8x64(a);
3358 let (b0, b1) = self.split_i8x64(b);
3359 self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
3360 }
3361 #[inline(always)]
3362 fn shr_i8x64(self, a: i8x64<Self>, b: u32) -> i8x64<Self> {
3363 let (a0, a1) = self.split_i8x64(a);
3364 self.combine_i8x32(self.shr_i8x32(a0, b), self.shr_i8x32(a1, b))
3365 }
3366 #[inline(always)]
3367 fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3368 let (a0, a1) = self.split_i8x64(a);
3369 let (b0, b1) = self.split_i8x64(b);
3370 self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
3371 }
3372 #[inline(always)]
3373 fn shl_i8x64(self, a: i8x64<Self>, b: u32) -> i8x64<Self> {
3374 let (a0, a1) = self.split_i8x64(a);
3375 self.combine_i8x32(self.shl_i8x32(a0, b), self.shl_i8x32(a1, b))
3376 }
3377 #[inline(always)]
3378 fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3379 let (a0, a1) = self.split_i8x64(a);
3380 let (b0, b1) = self.split_i8x64(b);
3381 self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
3382 }
3383 #[inline(always)]
3384 fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3385 let (a0, a1) = self.split_i8x64(a);
3386 let (b0, b1) = self.split_i8x64(b);
3387 self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
3388 }
3389 #[inline(always)]
3390 fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3391 let (a0, a1) = self.split_i8x64(a);
3392 let (b0, b1) = self.split_i8x64(b);
3393 self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
3394 }
3395 #[inline(always)]
3396 fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3397 let (a0, a1) = self.split_i8x64(a);
3398 let (b0, b1) = self.split_i8x64(b);
3399 self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
3400 }
3401 #[inline(always)]
3402 fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3403 let (a0, a1) = self.split_i8x64(a);
3404 let (b0, b1) = self.split_i8x64(b);
3405 self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
3406 }
3407 #[inline(always)]
3408 fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3409 let (a0, _) = self.split_i8x64(a);
3410 let (b0, _) = self.split_i8x64(b);
3411 self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
3412 }
3413 #[inline(always)]
3414 fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3415 let (_, a1) = self.split_i8x64(a);
3416 let (_, b1) = self.split_i8x64(b);
3417 self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
3418 }
3419 #[inline(always)]
3420 fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3421 let (a0, a1) = self.split_i8x64(a);
3422 let (b0, b1) = self.split_i8x64(b);
3423 self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
3424 }
3425 #[inline(always)]
3426 fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3427 let (a0, a1) = self.split_i8x64(a);
3428 let (b0, b1) = self.split_i8x64(b);
3429 self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
3430 }
3431 #[inline(always)]
3432 fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
3433 let (a0, a1) = self.split_mask8x64(a);
3434 let (b0, b1) = self.split_i8x64(b);
3435 let (c0, c1) = self.split_i8x64(c);
3436 self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
3437 }
3438 #[inline(always)]
3439 fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3440 let (a0, a1) = self.split_i8x64(a);
3441 let (b0, b1) = self.split_i8x64(b);
3442 self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
3443 }
3444 #[inline(always)]
3445 fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3446 let (a0, a1) = self.split_i8x64(a);
3447 let (b0, b1) = self.split_i8x64(b);
3448 self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
3449 }
3450 #[inline(always)]
3451 fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
3452 let mut b0 = [0; 32usize];
3453 let mut b1 = [0; 32usize];
3454 b0.copy_from_slice(&a.val[0..32usize]);
3455 b1.copy_from_slice(&a.val[32usize..64usize]);
3456 (b0.simd_into(self), b1.simd_into(self))
3457 }
3458 #[inline(always)]
3459 fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
3460 let (a0, a1) = self.split_i8x64(a);
3461 self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
3462 }
3463 #[inline(always)]
3464 fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
3465 let (a0, a1) = self.split_i8x64(a);
3466 self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
3467 }
3468 #[inline(always)]
3469 fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
3470 let (a0, a1) = self.split_i8x64(a);
3471 self.combine_u32x8(
3472 self.reinterpret_u32_i8x32(a0),
3473 self.reinterpret_u32_i8x32(a1),
3474 )
3475 }
3476 #[inline(always)]
3477 fn splat_u8x64(self, a: u8) -> u8x64<Self> {
3478 let half = self.splat_u8x32(a);
3479 self.combine_u8x32(half, half)
3480 }
3481 #[inline(always)]
3482 fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
3483 let (a0, a1) = self.split_u8x64(a);
3484 self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
3485 }
3486 #[inline(always)]
3487 fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3488 let (a0, a1) = self.split_u8x64(a);
3489 let (b0, b1) = self.split_u8x64(b);
3490 self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
3491 }
3492 #[inline(always)]
3493 fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3494 let (a0, a1) = self.split_u8x64(a);
3495 let (b0, b1) = self.split_u8x64(b);
3496 self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
3497 }
3498 #[inline(always)]
3499 fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3500 let (a0, a1) = self.split_u8x64(a);
3501 let (b0, b1) = self.split_u8x64(b);
3502 self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
3503 }
3504 #[inline(always)]
3505 fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3506 let (a0, a1) = self.split_u8x64(a);
3507 let (b0, b1) = self.split_u8x64(b);
3508 self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
3509 }
3510 #[inline(always)]
3511 fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3512 let (a0, a1) = self.split_u8x64(a);
3513 let (b0, b1) = self.split_u8x64(b);
3514 self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
3515 }
3516 #[inline(always)]
3517 fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3518 let (a0, a1) = self.split_u8x64(a);
3519 let (b0, b1) = self.split_u8x64(b);
3520 self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
3521 }
3522 #[inline(always)]
3523 fn shr_u8x64(self, a: u8x64<Self>, b: u32) -> u8x64<Self> {
3524 let (a0, a1) = self.split_u8x64(a);
3525 self.combine_u8x32(self.shr_u8x32(a0, b), self.shr_u8x32(a1, b))
3526 }
3527 #[inline(always)]
3528 fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3529 let (a0, a1) = self.split_u8x64(a);
3530 let (b0, b1) = self.split_u8x64(b);
3531 self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
3532 }
3533 #[inline(always)]
3534 fn shl_u8x64(self, a: u8x64<Self>, b: u32) -> u8x64<Self> {
3535 let (a0, a1) = self.split_u8x64(a);
3536 self.combine_u8x32(self.shl_u8x32(a0, b), self.shl_u8x32(a1, b))
3537 }
3538 #[inline(always)]
3539 fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3540 let (a0, a1) = self.split_u8x64(a);
3541 let (b0, b1) = self.split_u8x64(b);
3542 self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
3543 }
3544 #[inline(always)]
3545 fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3546 let (a0, a1) = self.split_u8x64(a);
3547 let (b0, b1) = self.split_u8x64(b);
3548 self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
3549 }
3550 #[inline(always)]
3551 fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3552 let (a0, a1) = self.split_u8x64(a);
3553 let (b0, b1) = self.split_u8x64(b);
3554 self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
3555 }
3556 #[inline(always)]
3557 fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3558 let (a0, a1) = self.split_u8x64(a);
3559 let (b0, b1) = self.split_u8x64(b);
3560 self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
3561 }
3562 #[inline(always)]
3563 fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3564 let (a0, a1) = self.split_u8x64(a);
3565 let (b0, b1) = self.split_u8x64(b);
3566 self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
3567 }
3568 #[inline(always)]
3569 fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3570 let (a0, _) = self.split_u8x64(a);
3571 let (b0, _) = self.split_u8x64(b);
3572 self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
3573 }
3574 #[inline(always)]
3575 fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3576 let (_, a1) = self.split_u8x64(a);
3577 let (_, b1) = self.split_u8x64(b);
3578 self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
3579 }
3580 #[inline(always)]
3581 fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3582 let (a0, a1) = self.split_u8x64(a);
3583 let (b0, b1) = self.split_u8x64(b);
3584 self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
3585 }
3586 #[inline(always)]
3587 fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3588 let (a0, a1) = self.split_u8x64(a);
3589 let (b0, b1) = self.split_u8x64(b);
3590 self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
3591 }
3592 #[inline(always)]
3593 fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
3594 let (a0, a1) = self.split_mask8x64(a);
3595 let (b0, b1) = self.split_u8x64(b);
3596 let (c0, c1) = self.split_u8x64(c);
3597 self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
3598 }
3599 #[inline(always)]
3600 fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3601 let (a0, a1) = self.split_u8x64(a);
3602 let (b0, b1) = self.split_u8x64(b);
3603 self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
3604 }
3605 #[inline(always)]
3606 fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3607 let (a0, a1) = self.split_u8x64(a);
3608 let (b0, b1) = self.split_u8x64(b);
3609 self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
3610 }
3611 #[inline(always)]
3612 fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
3613 let mut b0 = [0; 32usize];
3614 let mut b1 = [0; 32usize];
3615 b0.copy_from_slice(&a.val[0..32usize]);
3616 b1.copy_from_slice(&a.val[32usize..64usize]);
3617 (b0.simd_into(self), b1.simd_into(self))
3618 }
3619 #[inline(always)]
3620 fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
3621 crate::Fallback::new()
3622 .load_interleaved_128_u8x64(src)
3623 .val
3624 .simd_into(self)
3625 }
3626 #[inline(always)]
3627 fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
3628 let fb = crate::Fallback::new();
3629 fb.store_interleaved_128_u8x64(a.val.simd_into(fb), dest);
3630 }
3631 #[inline(always)]
3632 fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
3633 let (a0, a1) = self.split_u8x64(a);
3634 self.combine_u32x8(
3635 self.reinterpret_u32_u8x32(a0),
3636 self.reinterpret_u32_u8x32(a1),
3637 )
3638 }
3639 #[inline(always)]
3640 fn splat_mask8x64(self, a: i8) -> mask8x64<Self> {
3641 let half = self.splat_mask8x32(a);
3642 self.combine_mask8x32(half, half)
3643 }
3644 #[inline(always)]
3645 fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
3646 let (a0, a1) = self.split_mask8x64(a);
3647 self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
3648 }
3649 #[inline(always)]
3650 fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3651 let (a0, a1) = self.split_mask8x64(a);
3652 let (b0, b1) = self.split_mask8x64(b);
3653 self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
3654 }
3655 #[inline(always)]
3656 fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3657 let (a0, a1) = self.split_mask8x64(a);
3658 let (b0, b1) = self.split_mask8x64(b);
3659 self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
3660 }
3661 #[inline(always)]
3662 fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3663 let (a0, a1) = self.split_mask8x64(a);
3664 let (b0, b1) = self.split_mask8x64(b);
3665 self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
3666 }
3667 #[inline(always)]
3668 fn select_mask8x64(
3669 self,
3670 a: mask8x64<Self>,
3671 b: mask8x64<Self>,
3672 c: mask8x64<Self>,
3673 ) -> mask8x64<Self> {
3674 let (a0, a1) = self.split_mask8x64(a);
3675 let (b0, b1) = self.split_mask8x64(b);
3676 let (c0, c1) = self.split_mask8x64(c);
3677 self.combine_mask8x32(
3678 self.select_mask8x32(a0, b0, c0),
3679 self.select_mask8x32(a1, b1, c1),
3680 )
3681 }
3682 #[inline(always)]
3683 fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3684 let (a0, a1) = self.split_mask8x64(a);
3685 let (b0, b1) = self.split_mask8x64(b);
3686 self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
3687 }
3688 #[inline(always)]
3689 fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
3690 let mut b0 = [0; 32usize];
3691 let mut b1 = [0; 32usize];
3692 b0.copy_from_slice(&a.val[0..32usize]);
3693 b1.copy_from_slice(&a.val[32usize..64usize]);
3694 (b0.simd_into(self), b1.simd_into(self))
3695 }
3696 #[inline(always)]
3697 fn splat_i16x32(self, a: i16) -> i16x32<Self> {
3698 let half = self.splat_i16x16(a);
3699 self.combine_i16x16(half, half)
3700 }
3701 #[inline(always)]
3702 fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
3703 let (a0, a1) = self.split_i16x32(a);
3704 self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
3705 }
3706 #[inline(always)]
3707 fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3708 let (a0, a1) = self.split_i16x32(a);
3709 let (b0, b1) = self.split_i16x32(b);
3710 self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
3711 }
3712 #[inline(always)]
3713 fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3714 let (a0, a1) = self.split_i16x32(a);
3715 let (b0, b1) = self.split_i16x32(b);
3716 self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
3717 }
3718 #[inline(always)]
3719 fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3720 let (a0, a1) = self.split_i16x32(a);
3721 let (b0, b1) = self.split_i16x32(b);
3722 self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
3723 }
3724 #[inline(always)]
3725 fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3726 let (a0, a1) = self.split_i16x32(a);
3727 let (b0, b1) = self.split_i16x32(b);
3728 self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
3729 }
3730 #[inline(always)]
3731 fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3732 let (a0, a1) = self.split_i16x32(a);
3733 let (b0, b1) = self.split_i16x32(b);
3734 self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
3735 }
3736 #[inline(always)]
3737 fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3738 let (a0, a1) = self.split_i16x32(a);
3739 let (b0, b1) = self.split_i16x32(b);
3740 self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
3741 }
3742 #[inline(always)]
3743 fn shr_i16x32(self, a: i16x32<Self>, b: u32) -> i16x32<Self> {
3744 let (a0, a1) = self.split_i16x32(a);
3745 self.combine_i16x16(self.shr_i16x16(a0, b), self.shr_i16x16(a1, b))
3746 }
3747 #[inline(always)]
3748 fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3749 let (a0, a1) = self.split_i16x32(a);
3750 let (b0, b1) = self.split_i16x32(b);
3751 self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
3752 }
3753 #[inline(always)]
3754 fn shl_i16x32(self, a: i16x32<Self>, b: u32) -> i16x32<Self> {
3755 let (a0, a1) = self.split_i16x32(a);
3756 self.combine_i16x16(self.shl_i16x16(a0, b), self.shl_i16x16(a1, b))
3757 }
3758 #[inline(always)]
3759 fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3760 let (a0, a1) = self.split_i16x32(a);
3761 let (b0, b1) = self.split_i16x32(b);
3762 self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
3763 }
3764 #[inline(always)]
3765 fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3766 let (a0, a1) = self.split_i16x32(a);
3767 let (b0, b1) = self.split_i16x32(b);
3768 self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
3769 }
3770 #[inline(always)]
3771 fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3772 let (a0, a1) = self.split_i16x32(a);
3773 let (b0, b1) = self.split_i16x32(b);
3774 self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
3775 }
3776 #[inline(always)]
3777 fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3778 let (a0, a1) = self.split_i16x32(a);
3779 let (b0, b1) = self.split_i16x32(b);
3780 self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
3781 }
3782 #[inline(always)]
3783 fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3784 let (a0, a1) = self.split_i16x32(a);
3785 let (b0, b1) = self.split_i16x32(b);
3786 self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
3787 }
3788 #[inline(always)]
3789 fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3790 let (a0, _) = self.split_i16x32(a);
3791 let (b0, _) = self.split_i16x32(b);
3792 self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
3793 }
3794 #[inline(always)]
3795 fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3796 let (_, a1) = self.split_i16x32(a);
3797 let (_, b1) = self.split_i16x32(b);
3798 self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
3799 }
3800 #[inline(always)]
3801 fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3802 let (a0, a1) = self.split_i16x32(a);
3803 let (b0, b1) = self.split_i16x32(b);
3804 self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
3805 }
3806 #[inline(always)]
3807 fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3808 let (a0, a1) = self.split_i16x32(a);
3809 let (b0, b1) = self.split_i16x32(b);
3810 self.combine_i16x16(
3811 self.unzip_high_i16x16(a0, a1),
3812 self.unzip_high_i16x16(b0, b1),
3813 )
3814 }
3815 #[inline(always)]
3816 fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
3817 let (a0, a1) = self.split_mask16x32(a);
3818 let (b0, b1) = self.split_i16x32(b);
3819 let (c0, c1) = self.split_i16x32(c);
3820 self.combine_i16x16(
3821 self.select_i16x16(a0, b0, c0),
3822 self.select_i16x16(a1, b1, c1),
3823 )
3824 }
3825 #[inline(always)]
3826 fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3827 let (a0, a1) = self.split_i16x32(a);
3828 let (b0, b1) = self.split_i16x32(b);
3829 self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
3830 }
3831 #[inline(always)]
3832 fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3833 let (a0, a1) = self.split_i16x32(a);
3834 let (b0, b1) = self.split_i16x32(b);
3835 self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
3836 }
3837 #[inline(always)]
3838 fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
3839 let mut b0 = [0; 16usize];
3840 let mut b1 = [0; 16usize];
3841 b0.copy_from_slice(&a.val[0..16usize]);
3842 b1.copy_from_slice(&a.val[16usize..32usize]);
3843 (b0.simd_into(self), b1.simd_into(self))
3844 }
3845 #[inline(always)]
3846 fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
3847 let (a0, a1) = self.split_i16x32(a);
3848 self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
3849 }
3850 #[inline(always)]
3851 fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
3852 let (a0, a1) = self.split_i16x32(a);
3853 self.combine_u8x32(
3854 self.reinterpret_u8_i16x16(a0),
3855 self.reinterpret_u8_i16x16(a1),
3856 )
3857 }
3858 #[inline(always)]
3859 fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
3860 let (a0, a1) = self.split_i16x32(a);
3861 self.combine_u32x8(
3862 self.reinterpret_u32_i16x16(a0),
3863 self.reinterpret_u32_i16x16(a1),
3864 )
3865 }
3866 #[inline(always)]
3867 fn splat_u16x32(self, a: u16) -> u16x32<Self> {
3868 let half = self.splat_u16x16(a);
3869 self.combine_u16x16(half, half)
3870 }
3871 #[inline(always)]
3872 fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
3873 let (a0, a1) = self.split_u16x32(a);
3874 self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
3875 }
3876 #[inline(always)]
3877 fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3878 let (a0, a1) = self.split_u16x32(a);
3879 let (b0, b1) = self.split_u16x32(b);
3880 self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
3881 }
3882 #[inline(always)]
3883 fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3884 let (a0, a1) = self.split_u16x32(a);
3885 let (b0, b1) = self.split_u16x32(b);
3886 self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
3887 }
3888 #[inline(always)]
3889 fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3890 let (a0, a1) = self.split_u16x32(a);
3891 let (b0, b1) = self.split_u16x32(b);
3892 self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
3893 }
3894 #[inline(always)]
3895 fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3896 let (a0, a1) = self.split_u16x32(a);
3897 let (b0, b1) = self.split_u16x32(b);
3898 self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
3899 }
3900 #[inline(always)]
3901 fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3902 let (a0, a1) = self.split_u16x32(a);
3903 let (b0, b1) = self.split_u16x32(b);
3904 self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
3905 }
3906 #[inline(always)]
3907 fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3908 let (a0, a1) = self.split_u16x32(a);
3909 let (b0, b1) = self.split_u16x32(b);
3910 self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
3911 }
3912 #[inline(always)]
3913 fn shr_u16x32(self, a: u16x32<Self>, b: u32) -> u16x32<Self> {
3914 let (a0, a1) = self.split_u16x32(a);
3915 self.combine_u16x16(self.shr_u16x16(a0, b), self.shr_u16x16(a1, b))
3916 }
3917 #[inline(always)]
3918 fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3919 let (a0, a1) = self.split_u16x32(a);
3920 let (b0, b1) = self.split_u16x32(b);
3921 self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
3922 }
3923 #[inline(always)]
3924 fn shl_u16x32(self, a: u16x32<Self>, b: u32) -> u16x32<Self> {
3925 let (a0, a1) = self.split_u16x32(a);
3926 self.combine_u16x16(self.shl_u16x16(a0, b), self.shl_u16x16(a1, b))
3927 }
3928 #[inline(always)]
3929 fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3930 let (a0, a1) = self.split_u16x32(a);
3931 let (b0, b1) = self.split_u16x32(b);
3932 self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
3933 }
3934 #[inline(always)]
3935 fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3936 let (a0, a1) = self.split_u16x32(a);
3937 let (b0, b1) = self.split_u16x32(b);
3938 self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
3939 }
3940 #[inline(always)]
3941 fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3942 let (a0, a1) = self.split_u16x32(a);
3943 let (b0, b1) = self.split_u16x32(b);
3944 self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
3945 }
3946 #[inline(always)]
3947 fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3948 let (a0, a1) = self.split_u16x32(a);
3949 let (b0, b1) = self.split_u16x32(b);
3950 self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
3951 }
3952 #[inline(always)]
3953 fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3954 let (a0, a1) = self.split_u16x32(a);
3955 let (b0, b1) = self.split_u16x32(b);
3956 self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
3957 }
3958 #[inline(always)]
3959 fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3960 let (a0, _) = self.split_u16x32(a);
3961 let (b0, _) = self.split_u16x32(b);
3962 self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
3963 }
3964 #[inline(always)]
3965 fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3966 let (_, a1) = self.split_u16x32(a);
3967 let (_, b1) = self.split_u16x32(b);
3968 self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
3969 }
3970 #[inline(always)]
3971 fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3972 let (a0, a1) = self.split_u16x32(a);
3973 let (b0, b1) = self.split_u16x32(b);
3974 self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
3975 }
3976 #[inline(always)]
3977 fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3978 let (a0, a1) = self.split_u16x32(a);
3979 let (b0, b1) = self.split_u16x32(b);
3980 self.combine_u16x16(
3981 self.unzip_high_u16x16(a0, a1),
3982 self.unzip_high_u16x16(b0, b1),
3983 )
3984 }
3985 #[inline(always)]
3986 fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
3987 let (a0, a1) = self.split_mask16x32(a);
3988 let (b0, b1) = self.split_u16x32(b);
3989 let (c0, c1) = self.split_u16x32(c);
3990 self.combine_u16x16(
3991 self.select_u16x16(a0, b0, c0),
3992 self.select_u16x16(a1, b1, c1),
3993 )
3994 }
3995 #[inline(always)]
3996 fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3997 let (a0, a1) = self.split_u16x32(a);
3998 let (b0, b1) = self.split_u16x32(b);
3999 self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
4000 }
4001 #[inline(always)]
4002 fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
4003 let (a0, a1) = self.split_u16x32(a);
4004 let (b0, b1) = self.split_u16x32(b);
4005 self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
4006 }
4007 #[inline(always)]
4008 fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
4009 let mut b0 = [0; 16usize];
4010 let mut b1 = [0; 16usize];
4011 b0.copy_from_slice(&a.val[0..16usize]);
4012 b1.copy_from_slice(&a.val[16usize..32usize]);
4013 (b0.simd_into(self), b1.simd_into(self))
4014 }
4015 #[inline(always)]
4016 fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
4017 crate::Fallback::new()
4018 .load_interleaved_128_u16x32(src)
4019 .val
4020 .simd_into(self)
4021 }
4022 #[inline(always)]
4023 fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
4024 let fb = crate::Fallback::new();
4025 fb.store_interleaved_128_u16x32(a.val.simd_into(fb), dest);
4026 }
4027 #[inline(always)]
4028 fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
4029 let (a0, a1) = self.split_u16x32(a);
4030 self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1))
4031 }
4032 #[inline(always)]
4033 fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
4034 let (a0, a1) = self.split_u16x32(a);
4035 self.combine_u8x32(
4036 self.reinterpret_u8_u16x16(a0),
4037 self.reinterpret_u8_u16x16(a1),
4038 )
4039 }
4040 #[inline(always)]
4041 fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
4042 let (a0, a1) = self.split_u16x32(a);
4043 self.combine_u32x8(
4044 self.reinterpret_u32_u16x16(a0),
4045 self.reinterpret_u32_u16x16(a1),
4046 )
4047 }
4048 #[inline(always)]
4049 fn splat_mask16x32(self, a: i16) -> mask16x32<Self> {
4050 let half = self.splat_mask16x16(a);
4051 self.combine_mask16x16(half, half)
4052 }
4053 #[inline(always)]
4054 fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
4055 let (a0, a1) = self.split_mask16x32(a);
4056 self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
4057 }
4058 #[inline(always)]
4059 fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
4060 let (a0, a1) = self.split_mask16x32(a);
4061 let (b0, b1) = self.split_mask16x32(b);
4062 self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
4063 }
4064 #[inline(always)]
4065 fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
4066 let (a0, a1) = self.split_mask16x32(a);
4067 let (b0, b1) = self.split_mask16x32(b);
4068 self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
4069 }
4070 #[inline(always)]
4071 fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
4072 let (a0, a1) = self.split_mask16x32(a);
4073 let (b0, b1) = self.split_mask16x32(b);
4074 self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
4075 }
4076 #[inline(always)]
4077 fn select_mask16x32(
4078 self,
4079 a: mask16x32<Self>,
4080 b: mask16x32<Self>,
4081 c: mask16x32<Self>,
4082 ) -> mask16x32<Self> {
4083 let (a0, a1) = self.split_mask16x32(a);
4084 let (b0, b1) = self.split_mask16x32(b);
4085 let (c0, c1) = self.split_mask16x32(c);
4086 self.combine_mask16x16(
4087 self.select_mask16x16(a0, b0, c0),
4088 self.select_mask16x16(a1, b1, c1),
4089 )
4090 }
4091 #[inline(always)]
4092 fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
4093 let (a0, a1) = self.split_mask16x32(a);
4094 let (b0, b1) = self.split_mask16x32(b);
4095 self.combine_mask16x16(
4096 self.simd_eq_mask16x16(a0, b0),
4097 self.simd_eq_mask16x16(a1, b1),
4098 )
4099 }
4100 #[inline(always)]
4101 fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
4102 let mut b0 = [0; 16usize];
4103 let mut b1 = [0; 16usize];
4104 b0.copy_from_slice(&a.val[0..16usize]);
4105 b1.copy_from_slice(&a.val[16usize..32usize]);
4106 (b0.simd_into(self), b1.simd_into(self))
4107 }
4108 #[inline(always)]
4109 fn splat_i32x16(self, a: i32) -> i32x16<Self> {
4110 let half = self.splat_i32x8(a);
4111 self.combine_i32x8(half, half)
4112 }
4113 #[inline(always)]
4114 fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
4115 let (a0, a1) = self.split_i32x16(a);
4116 self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
4117 }
4118 #[inline(always)]
4119 fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4120 let (a0, a1) = self.split_i32x16(a);
4121 let (b0, b1) = self.split_i32x16(b);
4122 self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
4123 }
4124 #[inline(always)]
4125 fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4126 let (a0, a1) = self.split_i32x16(a);
4127 let (b0, b1) = self.split_i32x16(b);
4128 self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
4129 }
4130 #[inline(always)]
4131 fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4132 let (a0, a1) = self.split_i32x16(a);
4133 let (b0, b1) = self.split_i32x16(b);
4134 self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
4135 }
4136 #[inline(always)]
4137 fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4138 let (a0, a1) = self.split_i32x16(a);
4139 let (b0, b1) = self.split_i32x16(b);
4140 self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
4141 }
4142 #[inline(always)]
4143 fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4144 let (a0, a1) = self.split_i32x16(a);
4145 let (b0, b1) = self.split_i32x16(b);
4146 self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
4147 }
4148 #[inline(always)]
4149 fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4150 let (a0, a1) = self.split_i32x16(a);
4151 let (b0, b1) = self.split_i32x16(b);
4152 self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
4153 }
4154 #[inline(always)]
4155 fn shr_i32x16(self, a: i32x16<Self>, b: u32) -> i32x16<Self> {
4156 let (a0, a1) = self.split_i32x16(a);
4157 self.combine_i32x8(self.shr_i32x8(a0, b), self.shr_i32x8(a1, b))
4158 }
4159 #[inline(always)]
4160 fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4161 let (a0, a1) = self.split_i32x16(a);
4162 let (b0, b1) = self.split_i32x16(b);
4163 self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1))
4164 }
4165 #[inline(always)]
4166 fn shl_i32x16(self, a: i32x16<Self>, b: u32) -> i32x16<Self> {
4167 let (a0, a1) = self.split_i32x16(a);
4168 self.combine_i32x8(self.shl_i32x8(a0, b), self.shl_i32x8(a1, b))
4169 }
4170 #[inline(always)]
4171 fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
4172 let (a0, a1) = self.split_i32x16(a);
4173 let (b0, b1) = self.split_i32x16(b);
4174 self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
4175 }
4176 #[inline(always)]
4177 fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
4178 let (a0, a1) = self.split_i32x16(a);
4179 let (b0, b1) = self.split_i32x16(b);
4180 self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
4181 }
4182 #[inline(always)]
4183 fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
4184 let (a0, a1) = self.split_i32x16(a);
4185 let (b0, b1) = self.split_i32x16(b);
4186 self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
4187 }
4188 #[inline(always)]
4189 fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
4190 let (a0, a1) = self.split_i32x16(a);
4191 let (b0, b1) = self.split_i32x16(b);
4192 self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
4193 }
4194 #[inline(always)]
4195 fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
4196 let (a0, a1) = self.split_i32x16(a);
4197 let (b0, b1) = self.split_i32x16(b);
4198 self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
4199 }
    #[inline(always)]
    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, _) = self.split_i32x16(a);
        let (b0, _) = self.split_i32x16(b);
        self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
    }
    #[inline(always)]
    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (_, a1) = self.split_i32x16(a);
        let (_, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
    }
    #[inline(always)]
    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
    }
    #[inline(always)]
    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
    }
    #[inline(always)]
    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        let (c0, c1) = self.split_i32x16(c);
        self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
    }
    #[inline(always)]
    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
    }
    #[inline(always)]
    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        let (b0, b1) = self.split_i32x16(b);
        self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
    }
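    // Splitting a 16-lane vector goes through two stack arrays that are copied
    // out of `val` and converted back into 8-lane vectors; the optimizer is
    // expected to lower this to plain register moves.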
    #[inline(always)]
    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
        let mut b0 = [0; 8usize];
        let mut b1 = [0; 8usize];
        b0.copy_from_slice(&a.val[0..8usize]);
        b1.copy_from_slice(&a.val[8usize..16usize]);
        (b0.simd_into(self), b1.simd_into(self))
    }
    #[inline(always)]
    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1))
    }
    #[inline(always)]
    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
        let (a0, a1) = self.split_i32x16(a);
        self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
    }
    #[inline(always)]
    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        self.combine_u32x8(
            self.reinterpret_u32_i32x8(a0),
            self.reinterpret_u32_i32x8(a1),
        )
    }
    #[inline(always)]
    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
        let (a0, a1) = self.split_i32x16(a);
        self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
    }
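    // The u32x16 operations below follow the same scheme as i32x16: split into
    // two 256-bit halves, apply the 8-lane operation, then recombine.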
    #[inline(always)]
    fn splat_u32x16(self, a: u32) -> u32x16<Self> {
        let half = self.splat_u32x8(a);
        self.combine_u32x8(half, half)
    }
    #[inline(always)]
    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
    }
    #[inline(always)]
    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
    }
    #[inline(always)]
    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
    }
    #[inline(always)]
    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
    }
    #[inline(always)]
    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
    }
    #[inline(always)]
    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
    }
    #[inline(always)]
    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
    }
    #[inline(always)]
    fn shr_u32x16(self, a: u32x16<Self>, b: u32) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        self.combine_u32x8(self.shr_u32x8(a0, b), self.shr_u32x8(a1, b))
    }
    #[inline(always)]
    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
    }
    #[inline(always)]
    fn shl_u32x16(self, a: u32x16<Self>, b: u32) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        self.combine_u32x8(self.shl_u32x8(a0, b), self.shl_u32x8(a1, b))
    }
    #[inline(always)]
    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
    }
    #[inline(always)]
    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
    }
    #[inline(always)]
    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
    }
    #[inline(always)]
    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
    }
    #[inline(always)]
    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
    }
    #[inline(always)]
    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, _) = self.split_u32x16(a);
        let (b0, _) = self.split_u32x16(b);
        self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
    }
    #[inline(always)]
    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (_, a1) = self.split_u32x16(a);
        let (_, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
    }
    #[inline(always)]
    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
    }
    #[inline(always)]
    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
    }
    #[inline(always)]
    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        let (c0, c1) = self.split_u32x16(c);
        self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
    }
    #[inline(always)]
    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
    }
    #[inline(always)]
    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        let (b0, b1) = self.split_u32x16(b);
        self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
    }
    #[inline(always)]
    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
        let mut b0 = [0; 8usize];
        let mut b1 = [0; 8usize];
        b0.copy_from_slice(&a.val[0..8usize]);
        b1.copy_from_slice(&a.val[8usize..16usize]);
        (b0.simd_into(self), b1.simd_into(self))
    }
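    // Loads the 16 values as four 128-bit rows and transposes the 4x4 matrix of
    // u32 lanes with unpacklo/unpackhi, so lane `i` of each 128-bit group ends
    // up in output row `i`.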
    #[inline(always)]
    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
        unsafe {
            let v0 = _mm_loadu_si128(src.as_ptr().add(0) as *const __m128i);
            let v1 = _mm_loadu_si128(src.as_ptr().add(4) as *const __m128i);
            let v2 = _mm_loadu_si128(src.as_ptr().add(8) as *const __m128i);
            let v3 = _mm_loadu_si128(src.as_ptr().add(12) as *const __m128i);
            let tmp0 = _mm_unpacklo_epi32(v0, v1);
            let tmp1 = _mm_unpackhi_epi32(v0, v1);
            let tmp2 = _mm_unpacklo_epi32(v2, v3);
            let tmp3 = _mm_unpackhi_epi32(v2, v3);
            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
            self.combine_u32x8(
                self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)),
                self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)),
            )
        }
    }
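    // The interleaved store currently has no dedicated intrinsic path here and
    // routes through the scalar Fallback implementation.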
    #[inline(always)]
    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
        let fb = crate::Fallback::new();
        fb.store_interleaved_128_u32x16(a.val.simd_into(fb), dest);
    }
    #[inline(always)]
    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
        let (a0, a1) = self.split_u32x16(a);
        self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
    }
    #[inline(always)]
    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
        let (a0, a1) = self.split_u32x16(a);
        self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
    }
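    // mask32x16 is handled the same way: split into two mask32x8 halves and
    // apply the 8-lane mask operation to each.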
    #[inline(always)]
    fn splat_mask32x16(self, a: i32) -> mask32x16<Self> {
        let half = self.splat_mask32x8(a);
        self.combine_mask32x8(half, half)
    }
    #[inline(always)]
    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
    }
    #[inline(always)]
    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
    }
    #[inline(always)]
    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
    }
    #[inline(always)]
    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
    }
    #[inline(always)]
    fn select_mask32x16(
        self,
        a: mask32x16<Self>,
        b: mask32x16<Self>,
        c: mask32x16<Self>,
    ) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        let (c0, c1) = self.split_mask32x16(c);
        self.combine_mask32x8(
            self.select_mask32x8(a0, b0, c0),
            self.select_mask32x8(a1, b1, c1),
        )
    }
    #[inline(always)]
    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
    }
    #[inline(always)]
    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
        let mut b0 = [0; 8usize];
        let mut b1 = [0; 8usize];
        b0.copy_from_slice(&a.val[0..8usize]);
        b1.copy_from_slice(&a.val[8usize..16usize]);
        (b0.simd_into(self), b1.simd_into(self))
    }
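    // f64x8 is emulated as two f64x4 halves; 256 bits is the widest f64 vector
    // available at the AVX2 level.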
    #[inline(always)]
    fn splat_f64x8(self, a: f64) -> f64x8<Self> {
        let half = self.splat_f64x4(a);
        self.combine_f64x4(half, half)
    }
    #[inline(always)]
    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
    }
    #[inline(always)]
    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
    }
    #[inline(always)]
    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
    }
    #[inline(always)]
    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
    }
    #[inline(always)]
    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
    }
    #[inline(always)]
    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
    }
    #[inline(always)]
    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
    }
    #[inline(always)]
    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
    }
    #[inline(always)]
    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, _) = self.split_f64x8(a);
        let (b0, _) = self.split_f64x8(b);
        self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
    }
    #[inline(always)]
    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (_, a1) = self.split_f64x8(a);
        let (_, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
    }
    #[inline(always)]
    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
    }
    #[inline(always)]
    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
    }
    #[inline(always)]
    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
    }
    #[inline(always)]
    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(
            self.max_precise_f64x4(a0, b0),
            self.max_precise_f64x4(a1, b1),
        )
    }
    #[inline(always)]
    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
    }
    #[inline(always)]
    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(
            self.min_precise_f64x4(a0, b0),
            self.min_precise_f64x4(a1, b1),
        )
    }
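    // Fused multiply-add/subtract also delegate to the f64x4 implementations,
    // which can take advantage of the FMA feature required by this token.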
    #[inline(always)]
    fn madd_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        let (c0, c1) = self.split_f64x8(c);
        self.combine_f64x4(self.madd_f64x4(a0, b0, c0), self.madd_f64x4(a1, b1, c1))
    }
    #[inline(always)]
    fn msub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        let (c0, c1) = self.split_f64x8(c);
        self.combine_f64x4(self.msub_f64x4(a0, b0, c0), self.msub_f64x4(a1, b1, c1))
    }
    #[inline(always)]
    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
    }
    #[inline(always)]
    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
    }
    #[inline(always)]
    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
    }
    #[inline(always)]
    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        let (c0, c1) = self.split_f64x8(c);
        self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
    }
    #[inline(always)]
    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
        let mut b0 = [0.0; 4usize];
        let mut b1 = [0.0; 4usize];
        b0.copy_from_slice(&a.val[0..4usize]);
        b1.copy_from_slice(&a.val[4usize..8usize]);
        (b0.simd_into(self), b1.simd_into(self))
    }
    #[inline(always)]
    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f32x8(
            self.reinterpret_f32_f64x4(a0),
            self.reinterpret_f32_f64x4(a1),
        )
    }
    #[inline(always)]
    fn splat_mask64x8(self, a: i64) -> mask64x8<Self> {
        let half = self.splat_mask64x4(a);
        self.combine_mask64x4(half, half)
    }
    #[inline(always)]
    fn not_mask64x8(self, a: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1))
    }
    #[inline(always)]
    fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn or_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn xor_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn select_mask64x8(
        self,
        a: mask64x8<Self>,
        b: mask64x8<Self>,
        c: mask64x8<Self>,
    ) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        let (c0, c1) = self.split_mask64x8(c);
        self.combine_mask64x4(
            self.select_mask64x4(a0, b0, c0),
            self.select_mask64x4(a1, b1, c1),
        )
    }
    #[inline(always)]
    fn simd_eq_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) {
        let mut b0 = [0; 4usize];
        let mut b1 = [0; 4usize];
        b0.copy_from_slice(&a.val[0..4usize]);
        b1.copy_from_slice(&a.val[4usize..8usize]);
        (b0.simd_into(self), b1.simd_into(self))
    }
}
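// Conversions between the portable vector wrappers and the raw AVX register
// types. The transmutes rely on the wrapper's `val` array having the same size
// and layout as the corresponding 256-bit `__m256`/`__m256i`/`__m256d` value
// (e.g. `[f32; 8]` for `f32x8`).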
impl<S: Simd> SimdFrom<__m256, S> for f32x8<S> {
    #[inline(always)]
    fn simd_from(arch: __m256, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<f32x8<S>> for __m256 {
    #[inline(always)]
    fn from(value: f32x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for i8x32<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i8x32<S>> for __m256i {
    #[inline(always)]
    fn from(value: i8x32<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for u8x32<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u8x32<S>> for __m256i {
    #[inline(always)]
    fn from(value: u8x32<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for mask8x32<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask8x32<S>> for __m256i {
    #[inline(always)]
    fn from(value: mask8x32<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for i16x16<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i16x16<S>> for __m256i {
    #[inline(always)]
    fn from(value: i16x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for u16x16<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u16x16<S>> for __m256i {
    #[inline(always)]
    fn from(value: u16x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for mask16x16<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask16x16<S>> for __m256i {
    #[inline(always)]
    fn from(value: mask16x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for i32x8<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i32x8<S>> for __m256i {
    #[inline(always)]
    fn from(value: i32x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for u32x8<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u32x8<S>> for __m256i {
    #[inline(always)]
    fn from(value: u32x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for mask32x8<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask32x8<S>> for __m256i {
    #[inline(always)]
    fn from(value: mask32x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256d, S> for f64x4<S> {
    #[inline(always)]
    fn simd_from(arch: __m256d, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<f64x4<S>> for __m256d {
    #[inline(always)]
    fn from(value: f64x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for mask64x4<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask64x4<S>> for __m256i {
    #[inline(always)]
    fn from(value: mask64x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}