#![expect(
    unused_variables,
    clippy::todo,
    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
)]
use crate::{Level, Simd, SimdFrom, SimdInto, seal::Seal};
use crate::{
    f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
    u32x4, u32x8, u32x16,
};
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use core::ops::*;
#[doc = r#" The SIMD token for the "SSE 4.2" level."#]
#[derive(Clone, Copy, Debug)]
pub struct Sse4_2 {
    pub sse4_2: crate::core_arch::x86::Sse4_2,
}
impl Sse4_2 {
    #[doc = r" Create a SIMD token."]
    #[doc = r""]
    #[doc = r" # Safety"]
    #[doc = r""]
    #[doc = r" The SSE4.2 CPU feature must be available."]
    #[inline]
    pub unsafe fn new_unchecked() -> Self {
        Sse4_2 {
            sse4_2: unsafe { crate::core_arch::x86::Sse4_2::new_unchecked() },
        }
    }
}
impl Seal for Sse4_2 {}
impl Simd for Sse4_2 {
    type f32s = f32x4<Self>;
    type u8s = u8x16<Self>;
    type i8s = i8x16<Self>;
    type u16s = u16x8<Self>;
    type i16s = i16x8<Self>;
    type u32s = u32x4<Self>;
    type i32s = i32x4<Self>;
    type mask8s = mask8x16<Self>;
    type mask16s = mask16x8<Self>;
    type mask32s = mask32x4<Self>;
    #[inline(always)]
    fn level(self) -> Level {
        Level::Sse4_2(self)
    }
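    // `vectorize` re-enters the caller's closure from inside a
    // `#[target_feature(enable = "sse4.2")]` function, so the compiler may
    // assume SSE4.2 when optimizing `f`. The call is sound because holding a
    // `Sse4_2` token already proves the feature is available.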
    #[inline]
    fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R {
        #[target_feature(enable = "sse4.2")]
        #[inline]
        unsafe fn vectorize_sse4_2<F: FnOnce() -> R, R>(f: F) -> R {
            f()
        }
        unsafe { vectorize_sse4_2(f) }
    }
66 #[inline(always)]
67 fn splat_f32x4(self, val: f32) -> f32x4<Self> {
68 unsafe { _mm_set1_ps(val).simd_into(self) }
69 }
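    // Sign-bit tricks: `abs` clears the sign bit by AND-NOT with -0.0, and
    // `neg` flips it by XOR with -0.0, avoiding any floating-point arithmetic.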
70 #[inline(always)]
71 fn abs_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
72 unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) }
73 }
74 #[inline(always)]
75 fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
76 unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) }
77 }
78 #[inline(always)]
79 fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
80 unsafe { _mm_sqrt_ps(a.into()).simd_into(self) }
81 }
82 #[inline(always)]
83 fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
84 unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) }
85 }
86 #[inline(always)]
87 fn sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
88 unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) }
89 }
90 #[inline(always)]
91 fn mul_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
92 unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) }
93 }
94 #[inline(always)]
95 fn div_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
96 unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) }
97 }
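    // `copysign` blends the sign bit of `b` onto the magnitude of `a`:
    // (sign_mask & b) | (!sign_mask & a), where sign_mask = -0.0 (only the MSB set).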
98 #[inline(always)]
99 fn copysign_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
100 unsafe {
101 let mask = _mm_set1_ps(-0.0);
102 _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self)
103 }
104 }
105 #[inline(always)]
106 fn simd_eq_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
107 unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) }
108 }
109 #[inline(always)]
110 fn simd_lt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
111 unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) }
112 }
113 #[inline(always)]
114 fn simd_le_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
115 unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) }
116 }
117 #[inline(always)]
118 fn simd_ge_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
119 unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) }
120 }
121 #[inline(always)]
122 fn simd_gt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
123 unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) }
124 }
125 #[inline(always)]
126 fn zip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
127 unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) }
128 }
129 #[inline(always)]
130 fn zip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
131 unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) }
132 }
133 #[inline(always)]
134 fn unzip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
135 unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) }
136 }
137 #[inline(always)]
138 fn unzip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
139 unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) }
140 }
141 #[inline(always)]
142 fn max_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
143 unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
144 }
145 #[inline(always)]
146 fn max_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
147 unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
148 }
149 #[inline(always)]
150 fn min_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
151 unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
152 }
153 #[inline(always)]
154 fn min_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
155 unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
156 }
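    // SSE4.2 has no fused multiply-add, so `madd`/`msub` lower to a separate
    // multiply and add/subtract (the result is rounded twice, unlike a true FMA).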
157 #[inline(always)]
158 fn madd_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
159 a + b * c
160 }
161 #[inline(always)]
162 fn msub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
163 a - b * c
164 }
165 #[inline(always)]
166 fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
167 unsafe { _mm_floor_ps(a.into()).simd_into(self) }
168 }
169 #[inline(always)]
170 fn fract_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
171 a - a.trunc()
172 }
173 #[inline(always)]
174 fn trunc_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
175 unsafe { _mm_round_ps(a.into(), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC).simd_into(self) }
176 }
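    // `select` is a bitwise blend: each mask lane is expected to be all ones
    // (take the lane from `b`) or all zeros (take the lane from `c`).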
177 #[inline(always)]
178 fn select_f32x4(self, a: mask32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
179 unsafe {
180 let mask = _mm_castsi128_ps(a.into());
181 _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, c.into())).simd_into(self)
182 }
183 }
184 #[inline(always)]
185 fn combine_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x8<Self> {
186 let mut result = [0.0; 8usize];
187 result[0..4usize].copy_from_slice(&a.val);
188 result[4usize..8usize].copy_from_slice(&b.val);
189 result.simd_into(self)
190 }
191 #[inline(always)]
192 fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> {
193 f64x2 {
194 val: bytemuck::cast(a.val),
195 simd: a.simd,
196 }
197 }
198 #[inline(always)]
199 fn reinterpret_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
200 i32x4 {
201 val: bytemuck::cast(a.val),
202 simd: a.simd,
203 }
204 }
205 #[inline(always)]
206 fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
207 u8x16 {
208 val: bytemuck::cast(a.val),
209 simd: a.simd,
210 }
211 }
212 #[inline(always)]
213 fn reinterpret_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
214 u32x4 {
215 val: bytemuck::cast(a.val),
216 simd: a.simd,
217 }
218 }
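    // SSE4.2 has no unsigned float-to-int conversion, so negative inputs are
    // clamped to zero before the signed `_mm_cvtps_epi32` conversion.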
219 #[inline(always)]
220 fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
221 unsafe {
222 _mm_cvtps_epi32(_mm_max_ps(_mm_floor_ps(a.into()), _mm_set1_ps(0.0))).simd_into(self)
223 }
224 }
225 #[inline(always)]
226 fn cvt_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
227 unsafe { _mm_cvtps_epi32(a.trunc().into()).simd_into(self) }
228 }
229 #[inline(always)]
230 fn splat_i8x16(self, val: i8) -> i8x16<Self> {
231 unsafe { _mm_set1_epi8(val).simd_into(self) }
232 }
233 #[inline(always)]
234 fn not_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
235 a ^ !0
236 }
237 #[inline(always)]
238 fn add_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
239 unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
240 }
241 #[inline(always)]
242 fn sub_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
243 unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
244 }
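    // 8-bit multiplication is still unimplemented here (see the `expect` at the
    // top of the file and the linked issue). One possible approach, sketched
    // only as a hint: widen both operands to 16 bits with unpack, multiply with
    // `_mm_mullo_epi16`, and gather the low bytes back with a shuffle.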
245 #[inline(always)]
246 fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
247 todo!()
248 }
249 #[inline(always)]
250 fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
251 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
252 }
253 #[inline(always)]
254 fn or_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
255 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
256 }
257 #[inline(always)]
258 fn xor_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
259 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
260 }
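    // SSE has no 8-bit shifts, so the lanes are sign-extended to 16 bits (by
    // interleaving with a `cmplt` sign mask), shifted, and re-packed with
    // signed saturation; the shifted values stay in range, so nothing saturates.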
261 #[inline(always)]
262 fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
263 unsafe {
264 let val = a.into();
265 let shift_count = _mm_cvtsi32_si128(shift as i32);
266 let lo_16 = _mm_unpacklo_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128()));
267 let hi_16 = _mm_unpackhi_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128()));
268 let lo_shifted = _mm_sra_epi16(lo_16, shift_count);
269 let hi_shifted = _mm_sra_epi16(hi_16, shift_count);
270 _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
271 }
272 }
273 #[inline(always)]
274 fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
275 unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
276 }
277 #[inline(always)]
278 fn simd_lt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
279 unsafe { _mm_cmplt_epi8(a.into(), b.into()).simd_into(self) }
280 }
281 #[inline(always)]
282 fn simd_le_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
283 unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) }
284 }
285 #[inline(always)]
286 fn simd_ge_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
287 unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) }
288 }
289 #[inline(always)]
290 fn simd_gt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
291 unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) }
292 }
293 #[inline(always)]
294 fn zip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
295 unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
296 }
297 #[inline(always)]
298 fn zip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
299 unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
300 }
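    // The unzip operations use `pshufb` to gather the even (resp. odd) bytes of
    // each operand into its low 64 bits, then join the two halves with
    // `_mm_unpacklo_epi64`.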
301 #[inline(always)]
302 fn unzip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
303 unsafe {
304 let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14);
305 let t1 = _mm_shuffle_epi8(a.into(), mask);
306 let t2 = _mm_shuffle_epi8(b.into(), mask);
307 _mm_unpacklo_epi64(t1, t2).simd_into(self)
308 }
309 }
310 #[inline(always)]
311 fn unzip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
312 unsafe {
313 let mask = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15);
314 let t1 = _mm_shuffle_epi8(a.into(), mask);
315 let t2 = _mm_shuffle_epi8(b.into(), mask);
316 _mm_unpacklo_epi64(t1, t2).simd_into(self)
317 }
318 }
319 #[inline(always)]
320 fn select_i8x16(self, a: mask8x16<Self>, b: i8x16<Self>, c: i8x16<Self>) -> i8x16<Self> {
321 unsafe {
322 _mm_or_si128(
323 _mm_and_si128(a.into(), b.into()),
324 _mm_andnot_si128(a.into(), c.into()),
325 )
326 .simd_into(self)
327 }
328 }
329 #[inline(always)]
330 fn min_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
331 unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) }
332 }
333 #[inline(always)]
334 fn max_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
335 unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) }
336 }
337 #[inline(always)]
338 fn combine_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x32<Self> {
339 let mut result = [0; 32usize];
340 result[0..16usize].copy_from_slice(&a.val);
341 result[16usize..32usize].copy_from_slice(&b.val);
342 result.simd_into(self)
343 }
344 #[inline(always)]
345 fn reinterpret_u8_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
346 u8x16 {
347 val: bytemuck::cast(a.val),
348 simd: a.simd,
349 }
350 }
351 #[inline(always)]
352 fn reinterpret_u32_i8x16(self, a: i8x16<Self>) -> u32x4<Self> {
353 u32x4 {
354 val: bytemuck::cast(a.val),
355 simd: a.simd,
356 }
357 }
358 #[inline(always)]
359 fn splat_u8x16(self, val: u8) -> u8x16<Self> {
360 unsafe { _mm_set1_epi8(val as _).simd_into(self) }
361 }
362 #[inline(always)]
363 fn not_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
364 a ^ !0
365 }
366 #[inline(always)]
367 fn add_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
368 unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
369 }
370 #[inline(always)]
371 fn sub_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
372 unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
373 }
374 #[inline(always)]
375 fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
376 todo!()
377 }
378 #[inline(always)]
379 fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
380 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
381 }
382 #[inline(always)]
383 fn or_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
384 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
385 }
386 #[inline(always)]
387 fn xor_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
388 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
389 }
390 #[inline(always)]
391 fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
392 unsafe {
393 let val = a.into();
394 let shift_count = _mm_cvtsi32_si128(shift as i32);
395 let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
396 let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
397 let lo_shifted = _mm_srl_epi16(lo_16, shift_count);
398 let hi_shifted = _mm_srl_epi16(hi_16, shift_count);
399 _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
400 }
401 }
    #[inline(always)]
    fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
        // Bitwise equality is signedness-agnostic, so no sign-bias is needed.
        unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
    }
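    // Unsigned ordering is emulated by flipping the sign bit of both operands
    // (XOR with 0x80) and then comparing as signed integers.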
411 #[inline(always)]
412 fn simd_lt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
413 unsafe {
414 let sign_bit = _mm_set1_epi8(0x80u8 as _);
415 let a_signed = _mm_xor_si128(a.into(), sign_bit);
416 let b_signed = _mm_xor_si128(b.into(), sign_bit);
417 _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self)
418 }
419 }
420 #[inline(always)]
421 fn simd_le_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
422 unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) }
423 }
424 #[inline(always)]
425 fn simd_ge_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
426 unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) }
427 }
428 #[inline(always)]
429 fn simd_gt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
430 unsafe {
431 let sign_bit = _mm_set1_epi8(0x80u8 as _);
432 let a_signed = _mm_xor_si128(a.into(), sign_bit);
433 let b_signed = _mm_xor_si128(b.into(), sign_bit);
434 _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self)
435 }
436 }
437 #[inline(always)]
438 fn zip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
439 unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
440 }
441 #[inline(always)]
442 fn zip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
443 unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
444 }
445 #[inline(always)]
446 fn unzip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
447 unsafe {
448 let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14);
449 let t1 = _mm_shuffle_epi8(a.into(), mask);
450 let t2 = _mm_shuffle_epi8(b.into(), mask);
451 _mm_unpacklo_epi64(t1, t2).simd_into(self)
452 }
453 }
454 #[inline(always)]
455 fn unzip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
456 unsafe {
457 let mask = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15);
458 let t1 = _mm_shuffle_epi8(a.into(), mask);
459 let t2 = _mm_shuffle_epi8(b.into(), mask);
460 _mm_unpacklo_epi64(t1, t2).simd_into(self)
461 }
462 }
463 #[inline(always)]
464 fn select_u8x16(self, a: mask8x16<Self>, b: u8x16<Self>, c: u8x16<Self>) -> u8x16<Self> {
465 unsafe {
466 _mm_or_si128(
467 _mm_and_si128(a.into(), b.into()),
468 _mm_andnot_si128(a.into(), c.into()),
469 )
470 .simd_into(self)
471 }
472 }
473 #[inline(always)]
474 fn min_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
475 unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) }
476 }
477 #[inline(always)]
478 fn max_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
479 unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) }
480 }
481 #[inline(always)]
482 fn combine_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x32<Self> {
483 let mut result = [0; 32usize];
484 result[0..16usize].copy_from_slice(&a.val);
485 result[16usize..32usize].copy_from_slice(&b.val);
486 result.simd_into(self)
487 }
    #[inline(always)]
    fn widen_u8x16(self, a: u8x16<Self>) -> u16x16<Self> {
        unsafe {
            let raw = a.into();
            // `_mm_cvtepu8_epi16` zero-extends the low eight bytes; the upper
            // eight are shifted down first so they widen the same way.
            let low = _mm_cvtepu8_epi16(raw).simd_into(self);
            let high = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(raw)).simd_into(self);
            self.combine_u16x8(low, high)
        }
    }
497 #[inline(always)]
498 fn reinterpret_u32_u8x16(self, a: u8x16<Self>) -> u32x4<Self> {
499 u32x4 {
500 val: bytemuck::cast(a.val),
501 simd: a.simd,
502 }
503 }
504 #[inline(always)]
505 fn splat_mask8x16(self, val: i8) -> mask8x16<Self> {
506 unsafe { _mm_set1_epi8(val).simd_into(self) }
507 }
508 #[inline(always)]
509 fn not_mask8x16(self, a: mask8x16<Self>) -> mask8x16<Self> {
510 a ^ !0
511 }
512 #[inline(always)]
513 fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
514 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
515 }
516 #[inline(always)]
517 fn or_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
518 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
519 }
520 #[inline(always)]
521 fn xor_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
522 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
523 }
524 #[inline(always)]
525 fn select_mask8x16(
526 self,
527 a: mask8x16<Self>,
528 b: mask8x16<Self>,
529 c: mask8x16<Self>,
530 ) -> mask8x16<Self> {
531 unsafe {
532 _mm_or_si128(
533 _mm_and_si128(a.into(), b.into()),
534 _mm_andnot_si128(a.into(), c.into()),
535 )
536 .simd_into(self)
537 }
538 }
539 #[inline(always)]
540 fn simd_eq_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
541 unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
542 }
543 #[inline(always)]
544 fn combine_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x32<Self> {
545 let mut result = [0; 32usize];
546 result[0..16usize].copy_from_slice(&a.val);
547 result[16usize..32usize].copy_from_slice(&b.val);
548 result.simd_into(self)
549 }
550 #[inline(always)]
551 fn splat_i16x8(self, val: i16) -> i16x8<Self> {
552 unsafe { _mm_set1_epi16(val).simd_into(self) }
553 }
554 #[inline(always)]
555 fn not_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
556 a ^ !0
557 }
558 #[inline(always)]
559 fn add_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
560 unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
561 }
562 #[inline(always)]
563 fn sub_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
564 unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
565 }
566 #[inline(always)]
567 fn mul_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
568 unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
569 }
570 #[inline(always)]
571 fn and_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
572 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
573 }
574 #[inline(always)]
575 fn or_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
576 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
577 }
578 #[inline(always)]
579 fn xor_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
580 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
581 }
582 #[inline(always)]
583 fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
584 unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
585 }
586 #[inline(always)]
587 fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
588 unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
589 }
590 #[inline(always)]
591 fn simd_lt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
592 unsafe { _mm_cmplt_epi16(a.into(), b.into()).simd_into(self) }
593 }
594 #[inline(always)]
595 fn simd_le_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
596 unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) }
597 }
598 #[inline(always)]
599 fn simd_ge_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
600 unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) }
601 }
602 #[inline(always)]
603 fn simd_gt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
604 unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) }
605 }
606 #[inline(always)]
607 fn zip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
608 unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
609 }
610 #[inline(always)]
611 fn zip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
612 unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
613 }
614 #[inline(always)]
615 fn unzip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
616 unsafe {
617 let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13);
618 let t1 = _mm_shuffle_epi8(a.into(), mask);
619 let t2 = _mm_shuffle_epi8(b.into(), mask);
620 _mm_unpacklo_epi64(t1, t2).simd_into(self)
621 }
622 }
623 #[inline(always)]
624 fn unzip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
625 unsafe {
626 let mask = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
627 let t1 = _mm_shuffle_epi8(a.into(), mask);
628 let t2 = _mm_shuffle_epi8(b.into(), mask);
629 _mm_unpacklo_epi64(t1, t2).simd_into(self)
630 }
631 }
632 #[inline(always)]
633 fn select_i16x8(self, a: mask16x8<Self>, b: i16x8<Self>, c: i16x8<Self>) -> i16x8<Self> {
634 unsafe {
635 _mm_or_si128(
636 _mm_and_si128(a.into(), b.into()),
637 _mm_andnot_si128(a.into(), c.into()),
638 )
639 .simd_into(self)
640 }
641 }
642 #[inline(always)]
643 fn min_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
644 unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) }
645 }
646 #[inline(always)]
647 fn max_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
648 unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) }
649 }
650 #[inline(always)]
651 fn combine_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x16<Self> {
652 let mut result = [0; 16usize];
653 result[0..8usize].copy_from_slice(&a.val);
654 result[8usize..16usize].copy_from_slice(&b.val);
655 result.simd_into(self)
656 }
657 #[inline(always)]
658 fn reinterpret_u8_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
659 u8x16 {
660 val: bytemuck::cast(a.val),
661 simd: a.simd,
662 }
663 }
664 #[inline(always)]
665 fn reinterpret_u32_i16x8(self, a: i16x8<Self>) -> u32x4<Self> {
666 u32x4 {
667 val: bytemuck::cast(a.val),
668 simd: a.simd,
669 }
670 }
671 #[inline(always)]
672 fn splat_u16x8(self, val: u16) -> u16x8<Self> {
673 unsafe { _mm_set1_epi16(val as _).simd_into(self) }
674 }
675 #[inline(always)]
676 fn not_u16x8(self, a: u16x8<Self>) -> u16x8<Self> {
677 a ^ !0
678 }
679 #[inline(always)]
680 fn add_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
681 unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
682 }
683 #[inline(always)]
684 fn sub_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
685 unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
686 }
687 #[inline(always)]
688 fn mul_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
689 unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
690 }
691 #[inline(always)]
692 fn and_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
693 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
694 }
695 #[inline(always)]
696 fn or_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
697 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
698 }
699 #[inline(always)]
700 fn xor_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
701 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
702 }
703 #[inline(always)]
704 fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
705 unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
706 }
    #[inline(always)]
    fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
        // Bitwise equality is signedness-agnostic, so no sign-bias is needed.
        unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
    }
716 #[inline(always)]
717 fn simd_lt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
718 unsafe {
719 let sign_bit = _mm_set1_epi16(0x8000u16 as _);
720 let a_signed = _mm_xor_si128(a.into(), sign_bit);
721 let b_signed = _mm_xor_si128(b.into(), sign_bit);
722 _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self)
723 }
724 }
725 #[inline(always)]
726 fn simd_le_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
727 unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) }
728 }
729 #[inline(always)]
730 fn simd_ge_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
731 unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) }
732 }
733 #[inline(always)]
734 fn simd_gt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
735 unsafe {
736 let sign_bit = _mm_set1_epi16(0x8000u16 as _);
737 let a_signed = _mm_xor_si128(a.into(), sign_bit);
738 let b_signed = _mm_xor_si128(b.into(), sign_bit);
739 _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self)
740 }
741 }
742 #[inline(always)]
743 fn zip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
744 unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
745 }
746 #[inline(always)]
747 fn zip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
748 unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
749 }
750 #[inline(always)]
751 fn unzip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
752 unsafe {
753 let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13);
754 let t1 = _mm_shuffle_epi8(a.into(), mask);
755 let t2 = _mm_shuffle_epi8(b.into(), mask);
756 _mm_unpacklo_epi64(t1, t2).simd_into(self)
757 }
758 }
759 #[inline(always)]
760 fn unzip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
761 unsafe {
762 let mask = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
763 let t1 = _mm_shuffle_epi8(a.into(), mask);
764 let t2 = _mm_shuffle_epi8(b.into(), mask);
765 _mm_unpacklo_epi64(t1, t2).simd_into(self)
766 }
767 }
768 #[inline(always)]
769 fn select_u16x8(self, a: mask16x8<Self>, b: u16x8<Self>, c: u16x8<Self>) -> u16x8<Self> {
770 unsafe {
771 _mm_or_si128(
772 _mm_and_si128(a.into(), b.into()),
773 _mm_andnot_si128(a.into(), c.into()),
774 )
775 .simd_into(self)
776 }
777 }
778 #[inline(always)]
779 fn min_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
780 unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) }
781 }
782 #[inline(always)]
783 fn max_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
784 unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) }
785 }
786 #[inline(always)]
787 fn combine_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x16<Self> {
788 let mut result = [0; 16usize];
789 result[0..8usize].copy_from_slice(&a.val);
790 result[8usize..16usize].copy_from_slice(&b.val);
791 result.simd_into(self)
792 }
793 #[inline(always)]
794 fn reinterpret_u8_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
795 u8x16 {
796 val: bytemuck::cast(a.val),
797 simd: a.simd,
798 }
799 }
800 #[inline(always)]
801 fn reinterpret_u32_u16x8(self, a: u16x8<Self>) -> u32x4<Self> {
802 u32x4 {
803 val: bytemuck::cast(a.val),
804 simd: a.simd,
805 }
806 }
807 #[inline(always)]
808 fn splat_mask16x8(self, val: i16) -> mask16x8<Self> {
809 unsafe { _mm_set1_epi16(val).simd_into(self) }
810 }
811 #[inline(always)]
812 fn not_mask16x8(self, a: mask16x8<Self>) -> mask16x8<Self> {
813 a ^ !0
814 }
815 #[inline(always)]
816 fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
817 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
818 }
819 #[inline(always)]
820 fn or_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
821 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
822 }
823 #[inline(always)]
824 fn xor_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
825 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
826 }
827 #[inline(always)]
828 fn select_mask16x8(
829 self,
830 a: mask16x8<Self>,
831 b: mask16x8<Self>,
832 c: mask16x8<Self>,
833 ) -> mask16x8<Self> {
834 unsafe {
835 _mm_or_si128(
836 _mm_and_si128(a.into(), b.into()),
837 _mm_andnot_si128(a.into(), c.into()),
838 )
839 .simd_into(self)
840 }
841 }
842 #[inline(always)]
843 fn simd_eq_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
844 unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
845 }
846 #[inline(always)]
847 fn combine_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x16<Self> {
848 let mut result = [0; 16usize];
849 result[0..8usize].copy_from_slice(&a.val);
850 result[8usize..16usize].copy_from_slice(&b.val);
851 result.simd_into(self)
852 }
853 #[inline(always)]
854 fn splat_i32x4(self, val: i32) -> i32x4<Self> {
855 unsafe { _mm_set1_epi32(val).simd_into(self) }
856 }
857 #[inline(always)]
858 fn not_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
859 a ^ !0
860 }
861 #[inline(always)]
862 fn add_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
863 unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
864 }
865 #[inline(always)]
866 fn sub_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
867 unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
868 }
869 #[inline(always)]
870 fn mul_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
871 unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
872 }
873 #[inline(always)]
874 fn and_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
875 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
876 }
877 #[inline(always)]
878 fn or_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
879 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
880 }
881 #[inline(always)]
882 fn xor_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
883 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
884 }
885 #[inline(always)]
886 fn shr_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
887 unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
888 }
889 #[inline(always)]
890 fn simd_eq_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
891 unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
892 }
893 #[inline(always)]
894 fn simd_lt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
895 unsafe { _mm_cmplt_epi32(a.into(), b.into()).simd_into(self) }
896 }
897 #[inline(always)]
898 fn simd_le_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
899 unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) }
900 }
901 #[inline(always)]
902 fn simd_ge_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
903 unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) }
904 }
905 #[inline(always)]
906 fn simd_gt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
907 unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) }
908 }
909 #[inline(always)]
910 fn zip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
911 unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
912 }
913 #[inline(always)]
914 fn zip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
915 unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
916 }
917 #[inline(always)]
918 fn unzip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
919 unsafe {
920 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
921 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
922 _mm_unpacklo_epi64(t1, t2).simd_into(self)
923 }
924 }
925 #[inline(always)]
926 fn unzip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
927 unsafe {
928 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
929 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
930 _mm_unpackhi_epi64(t1, t2).simd_into(self)
931 }
932 }
933 #[inline(always)]
934 fn select_i32x4(self, a: mask32x4<Self>, b: i32x4<Self>, c: i32x4<Self>) -> i32x4<Self> {
935 unsafe {
936 _mm_or_si128(
937 _mm_and_si128(a.into(), b.into()),
938 _mm_andnot_si128(a.into(), c.into()),
939 )
940 .simd_into(self)
941 }
942 }
943 #[inline(always)]
944 fn min_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
945 unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) }
946 }
947 #[inline(always)]
948 fn max_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
949 unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) }
950 }
951 #[inline(always)]
952 fn combine_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x8<Self> {
953 let mut result = [0; 8usize];
954 result[0..4usize].copy_from_slice(&a.val);
955 result[4usize..8usize].copy_from_slice(&b.val);
956 result.simd_into(self)
957 }
958 #[inline(always)]
959 fn reinterpret_u8_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
960 u8x16 {
961 val: bytemuck::cast(a.val),
962 simd: a.simd,
963 }
964 }
965 #[inline(always)]
966 fn reinterpret_u32_i32x4(self, a: i32x4<Self>) -> u32x4<Self> {
967 u32x4 {
968 val: bytemuck::cast(a.val),
969 simd: a.simd,
970 }
971 }
972 #[inline(always)]
973 fn cvt_f32_i32x4(self, a: i32x4<Self>) -> f32x4<Self> {
974 unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) }
975 }
976 #[inline(always)]
977 fn splat_u32x4(self, val: u32) -> u32x4<Self> {
978 unsafe { _mm_set1_epi32(val as _).simd_into(self) }
979 }
980 #[inline(always)]
981 fn not_u32x4(self, a: u32x4<Self>) -> u32x4<Self> {
982 a ^ !0
983 }
984 #[inline(always)]
985 fn add_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
986 unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
987 }
988 #[inline(always)]
989 fn sub_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
990 unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
991 }
992 #[inline(always)]
993 fn mul_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
994 unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
995 }
996 #[inline(always)]
997 fn and_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
998 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
999 }
1000 #[inline(always)]
1001 fn or_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1002 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1003 }
1004 #[inline(always)]
1005 fn xor_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1006 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1007 }
1008 #[inline(always)]
1009 fn shr_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
1010 unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
1011 }
    #[inline(always)]
    fn simd_eq_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
        // Bitwise equality is signedness-agnostic, so no sign-bias is needed.
        unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
    }
1021 #[inline(always)]
1022 fn simd_lt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1023 unsafe {
1024 let sign_bit = _mm_set1_epi32(0x80000000u32 as _);
1025 let a_signed = _mm_xor_si128(a.into(), sign_bit);
1026 let b_signed = _mm_xor_si128(b.into(), sign_bit);
1027 _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self)
1028 }
1029 }
1030 #[inline(always)]
1031 fn simd_le_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1032 unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1033 }
1034 #[inline(always)]
1035 fn simd_ge_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1036 unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1037 }
1038 #[inline(always)]
1039 fn simd_gt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1040 unsafe {
1041 let sign_bit = _mm_set1_epi32(0x80000000u32 as _);
1042 let a_signed = _mm_xor_si128(a.into(), sign_bit);
1043 let b_signed = _mm_xor_si128(b.into(), sign_bit);
1044 _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self)
1045 }
1046 }
1047 #[inline(always)]
1048 fn zip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1049 unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
1050 }
1051 #[inline(always)]
1052 fn zip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1053 unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
1054 }
1055 #[inline(always)]
1056 fn unzip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1057 unsafe {
1058 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1059 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1060 _mm_unpacklo_epi64(t1, t2).simd_into(self)
1061 }
1062 }
1063 #[inline(always)]
1064 fn unzip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1065 unsafe {
1066 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1067 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1068 _mm_unpackhi_epi64(t1, t2).simd_into(self)
1069 }
1070 }
1071 #[inline(always)]
1072 fn select_u32x4(self, a: mask32x4<Self>, b: u32x4<Self>, c: u32x4<Self>) -> u32x4<Self> {
1073 unsafe {
1074 _mm_or_si128(
1075 _mm_and_si128(a.into(), b.into()),
1076 _mm_andnot_si128(a.into(), c.into()),
1077 )
1078 .simd_into(self)
1079 }
1080 }
1081 #[inline(always)]
1082 fn min_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1083 unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) }
1084 }
1085 #[inline(always)]
1086 fn max_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1087 unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) }
1088 }
1089 #[inline(always)]
1090 fn combine_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x8<Self> {
1091 let mut result = [0; 8usize];
1092 result[0..4usize].copy_from_slice(&a.val);
1093 result[4usize..8usize].copy_from_slice(&b.val);
1094 result.simd_into(self)
1095 }
1096 #[inline(always)]
1097 fn reinterpret_u8_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
1098 u8x16 {
1099 val: bytemuck::cast(a.val),
1100 simd: a.simd,
1101 }
1102 }
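    // Note: `_mm_cvtepi32_ps` reads the lanes as signed, so inputs of 2^31 or
    // more convert to negative floats at this level.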
1103 #[inline(always)]
1104 fn cvt_f32_u32x4(self, a: u32x4<Self>) -> f32x4<Self> {
1105 unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) }
1106 }
1107 #[inline(always)]
1108 fn splat_mask32x4(self, val: i32) -> mask32x4<Self> {
1109 unsafe { _mm_set1_epi32(val).simd_into(self) }
1110 }
1111 #[inline(always)]
1112 fn not_mask32x4(self, a: mask32x4<Self>) -> mask32x4<Self> {
1113 a ^ !0
1114 }
1115 #[inline(always)]
1116 fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1117 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1118 }
1119 #[inline(always)]
1120 fn or_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1121 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1122 }
1123 #[inline(always)]
1124 fn xor_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1125 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1126 }
1127 #[inline(always)]
1128 fn select_mask32x4(
1129 self,
1130 a: mask32x4<Self>,
1131 b: mask32x4<Self>,
1132 c: mask32x4<Self>,
1133 ) -> mask32x4<Self> {
1134 unsafe {
1135 _mm_or_si128(
1136 _mm_and_si128(a.into(), b.into()),
1137 _mm_andnot_si128(a.into(), c.into()),
1138 )
1139 .simd_into(self)
1140 }
1141 }
1142 #[inline(always)]
1143 fn simd_eq_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
1144 unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
1145 }
1146 #[inline(always)]
1147 fn combine_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x8<Self> {
1148 let mut result = [0; 8usize];
1149 result[0..4usize].copy_from_slice(&a.val);
1150 result[4usize..8usize].copy_from_slice(&b.val);
1151 result.simd_into(self)
1152 }
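    // The f64x2 operations below mirror the f32x4 implementations, using the
    // corresponding `_pd` intrinsics.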
1153 #[inline(always)]
1154 fn splat_f64x2(self, val: f64) -> f64x2<Self> {
1155 unsafe { _mm_set1_pd(val).simd_into(self) }
1156 }
1157 #[inline(always)]
1158 fn abs_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1159 unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) }
1160 }
1161 #[inline(always)]
1162 fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1163 unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) }
1164 }
1165 #[inline(always)]
1166 fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1167 unsafe { _mm_sqrt_pd(a.into()).simd_into(self) }
1168 }
1169 #[inline(always)]
1170 fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1171 unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) }
1172 }
1173 #[inline(always)]
1174 fn sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1175 unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) }
1176 }
1177 #[inline(always)]
1178 fn mul_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1179 unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) }
1180 }
1181 #[inline(always)]
1182 fn div_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1183 unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) }
1184 }
1185 #[inline(always)]
1186 fn copysign_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1187 unsafe {
1188 let mask = _mm_set1_pd(-0.0);
1189 _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self)
1190 }
1191 }
1192 #[inline(always)]
1193 fn simd_eq_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1194 unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) }
1195 }
1196 #[inline(always)]
1197 fn simd_lt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1198 unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) }
1199 }
1200 #[inline(always)]
1201 fn simd_le_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1202 unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) }
1203 }
1204 #[inline(always)]
1205 fn simd_ge_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1206 unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) }
1207 }
1208 #[inline(always)]
1209 fn simd_gt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
1210 unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) }
1211 }
1212 #[inline(always)]
1213 fn zip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1214 unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) }
1215 }
1216 #[inline(always)]
1217 fn zip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1218 unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) }
1219 }
1220 #[inline(always)]
1221 fn unzip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1222 unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) }
1223 }
1224 #[inline(always)]
1225 fn unzip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1226 unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) }
1227 }
1228 #[inline(always)]
1229 fn max_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1230 unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) }
1231 }
1232 #[inline(always)]
1233 fn max_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1234 unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) }
1235 }
1236 #[inline(always)]
1237 fn min_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1238 unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) }
1239 }
1240 #[inline(always)]
1241 fn min_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
1242 unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) }
1243 }
1244 #[inline(always)]
1245 fn madd_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
1246 a + b * c
1247 }
1248 #[inline(always)]
1249 fn msub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
1250 a - b * c
1251 }
1252 #[inline(always)]
1253 fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1254 unsafe { _mm_floor_pd(a.into()).simd_into(self) }
1255 }
1256 #[inline(always)]
1257 fn fract_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1258 a - a.trunc()
1259 }
1260 #[inline(always)]
1261 fn trunc_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
1262 unsafe { _mm_round_pd(a.into(), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC).simd_into(self) }
1263 }
1264 #[inline(always)]
1265 fn select_f64x2(self, a: mask64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
1266 unsafe {
1267 let mask = _mm_castsi128_pd(a.into());
1268 _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, c.into())).simd_into(self)
1269 }
1270 }
1271 #[inline(always)]
1272 fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self> {
1273 let mut result = [0.0; 4usize];
1274 result[0..2usize].copy_from_slice(&a.val);
1275 result[2usize..4usize].copy_from_slice(&b.val);
1276 result.simd_into(self)
1277 }
1278 #[inline(always)]
1279 fn reinterpret_f32_f64x2(self, a: f64x2<Self>) -> f32x4<Self> {
1280 f32x4 {
1281 val: bytemuck::cast(a.val),
1282 simd: a.simd,
1283 }
1284 }
1285 #[inline(always)]
1286 fn splat_mask64x2(self, val: i64) -> mask64x2<Self> {
1287 unsafe { _mm_set1_epi64x(val).simd_into(self) }
1288 }
1289 #[inline(always)]
1290 fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
1291 a ^ !0
1292 }
1293 #[inline(always)]
1294 fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1295 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1296 }
1297 #[inline(always)]
1298 fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1299 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1300 }
1301 #[inline(always)]
1302 fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1303 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1304 }
1305 #[inline(always)]
1306 fn select_mask64x2(
1307 self,
1308 a: mask64x2<Self>,
1309 b: mask64x2<Self>,
1310 c: mask64x2<Self>,
1311 ) -> mask64x2<Self> {
1312 unsafe {
1313 _mm_or_si128(
1314 _mm_and_si128(a.into(), b.into()),
1315 _mm_andnot_si128(a.into(), c.into()),
1316 )
1317 .simd_into(self)
1318 }
1319 }
1320 #[inline(always)]
1321 fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
1322 unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) }
1323 }
1324 #[inline(always)]
1325 fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
1326 let mut result = [0; 4usize];
1327 result[0..2usize].copy_from_slice(&a.val);
1328 result[2usize..4usize].copy_from_slice(&b.val);
1329 result.simd_into(self)
1330 }
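    // From here on, the 256-bit and wider vector types are emulated on SSE4.2
    // by splitting each value into 128-bit halves, applying the corresponding
    // 128-bit operation, and recombining the results.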
1331 #[inline(always)]
1332 fn splat_f32x8(self, a: f32) -> f32x8<Self> {
1333 let half = self.splat_f32x4(a);
1334 self.combine_f32x4(half, half)
1335 }
1336 #[inline(always)]
1337 fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1338 let (a0, a1) = self.split_f32x8(a);
1339 self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1))
1340 }
1341 #[inline(always)]
1342 fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1343 let (a0, a1) = self.split_f32x8(a);
1344 self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1))
1345 }
1346 #[inline(always)]
1347 fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1348 let (a0, a1) = self.split_f32x8(a);
1349 self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
1350 }
1351 #[inline(always)]
1352 fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1353 let (a0, a1) = self.split_f32x8(a);
1354 let (b0, b1) = self.split_f32x8(b);
1355 self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1))
1356 }
1357 #[inline(always)]
1358 fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1359 let (a0, a1) = self.split_f32x8(a);
1360 let (b0, b1) = self.split_f32x8(b);
1361 self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1))
1362 }
1363 #[inline(always)]
1364 fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1365 let (a0, a1) = self.split_f32x8(a);
1366 let (b0, b1) = self.split_f32x8(b);
1367 self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1))
1368 }
1369 #[inline(always)]
1370 fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1371 let (a0, a1) = self.split_f32x8(a);
1372 let (b0, b1) = self.split_f32x8(b);
1373 self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1))
1374 }
1375 #[inline(always)]
1376 fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1377 let (a0, a1) = self.split_f32x8(a);
1378 let (b0, b1) = self.split_f32x8(b);
1379 self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1))
1380 }
1381 #[inline(always)]
1382 fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1383 let (a0, a1) = self.split_f32x8(a);
1384 let (b0, b1) = self.split_f32x8(b);
1385 self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1))
1386 }
1387 #[inline(always)]
1388 fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1389 let (a0, a1) = self.split_f32x8(a);
1390 let (b0, b1) = self.split_f32x8(b);
1391 self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1))
1392 }
1393 #[inline(always)]
1394 fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1395 let (a0, a1) = self.split_f32x8(a);
1396 let (b0, b1) = self.split_f32x8(b);
1397 self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1))
1398 }
1399 #[inline(always)]
1400 fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1401 let (a0, a1) = self.split_f32x8(a);
1402 let (b0, b1) = self.split_f32x8(b);
1403 self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1))
1404 }
1405 #[inline(always)]
1406 fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
1407 let (a0, a1) = self.split_f32x8(a);
1408 let (b0, b1) = self.split_f32x8(b);
1409 self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1))
1410 }
1411 #[inline(always)]
1412 fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1413 let (a0, _) = self.split_f32x8(a);
1414 let (b0, _) = self.split_f32x8(b);
1415 self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0))
1416 }
1417 #[inline(always)]
1418 fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1419 let (_, a1) = self.split_f32x8(a);
1420 let (_, b1) = self.split_f32x8(b);
1421 self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1))
1422 }
1423 #[inline(always)]
1424 fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1425 let (a0, a1) = self.split_f32x8(a);
1426 let (b0, b1) = self.split_f32x8(b);
1427 self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1))
1428 }
1429 #[inline(always)]
1430 fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1431 let (a0, a1) = self.split_f32x8(a);
1432 let (b0, b1) = self.split_f32x8(b);
1433 self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1))
1434 }
1435 #[inline(always)]
1436 fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1437 let (a0, a1) = self.split_f32x8(a);
1438 let (b0, b1) = self.split_f32x8(b);
1439 self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1))
1440 }
1441 #[inline(always)]
1442 fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1443 let (a0, a1) = self.split_f32x8(a);
1444 let (b0, b1) = self.split_f32x8(b);
1445 self.combine_f32x4(
1446 self.max_precise_f32x4(a0, b0),
1447 self.max_precise_f32x4(a1, b1),
1448 )
1449 }
1450 #[inline(always)]
1451 fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1452 let (a0, a1) = self.split_f32x8(a);
1453 let (b0, b1) = self.split_f32x8(b);
1454 self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1))
1455 }
1456 #[inline(always)]
1457 fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
1458 let (a0, a1) = self.split_f32x8(a);
1459 let (b0, b1) = self.split_f32x8(b);
1460 self.combine_f32x4(
1461 self.min_precise_f32x4(a0, b0),
1462 self.min_precise_f32x4(a1, b1),
1463 )
1464 }
1465 #[inline(always)]
1466 fn madd_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
1467 let (a0, a1) = self.split_f32x8(a);
1468 let (b0, b1) = self.split_f32x8(b);
1469 let (c0, c1) = self.split_f32x8(c);
1470 self.combine_f32x4(self.madd_f32x4(a0, b0, c0), self.madd_f32x4(a1, b1, c1))
1471 }
1472 #[inline(always)]
1473 fn msub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
1474 let (a0, a1) = self.split_f32x8(a);
1475 let (b0, b1) = self.split_f32x8(b);
1476 let (c0, c1) = self.split_f32x8(c);
1477 self.combine_f32x4(self.msub_f32x4(a0, b0, c0), self.msub_f32x4(a1, b1, c1))
1478 }
1479 #[inline(always)]
1480 fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1481 let (a0, a1) = self.split_f32x8(a);
1482 self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
1483 }
1484 #[inline(always)]
1485 fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1486 let (a0, a1) = self.split_f32x8(a);
1487 self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1))
1488 }
1489 #[inline(always)]
1490 fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
1491 let (a0, a1) = self.split_f32x8(a);
1492 self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1))
1493 }
1494 #[inline(always)]
1495 fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
1496 let (a0, a1) = self.split_mask32x8(a);
1497 let (b0, b1) = self.split_f32x8(b);
1498 let (c0, c1) = self.split_f32x8(c);
1499 self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1))
1500 }
1501 #[inline(always)]
1502 fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
1503 let mut result = [0.0; 16usize];
1504 result[0..8usize].copy_from_slice(&a.val);
1505 result[8usize..16usize].copy_from_slice(&b.val);
1506 result.simd_into(self)
1507 }
1508 #[inline(always)]
1509 fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
1510 let mut b0 = [0.0; 4usize];
1511 let mut b1 = [0.0; 4usize];
1512 b0.copy_from_slice(&a.val[0..4usize]);
1513 b1.copy_from_slice(&a.val[4usize..8usize]);
1514 (b0.simd_into(self), b1.simd_into(self))
1515 }
1516 #[inline(always)]
1517 fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
1518 let (a0, a1) = self.split_f32x8(a);
1519 self.combine_f64x2(
1520 self.reinterpret_f64_f32x4(a0),
1521 self.reinterpret_f64_f32x4(a1),
1522 )
1523 }
1524 #[inline(always)]
1525 fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
1526 let (a0, a1) = self.split_f32x8(a);
1527 self.combine_i32x4(
1528 self.reinterpret_i32_f32x4(a0),
1529 self.reinterpret_i32_f32x4(a1),
1530 )
1531 }
1532 #[inline(always)]
1533 fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
1534 let (a0, a1) = self.split_f32x8(a);
1535 self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1))
1536 }
1537 #[inline(always)]
1538 fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
1539 let (a0, a1) = self.split_f32x8(a);
1540 self.combine_u32x4(
1541 self.reinterpret_u32_f32x4(a0),
1542 self.reinterpret_u32_f32x4(a1),
1543 )
1544 }
1545 #[inline(always)]
1546 fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
1547 let (a0, a1) = self.split_f32x8(a);
1548 self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1))
1549 }
1550 #[inline(always)]
1551 fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
1552 let (a0, a1) = self.split_f32x8(a);
1553 self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1))
1554 }
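// From here on, the 256-bit integer and mask types follow the same recipe:
// `splat` builds one 128-bit half and duplicates it, and every lane-wise op
// splits its operands, applies the 128-bit implementation to each half, and
// recombines. Only the few methods that mention intrinsics directly
// (e.g. `narrow_u16x16` further down) do anything SSE-specific at this width.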
1555 #[inline(always)]
1556 fn splat_i8x32(self, a: i8) -> i8x32<Self> {
1557 let half = self.splat_i8x16(a);
1558 self.combine_i8x16(half, half)
1559 }
1560 #[inline(always)]
1561 fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
1562 let (a0, a1) = self.split_i8x32(a);
1563 self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1))
1564 }
1565 #[inline(always)]
1566 fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1567 let (a0, a1) = self.split_i8x32(a);
1568 let (b0, b1) = self.split_i8x32(b);
1569 self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1))
1570 }
1571 #[inline(always)]
1572 fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1573 let (a0, a1) = self.split_i8x32(a);
1574 let (b0, b1) = self.split_i8x32(b);
1575 self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1))
1576 }
1577 #[inline(always)]
1578 fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1579 let (a0, a1) = self.split_i8x32(a);
1580 let (b0, b1) = self.split_i8x32(b);
1581 self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1))
1582 }
1583 #[inline(always)]
1584 fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1585 let (a0, a1) = self.split_i8x32(a);
1586 let (b0, b1) = self.split_i8x32(b);
1587 self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1))
1588 }
1589 #[inline(always)]
1590 fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1591 let (a0, a1) = self.split_i8x32(a);
1592 let (b0, b1) = self.split_i8x32(b);
1593 self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1))
1594 }
1595 #[inline(always)]
1596 fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1597 let (a0, a1) = self.split_i8x32(a);
1598 let (b0, b1) = self.split_i8x32(b);
1599 self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1))
1600 }
1601 #[inline(always)]
1602 fn shr_i8x32(self, a: i8x32<Self>, b: u32) -> i8x32<Self> {
1603 let (a0, a1) = self.split_i8x32(a);
1604 self.combine_i8x16(self.shr_i8x16(a0, b), self.shr_i8x16(a1, b))
1605 }
1606 #[inline(always)]
1607 fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1608 let (a0, a1) = self.split_i8x32(a);
1609 let (b0, b1) = self.split_i8x32(b);
1610 self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1))
1611 }
1612 #[inline(always)]
1613 fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1614 let (a0, a1) = self.split_i8x32(a);
1615 let (b0, b1) = self.split_i8x32(b);
1616 self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1))
1617 }
1618 #[inline(always)]
1619 fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1620 let (a0, a1) = self.split_i8x32(a);
1621 let (b0, b1) = self.split_i8x32(b);
1622 self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1))
1623 }
1624 #[inline(always)]
1625 fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1626 let (a0, a1) = self.split_i8x32(a);
1627 let (b0, b1) = self.split_i8x32(b);
1628 self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1))
1629 }
1630 #[inline(always)]
1631 fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
1632 let (a0, a1) = self.split_i8x32(a);
1633 let (b0, b1) = self.split_i8x32(b);
1634 self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1))
1635 }
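// zip_low interleaves the low halves of `a` and `b` lane by lane, so for the
// 32-lane type it is built entirely from the low 128-bit halves: the first 16
// output lanes are zip_low of (a0, b0) and the next 16 are zip_high of the
// same pair. zip_high mirrors this using the upper halves.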
1636 #[inline(always)]
1637 fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1638 let (a0, _) = self.split_i8x32(a);
1639 let (b0, _) = self.split_i8x32(b);
1640 self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0))
1641 }
1642 #[inline(always)]
1643 fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1644 let (_, a1) = self.split_i8x32(a);
1645 let (_, b1) = self.split_i8x32(b);
1646 self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1))
1647 }
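// unzip_low gathers the even-indexed lanes of the concatenated pair (a, b) and
// unzip_high the odd-indexed ones, which is why each half-result is an unzip
// over (a0, a1) or (b0, b1) rather than over matching halves of a and b.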
1648 #[inline(always)]
1649 fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1650 let (a0, a1) = self.split_i8x32(a);
1651 let (b0, b1) = self.split_i8x32(b);
1652 self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1))
1653 }
1654 #[inline(always)]
1655 fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1656 let (a0, a1) = self.split_i8x32(a);
1657 let (b0, b1) = self.split_i8x32(b);
1658 self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1))
1659 }
1660 #[inline(always)]
1661 fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
1662 let (a0, a1) = self.split_mask8x32(a);
1663 let (b0, b1) = self.split_i8x32(b);
1664 let (c0, c1) = self.split_i8x32(c);
1665 self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1))
1666 }
1667 #[inline(always)]
1668 fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1669 let (a0, a1) = self.split_i8x32(a);
1670 let (b0, b1) = self.split_i8x32(b);
1671 self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1))
1672 }
1673 #[inline(always)]
1674 fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1675 let (a0, a1) = self.split_i8x32(a);
1676 let (b0, b1) = self.split_i8x32(b);
1677 self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1))
1678 }
1679 #[inline(always)]
1680 fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
1681 let mut result = [0; 64usize];
1682 result[0..32usize].copy_from_slice(&a.val);
1683 result[32usize..64usize].copy_from_slice(&b.val);
1684 result.simd_into(self)
1685 }
1686 #[inline(always)]
1687 fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
1688 let mut b0 = [0; 16usize];
1689 let mut b1 = [0; 16usize];
1690 b0.copy_from_slice(&a.val[0..16usize]);
1691 b1.copy_from_slice(&a.val[16usize..32usize]);
1692 (b0.simd_into(self), b1.simd_into(self))
1693 }
1694 #[inline(always)]
1695 fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
1696 let (a0, a1) = self.split_i8x32(a);
1697 self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1))
1698 }
1699 #[inline(always)]
1700 fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
1701 let (a0, a1) = self.split_i8x32(a);
1702 self.combine_u32x4(
1703 self.reinterpret_u32_i8x16(a0),
1704 self.reinterpret_u32_i8x16(a1),
1705 )
1706 }
1707 #[inline(always)]
1708 fn splat_u8x32(self, a: u8) -> u8x32<Self> {
1709 let half = self.splat_u8x16(a);
1710 self.combine_u8x16(half, half)
1711 }
1712 #[inline(always)]
1713 fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
1714 let (a0, a1) = self.split_u8x32(a);
1715 self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1))
1716 }
1717 #[inline(always)]
1718 fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1719 let (a0, a1) = self.split_u8x32(a);
1720 let (b0, b1) = self.split_u8x32(b);
1721 self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1))
1722 }
1723 #[inline(always)]
1724 fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1725 let (a0, a1) = self.split_u8x32(a);
1726 let (b0, b1) = self.split_u8x32(b);
1727 self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1))
1728 }
1729 #[inline(always)]
1730 fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1731 let (a0, a1) = self.split_u8x32(a);
1732 let (b0, b1) = self.split_u8x32(b);
1733 self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1))
1734 }
1735 #[inline(always)]
1736 fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1737 let (a0, a1) = self.split_u8x32(a);
1738 let (b0, b1) = self.split_u8x32(b);
1739 self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1))
1740 }
1741 #[inline(always)]
1742 fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1743 let (a0, a1) = self.split_u8x32(a);
1744 let (b0, b1) = self.split_u8x32(b);
1745 self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1))
1746 }
1747 #[inline(always)]
1748 fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1749 let (a0, a1) = self.split_u8x32(a);
1750 let (b0, b1) = self.split_u8x32(b);
1751 self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1))
1752 }
1753 #[inline(always)]
1754 fn shr_u8x32(self, a: u8x32<Self>, b: u32) -> u8x32<Self> {
1755 let (a0, a1) = self.split_u8x32(a);
1756 self.combine_u8x16(self.shr_u8x16(a0, b), self.shr_u8x16(a1, b))
1757 }
1758 #[inline(always)]
1759 fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1760 let (a0, a1) = self.split_u8x32(a);
1761 let (b0, b1) = self.split_u8x32(b);
1762 self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1))
1763 }
1764 #[inline(always)]
1765 fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1766 let (a0, a1) = self.split_u8x32(a);
1767 let (b0, b1) = self.split_u8x32(b);
1768 self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1))
1769 }
1770 #[inline(always)]
1771 fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1772 let (a0, a1) = self.split_u8x32(a);
1773 let (b0, b1) = self.split_u8x32(b);
1774 self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1))
1775 }
1776 #[inline(always)]
1777 fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1778 let (a0, a1) = self.split_u8x32(a);
1779 let (b0, b1) = self.split_u8x32(b);
1780 self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1))
1781 }
1782 #[inline(always)]
1783 fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
1784 let (a0, a1) = self.split_u8x32(a);
1785 let (b0, b1) = self.split_u8x32(b);
1786 self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1))
1787 }
1788 #[inline(always)]
1789 fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1790 let (a0, _) = self.split_u8x32(a);
1791 let (b0, _) = self.split_u8x32(b);
1792 self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0))
1793 }
1794 #[inline(always)]
1795 fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1796 let (_, a1) = self.split_u8x32(a);
1797 let (_, b1) = self.split_u8x32(b);
1798 self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1))
1799 }
1800 #[inline(always)]
1801 fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1802 let (a0, a1) = self.split_u8x32(a);
1803 let (b0, b1) = self.split_u8x32(b);
1804 self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1))
1805 }
1806 #[inline(always)]
1807 fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1808 let (a0, a1) = self.split_u8x32(a);
1809 let (b0, b1) = self.split_u8x32(b);
1810 self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1))
1811 }
1812 #[inline(always)]
1813 fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
1814 let (a0, a1) = self.split_mask8x32(a);
1815 let (b0, b1) = self.split_u8x32(b);
1816 let (c0, c1) = self.split_u8x32(c);
1817 self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1))
1818 }
1819 #[inline(always)]
1820 fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1821 let (a0, a1) = self.split_u8x32(a);
1822 let (b0, b1) = self.split_u8x32(b);
1823 self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1))
1824 }
1825 #[inline(always)]
1826 fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1827 let (a0, a1) = self.split_u8x32(a);
1828 let (b0, b1) = self.split_u8x32(b);
1829 self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1))
1830 }
1831 #[inline(always)]
1832 fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
1833 let mut result = [0; 64usize];
1834 result[0..32usize].copy_from_slice(&a.val);
1835 result[32usize..64usize].copy_from_slice(&b.val);
1836 result.simd_into(self)
1837 }
1838 #[inline(always)]
1839 fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
1840 let mut b0 = [0; 16usize];
1841 let mut b1 = [0; 16usize];
1842 b0.copy_from_slice(&a.val[0..16usize]);
1843 b1.copy_from_slice(&a.val[16usize..32usize]);
1844 (b0.simd_into(self), b1.simd_into(self))
1845 }
1846 #[inline(always)]
1847 fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
1848 let (a0, a1) = self.split_u8x32(a);
1849 self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1))
1850 }
1851 #[inline(always)]
1852 fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
1853 let (a0, a1) = self.split_u8x32(a);
1854 self.combine_u32x4(
1855 self.reinterpret_u32_u8x16(a0),
1856 self.reinterpret_u32_u8x16(a1),
1857 )
1858 }
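// Mask vectors are carried as plain integer lanes (conventionally all-ones for
// "true", zero for "false"), so the wide mask types reuse exactly the same
// split/combine plumbing as the integer vectors.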
1859 #[inline(always)]
1860 fn splat_mask8x32(self, a: i8) -> mask8x32<Self> {
1861 let half = self.splat_mask8x16(a);
1862 self.combine_mask8x16(half, half)
1863 }
1864 #[inline(always)]
1865 fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
1866 let (a0, a1) = self.split_mask8x32(a);
1867 self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1))
1868 }
1869 #[inline(always)]
1870 fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
1871 let (a0, a1) = self.split_mask8x32(a);
1872 let (b0, b1) = self.split_mask8x32(b);
1873 self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1))
1874 }
1875 #[inline(always)]
1876 fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
1877 let (a0, a1) = self.split_mask8x32(a);
1878 let (b0, b1) = self.split_mask8x32(b);
1879 self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1))
1880 }
1881 #[inline(always)]
1882 fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
1883 let (a0, a1) = self.split_mask8x32(a);
1884 let (b0, b1) = self.split_mask8x32(b);
1885 self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1))
1886 }
1887 #[inline(always)]
1888 fn select_mask8x32(
1889 self,
1890 a: mask8x32<Self>,
1891 b: mask8x32<Self>,
1892 c: mask8x32<Self>,
1893 ) -> mask8x32<Self> {
1894 let (a0, a1) = self.split_mask8x32(a);
1895 let (b0, b1) = self.split_mask8x32(b);
1896 let (c0, c1) = self.split_mask8x32(c);
1897 self.combine_mask8x16(
1898 self.select_mask8x16(a0, b0, c0),
1899 self.select_mask8x16(a1, b1, c1),
1900 )
1901 }
1902 #[inline(always)]
1903 fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
1904 let (a0, a1) = self.split_mask8x32(a);
1905 let (b0, b1) = self.split_mask8x32(b);
1906 self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1))
1907 }
1908 #[inline(always)]
1909 fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
1910 let mut result = [0; 64usize];
1911 result[0..32usize].copy_from_slice(&a.val);
1912 result[32usize..64usize].copy_from_slice(&b.val);
1913 result.simd_into(self)
1914 }
1915 #[inline(always)]
1916 fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
1917 let mut b0 = [0; 16usize];
1918 let mut b1 = [0; 16usize];
1919 b0.copy_from_slice(&a.val[0..16usize]);
1920 b1.copy_from_slice(&a.val[16usize..32usize]);
1921 (b0.simd_into(self), b1.simd_into(self))
1922 }
1923 #[inline(always)]
1924 fn splat_i16x16(self, a: i16) -> i16x16<Self> {
1925 let half = self.splat_i16x8(a);
1926 self.combine_i16x8(half, half)
1927 }
1928 #[inline(always)]
1929 fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
1930 let (a0, a1) = self.split_i16x16(a);
1931 self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1))
1932 }
1933 #[inline(always)]
1934 fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
1935 let (a0, a1) = self.split_i16x16(a);
1936 let (b0, b1) = self.split_i16x16(b);
1937 self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1))
1938 }
1939 #[inline(always)]
1940 fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
1941 let (a0, a1) = self.split_i16x16(a);
1942 let (b0, b1) = self.split_i16x16(b);
1943 self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1))
1944 }
1945 #[inline(always)]
1946 fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
1947 let (a0, a1) = self.split_i16x16(a);
1948 let (b0, b1) = self.split_i16x16(b);
1949 self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1))
1950 }
1951 #[inline(always)]
1952 fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
1953 let (a0, a1) = self.split_i16x16(a);
1954 let (b0, b1) = self.split_i16x16(b);
1955 self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1))
1956 }
1957 #[inline(always)]
1958 fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
1959 let (a0, a1) = self.split_i16x16(a);
1960 let (b0, b1) = self.split_i16x16(b);
1961 self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1))
1962 }
1963 #[inline(always)]
1964 fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
1965 let (a0, a1) = self.split_i16x16(a);
1966 let (b0, b1) = self.split_i16x16(b);
1967 self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1))
1968 }
1969 #[inline(always)]
1970 fn shr_i16x16(self, a: i16x16<Self>, b: u32) -> i16x16<Self> {
1971 let (a0, a1) = self.split_i16x16(a);
1972 self.combine_i16x8(self.shr_i16x8(a0, b), self.shr_i16x8(a1, b))
1973 }
1974 #[inline(always)]
1975 fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
1976 let (a0, a1) = self.split_i16x16(a);
1977 let (b0, b1) = self.split_i16x16(b);
1978 self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1))
1979 }
1980 #[inline(always)]
1981 fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
1982 let (a0, a1) = self.split_i16x16(a);
1983 let (b0, b1) = self.split_i16x16(b);
1984 self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1))
1985 }
1986 #[inline(always)]
1987 fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
1988 let (a0, a1) = self.split_i16x16(a);
1989 let (b0, b1) = self.split_i16x16(b);
1990 self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1))
1991 }
1992 #[inline(always)]
1993 fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
1994 let (a0, a1) = self.split_i16x16(a);
1995 let (b0, b1) = self.split_i16x16(b);
1996 self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1))
1997 }
1998 #[inline(always)]
1999 fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
2000 let (a0, a1) = self.split_i16x16(a);
2001 let (b0, b1) = self.split_i16x16(b);
2002 self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1))
2003 }
2004 #[inline(always)]
2005 fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2006 let (a0, _) = self.split_i16x16(a);
2007 let (b0, _) = self.split_i16x16(b);
2008 self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0))
2009 }
2010 #[inline(always)]
2011 fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2012 let (_, a1) = self.split_i16x16(a);
2013 let (_, b1) = self.split_i16x16(b);
2014 self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1))
2015 }
2016 #[inline(always)]
2017 fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2018 let (a0, a1) = self.split_i16x16(a);
2019 let (b0, b1) = self.split_i16x16(b);
2020 self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1))
2021 }
2022 #[inline(always)]
2023 fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2024 let (a0, a1) = self.split_i16x16(a);
2025 let (b0, b1) = self.split_i16x16(b);
2026 self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1))
2027 }
2028 #[inline(always)]
2029 fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
2030 let (a0, a1) = self.split_mask16x16(a);
2031 let (b0, b1) = self.split_i16x16(b);
2032 let (c0, c1) = self.split_i16x16(c);
2033 self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1))
2034 }
2035 #[inline(always)]
2036 fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2037 let (a0, a1) = self.split_i16x16(a);
2038 let (b0, b1) = self.split_i16x16(b);
2039 self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1))
2040 }
2041 #[inline(always)]
2042 fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
2043 let (a0, a1) = self.split_i16x16(a);
2044 let (b0, b1) = self.split_i16x16(b);
2045 self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1))
2046 }
2047 #[inline(always)]
2048 fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
2049 let mut result = [0; 32usize];
2050 result[0..16usize].copy_from_slice(&a.val);
2051 result[16usize..32usize].copy_from_slice(&b.val);
2052 result.simd_into(self)
2053 }
2054 #[inline(always)]
2055 fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
2056 let mut b0 = [0; 8usize];
2057 let mut b1 = [0; 8usize];
2058 b0.copy_from_slice(&a.val[0..8usize]);
2059 b1.copy_from_slice(&a.val[8usize..16usize]);
2060 (b0.simd_into(self), b1.simd_into(self))
2061 }
2062 #[inline(always)]
2063 fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
2064 let (a0, a1) = self.split_i16x16(a);
2065 self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1))
2066 }
2067 #[inline(always)]
2068 fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
2069 let (a0, a1) = self.split_i16x16(a);
2070 self.combine_u32x4(
2071 self.reinterpret_u32_i16x8(a0),
2072 self.reinterpret_u32_i16x8(a1),
2073 )
2074 }
2075 #[inline(always)]
2076 fn splat_u16x16(self, a: u16) -> u16x16<Self> {
2077 let half = self.splat_u16x8(a);
2078 self.combine_u16x8(half, half)
2079 }
2080 #[inline(always)]
2081 fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
2082 let (a0, a1) = self.split_u16x16(a);
2083 self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1))
2084 }
2085 #[inline(always)]
2086 fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2087 let (a0, a1) = self.split_u16x16(a);
2088 let (b0, b1) = self.split_u16x16(b);
2089 self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1))
2090 }
2091 #[inline(always)]
2092 fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2093 let (a0, a1) = self.split_u16x16(a);
2094 let (b0, b1) = self.split_u16x16(b);
2095 self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1))
2096 }
2097 #[inline(always)]
2098 fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2099 let (a0, a1) = self.split_u16x16(a);
2100 let (b0, b1) = self.split_u16x16(b);
2101 self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1))
2102 }
2103 #[inline(always)]
2104 fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2105 let (a0, a1) = self.split_u16x16(a);
2106 let (b0, b1) = self.split_u16x16(b);
2107 self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1))
2108 }
2109 #[inline(always)]
2110 fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2111 let (a0, a1) = self.split_u16x16(a);
2112 let (b0, b1) = self.split_u16x16(b);
2113 self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1))
2114 }
2115 #[inline(always)]
2116 fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2117 let (a0, a1) = self.split_u16x16(a);
2118 let (b0, b1) = self.split_u16x16(b);
2119 self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1))
2120 }
2121 #[inline(always)]
2122 fn shr_u16x16(self, a: u16x16<Self>, b: u32) -> u16x16<Self> {
2123 let (a0, a1) = self.split_u16x16(a);
2124 self.combine_u16x8(self.shr_u16x8(a0, b), self.shr_u16x8(a1, b))
2125 }
2126 #[inline(always)]
2127 fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2128 let (a0, a1) = self.split_u16x16(a);
2129 let (b0, b1) = self.split_u16x16(b);
2130 self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1))
2131 }
2132 #[inline(always)]
2133 fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2134 let (a0, a1) = self.split_u16x16(a);
2135 let (b0, b1) = self.split_u16x16(b);
2136 self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1))
2137 }
2138 #[inline(always)]
2139 fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2140 let (a0, a1) = self.split_u16x16(a);
2141 let (b0, b1) = self.split_u16x16(b);
2142 self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1))
2143 }
2144 #[inline(always)]
2145 fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2146 let (a0, a1) = self.split_u16x16(a);
2147 let (b0, b1) = self.split_u16x16(b);
2148 self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1))
2149 }
2150 #[inline(always)]
2151 fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
2152 let (a0, a1) = self.split_u16x16(a);
2153 let (b0, b1) = self.split_u16x16(b);
2154 self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1))
2155 }
2156 #[inline(always)]
2157 fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2158 let (a0, _) = self.split_u16x16(a);
2159 let (b0, _) = self.split_u16x16(b);
2160 self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0))
2161 }
2162 #[inline(always)]
2163 fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2164 let (_, a1) = self.split_u16x16(a);
2165 let (_, b1) = self.split_u16x16(b);
2166 self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1))
2167 }
2168 #[inline(always)]
2169 fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2170 let (a0, a1) = self.split_u16x16(a);
2171 let (b0, b1) = self.split_u16x16(b);
2172 self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1))
2173 }
2174 #[inline(always)]
2175 fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2176 let (a0, a1) = self.split_u16x16(a);
2177 let (b0, b1) = self.split_u16x16(b);
2178 self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1))
2179 }
2180 #[inline(always)]
2181 fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
2182 let (a0, a1) = self.split_mask16x16(a);
2183 let (b0, b1) = self.split_u16x16(b);
2184 let (c0, c1) = self.split_u16x16(c);
2185 self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1))
2186 }
2187 #[inline(always)]
2188 fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2189 let (a0, a1) = self.split_u16x16(a);
2190 let (b0, b1) = self.split_u16x16(b);
2191 self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1))
2192 }
2193 #[inline(always)]
2194 fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
2195 let (a0, a1) = self.split_u16x16(a);
2196 let (b0, b1) = self.split_u16x16(b);
2197 self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1))
2198 }
2199 #[inline(always)]
2200 fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
2201 let mut result = [0; 32usize];
2202 result[0..16usize].copy_from_slice(&a.val);
2203 result[16usize..32usize].copy_from_slice(&b.val);
2204 result.simd_into(self)
2205 }
2206 #[inline(always)]
2207 fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
2208 let mut b0 = [0; 8usize];
2209 let mut b1 = [0; 8usize];
2210 b0.copy_from_slice(&a.val[0..8usize]);
2211 b1.copy_from_slice(&a.val[8usize..16usize]);
2212 (b0.simd_into(self), b1.simd_into(self))
2213 }
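// narrow_u16x16 is one of the few methods here that touches intrinsics
// directly: each 16-bit lane is masked down to its low byte with
// `_mm_and_si128(.., _mm_set1_epi16(0xFF))`, so the subsequent
// `_mm_packus_epi16` (pack with unsigned saturation) never actually saturates
// and the result is a plain truncating u16 -> u8 narrowing.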
2214 #[inline(always)]
2215 fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
2216 let (lo, hi) = self.split_u16x16(a);
2217 unsafe {
2218 let mask = _mm_set1_epi16(0xFF);
2219 let lo_masked = _mm_and_si128(lo.into(), mask);
2220 let hi_masked = _mm_and_si128(hi.into(), mask);
2221 let result = _mm_packus_epi16(lo_masked, hi_masked);
2222 result.simd_into(self)
2223 }
2224 }
2225 #[inline(always)]
2226 fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
2227 let (a0, a1) = self.split_u16x16(a);
2228 self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1))
2229 }
2230 #[inline(always)]
2231 fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
2232 let (a0, a1) = self.split_u16x16(a);
2233 self.combine_u32x4(
2234 self.reinterpret_u32_u16x8(a0),
2235 self.reinterpret_u32_u16x8(a1),
2236 )
2237 }
2238 #[inline(always)]
2239 fn splat_mask16x16(self, a: i16) -> mask16x16<Self> {
2240 let half = self.splat_mask16x8(a);
2241 self.combine_mask16x8(half, half)
2242 }
2243 #[inline(always)]
2244 fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
2245 let (a0, a1) = self.split_mask16x16(a);
2246 self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1))
2247 }
2248 #[inline(always)]
2249 fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2250 let (a0, a1) = self.split_mask16x16(a);
2251 let (b0, b1) = self.split_mask16x16(b);
2252 self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1))
2253 }
2254 #[inline(always)]
2255 fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2256 let (a0, a1) = self.split_mask16x16(a);
2257 let (b0, b1) = self.split_mask16x16(b);
2258 self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1))
2259 }
2260 #[inline(always)]
2261 fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2262 let (a0, a1) = self.split_mask16x16(a);
2263 let (b0, b1) = self.split_mask16x16(b);
2264 self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1))
2265 }
2266 #[inline(always)]
2267 fn select_mask16x16(
2268 self,
2269 a: mask16x16<Self>,
2270 b: mask16x16<Self>,
2271 c: mask16x16<Self>,
2272 ) -> mask16x16<Self> {
2273 let (a0, a1) = self.split_mask16x16(a);
2274 let (b0, b1) = self.split_mask16x16(b);
2275 let (c0, c1) = self.split_mask16x16(c);
2276 self.combine_mask16x8(
2277 self.select_mask16x8(a0, b0, c0),
2278 self.select_mask16x8(a1, b1, c1),
2279 )
2280 }
2281 #[inline(always)]
2282 fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
2283 let (a0, a1) = self.split_mask16x16(a);
2284 let (b0, b1) = self.split_mask16x16(b);
2285 self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1))
2286 }
2287 #[inline(always)]
2288 fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
2289 let mut result = [0; 32usize];
2290 result[0..16usize].copy_from_slice(&a.val);
2291 result[16usize..32usize].copy_from_slice(&b.val);
2292 result.simd_into(self)
2293 }
2294 #[inline(always)]
2295 fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
2296 let mut b0 = [0; 8usize];
2297 let mut b1 = [0; 8usize];
2298 b0.copy_from_slice(&a.val[0..8usize]);
2299 b1.copy_from_slice(&a.val[8usize..16usize]);
2300 (b0.simd_into(self), b1.simd_into(self))
2301 }
2302 #[inline(always)]
2303 fn splat_i32x8(self, a: i32) -> i32x8<Self> {
2304 let half = self.splat_i32x4(a);
2305 self.combine_i32x4(half, half)
2306 }
2307 #[inline(always)]
2308 fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
2309 let (a0, a1) = self.split_i32x8(a);
2310 self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1))
2311 }
2312 #[inline(always)]
2313 fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2314 let (a0, a1) = self.split_i32x8(a);
2315 let (b0, b1) = self.split_i32x8(b);
2316 self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1))
2317 }
2318 #[inline(always)]
2319 fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2320 let (a0, a1) = self.split_i32x8(a);
2321 let (b0, b1) = self.split_i32x8(b);
2322 self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1))
2323 }
2324 #[inline(always)]
2325 fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2326 let (a0, a1) = self.split_i32x8(a);
2327 let (b0, b1) = self.split_i32x8(b);
2328 self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1))
2329 }
2330 #[inline(always)]
2331 fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2332 let (a0, a1) = self.split_i32x8(a);
2333 let (b0, b1) = self.split_i32x8(b);
2334 self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1))
2335 }
2336 #[inline(always)]
2337 fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2338 let (a0, a1) = self.split_i32x8(a);
2339 let (b0, b1) = self.split_i32x8(b);
2340 self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1))
2341 }
2342 #[inline(always)]
2343 fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2344 let (a0, a1) = self.split_i32x8(a);
2345 let (b0, b1) = self.split_i32x8(b);
2346 self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1))
2347 }
2348 #[inline(always)]
2349 fn shr_i32x8(self, a: i32x8<Self>, b: u32) -> i32x8<Self> {
2350 let (a0, a1) = self.split_i32x8(a);
2351 self.combine_i32x4(self.shr_i32x4(a0, b), self.shr_i32x4(a1, b))
2352 }
2353 #[inline(always)]
2354 fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2355 let (a0, a1) = self.split_i32x8(a);
2356 let (b0, b1) = self.split_i32x8(b);
2357 self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1))
2358 }
2359 #[inline(always)]
2360 fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2361 let (a0, a1) = self.split_i32x8(a);
2362 let (b0, b1) = self.split_i32x8(b);
2363 self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1))
2364 }
2365 #[inline(always)]
2366 fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2367 let (a0, a1) = self.split_i32x8(a);
2368 let (b0, b1) = self.split_i32x8(b);
2369 self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1))
2370 }
2371 #[inline(always)]
2372 fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2373 let (a0, a1) = self.split_i32x8(a);
2374 let (b0, b1) = self.split_i32x8(b);
2375 self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1))
2376 }
2377 #[inline(always)]
2378 fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
2379 let (a0, a1) = self.split_i32x8(a);
2380 let (b0, b1) = self.split_i32x8(b);
2381 self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1))
2382 }
2383 #[inline(always)]
2384 fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2385 let (a0, _) = self.split_i32x8(a);
2386 let (b0, _) = self.split_i32x8(b);
2387 self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0))
2388 }
2389 #[inline(always)]
2390 fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2391 let (_, a1) = self.split_i32x8(a);
2392 let (_, b1) = self.split_i32x8(b);
2393 self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1))
2394 }
2395 #[inline(always)]
2396 fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2397 let (a0, a1) = self.split_i32x8(a);
2398 let (b0, b1) = self.split_i32x8(b);
2399 self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1))
2400 }
2401 #[inline(always)]
2402 fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2403 let (a0, a1) = self.split_i32x8(a);
2404 let (b0, b1) = self.split_i32x8(b);
2405 self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1))
2406 }
2407 #[inline(always)]
2408 fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
2409 let (a0, a1) = self.split_mask32x8(a);
2410 let (b0, b1) = self.split_i32x8(b);
2411 let (c0, c1) = self.split_i32x8(c);
2412 self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1))
2413 }
2414 #[inline(always)]
2415 fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2416 let (a0, a1) = self.split_i32x8(a);
2417 let (b0, b1) = self.split_i32x8(b);
2418 self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1))
2419 }
2420 #[inline(always)]
2421 fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
2422 let (a0, a1) = self.split_i32x8(a);
2423 let (b0, b1) = self.split_i32x8(b);
2424 self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1))
2425 }
2426 #[inline(always)]
2427 fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
2428 let mut result = [0; 16usize];
2429 result[0..8usize].copy_from_slice(&a.val);
2430 result[8usize..16usize].copy_from_slice(&b.val);
2431 result.simd_into(self)
2432 }
2433 #[inline(always)]
2434 fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
2435 let mut b0 = [0; 4usize];
2436 let mut b1 = [0; 4usize];
2437 b0.copy_from_slice(&a.val[0..4usize]);
2438 b1.copy_from_slice(&a.val[4usize..8usize]);
2439 (b0.simd_into(self), b1.simd_into(self))
2440 }
2441 #[inline(always)]
2442 fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
2443 let (a0, a1) = self.split_i32x8(a);
2444 self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1))
2445 }
2446 #[inline(always)]
2447 fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
2448 let (a0, a1) = self.split_i32x8(a);
2449 self.combine_u32x4(
2450 self.reinterpret_u32_i32x4(a0),
2451 self.reinterpret_u32_i32x4(a1),
2452 )
2453 }
2454 #[inline(always)]
2455 fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
2456 let (a0, a1) = self.split_i32x8(a);
2457 self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1))
2458 }
2459 #[inline(always)]
2460 fn splat_u32x8(self, a: u32) -> u32x8<Self> {
2461 let half = self.splat_u32x4(a);
2462 self.combine_u32x4(half, half)
2463 }
2464 #[inline(always)]
2465 fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
2466 let (a0, a1) = self.split_u32x8(a);
2467 self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1))
2468 }
2469 #[inline(always)]
2470 fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2471 let (a0, a1) = self.split_u32x8(a);
2472 let (b0, b1) = self.split_u32x8(b);
2473 self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1))
2474 }
2475 #[inline(always)]
2476 fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2477 let (a0, a1) = self.split_u32x8(a);
2478 let (b0, b1) = self.split_u32x8(b);
2479 self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1))
2480 }
2481 #[inline(always)]
2482 fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2483 let (a0, a1) = self.split_u32x8(a);
2484 let (b0, b1) = self.split_u32x8(b);
2485 self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1))
2486 }
2487 #[inline(always)]
2488 fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2489 let (a0, a1) = self.split_u32x8(a);
2490 let (b0, b1) = self.split_u32x8(b);
2491 self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1))
2492 }
2493 #[inline(always)]
2494 fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2495 let (a0, a1) = self.split_u32x8(a);
2496 let (b0, b1) = self.split_u32x8(b);
2497 self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1))
2498 }
2499 #[inline(always)]
2500 fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2501 let (a0, a1) = self.split_u32x8(a);
2502 let (b0, b1) = self.split_u32x8(b);
2503 self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1))
2504 }
2505 #[inline(always)]
2506 fn shr_u32x8(self, a: u32x8<Self>, b: u32) -> u32x8<Self> {
2507 let (a0, a1) = self.split_u32x8(a);
2508 self.combine_u32x4(self.shr_u32x4(a0, b), self.shr_u32x4(a1, b))
2509 }
2510 #[inline(always)]
2511 fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2512 let (a0, a1) = self.split_u32x8(a);
2513 let (b0, b1) = self.split_u32x8(b);
2514 self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1))
2515 }
2516 #[inline(always)]
2517 fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2518 let (a0, a1) = self.split_u32x8(a);
2519 let (b0, b1) = self.split_u32x8(b);
2520 self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1))
2521 }
2522 #[inline(always)]
2523 fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2524 let (a0, a1) = self.split_u32x8(a);
2525 let (b0, b1) = self.split_u32x8(b);
2526 self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1))
2527 }
2528 #[inline(always)]
2529 fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2530 let (a0, a1) = self.split_u32x8(a);
2531 let (b0, b1) = self.split_u32x8(b);
2532 self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1))
2533 }
2534 #[inline(always)]
2535 fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
2536 let (a0, a1) = self.split_u32x8(a);
2537 let (b0, b1) = self.split_u32x8(b);
2538 self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1))
2539 }
2540 #[inline(always)]
2541 fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2542 let (a0, _) = self.split_u32x8(a);
2543 let (b0, _) = self.split_u32x8(b);
2544 self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0))
2545 }
2546 #[inline(always)]
2547 fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2548 let (_, a1) = self.split_u32x8(a);
2549 let (_, b1) = self.split_u32x8(b);
2550 self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1))
2551 }
2552 #[inline(always)]
2553 fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2554 let (a0, a1) = self.split_u32x8(a);
2555 let (b0, b1) = self.split_u32x8(b);
2556 self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1))
2557 }
2558 #[inline(always)]
2559 fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2560 let (a0, a1) = self.split_u32x8(a);
2561 let (b0, b1) = self.split_u32x8(b);
2562 self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1))
2563 }
2564 #[inline(always)]
2565 fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
2566 let (a0, a1) = self.split_mask32x8(a);
2567 let (b0, b1) = self.split_u32x8(b);
2568 let (c0, c1) = self.split_u32x8(c);
2569 self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1))
2570 }
2571 #[inline(always)]
2572 fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2573 let (a0, a1) = self.split_u32x8(a);
2574 let (b0, b1) = self.split_u32x8(b);
2575 self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1))
2576 }
2577 #[inline(always)]
2578 fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
2579 let (a0, a1) = self.split_u32x8(a);
2580 let (b0, b1) = self.split_u32x8(b);
2581 self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1))
2582 }
2583 #[inline(always)]
2584 fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
2585 let mut result = [0; 16usize];
2586 result[0..8usize].copy_from_slice(&a.val);
2587 result[8usize..16usize].copy_from_slice(&b.val);
2588 result.simd_into(self)
2589 }
2590 #[inline(always)]
2591 fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
2592 let mut b0 = [0; 4usize];
2593 let mut b1 = [0; 4usize];
2594 b0.copy_from_slice(&a.val[0..4usize]);
2595 b1.copy_from_slice(&a.val[4usize..8usize]);
2596 (b0.simd_into(self), b1.simd_into(self))
2597 }
2598 #[inline(always)]
2599 fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
2600 let (a0, a1) = self.split_u32x8(a);
2601 self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1))
2602 }
2603 #[inline(always)]
2604 fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
2605 let (a0, a1) = self.split_u32x8(a);
2606 self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1))
2607 }
2608 #[inline(always)]
2609 fn splat_mask32x8(self, a: i32) -> mask32x8<Self> {
2610 let half = self.splat_mask32x4(a);
2611 self.combine_mask32x4(half, half)
2612 }
2613 #[inline(always)]
2614 fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
2615 let (a0, a1) = self.split_mask32x8(a);
2616 self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1))
2617 }
2618 #[inline(always)]
2619 fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2620 let (a0, a1) = self.split_mask32x8(a);
2621 let (b0, b1) = self.split_mask32x8(b);
2622 self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1))
2623 }
2624 #[inline(always)]
2625 fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2626 let (a0, a1) = self.split_mask32x8(a);
2627 let (b0, b1) = self.split_mask32x8(b);
2628 self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1))
2629 }
2630 #[inline(always)]
2631 fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2632 let (a0, a1) = self.split_mask32x8(a);
2633 let (b0, b1) = self.split_mask32x8(b);
2634 self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1))
2635 }
2636 #[inline(always)]
2637 fn select_mask32x8(
2638 self,
2639 a: mask32x8<Self>,
2640 b: mask32x8<Self>,
2641 c: mask32x8<Self>,
2642 ) -> mask32x8<Self> {
2643 let (a0, a1) = self.split_mask32x8(a);
2644 let (b0, b1) = self.split_mask32x8(b);
2645 let (c0, c1) = self.split_mask32x8(c);
2646 self.combine_mask32x4(
2647 self.select_mask32x4(a0, b0, c0),
2648 self.select_mask32x4(a1, b1, c1),
2649 )
2650 }
2651 #[inline(always)]
2652 fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
2653 let (a0, a1) = self.split_mask32x8(a);
2654 let (b0, b1) = self.split_mask32x8(b);
2655 self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1))
2656 }
2657 #[inline(always)]
2658 fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
2659 let mut result = [0; 16usize];
2660 result[0..8usize].copy_from_slice(&a.val);
2661 result[8usize..16usize].copy_from_slice(&b.val);
2662 result.simd_into(self)
2663 }
2664 #[inline(always)]
2665 fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
2666 let mut b0 = [0; 4usize];
2667 let mut b1 = [0; 4usize];
2668 b0.copy_from_slice(&a.val[0..4usize]);
2669 b1.copy_from_slice(&a.val[4usize..8usize]);
2670 (b0.simd_into(self), b1.simd_into(self))
2671 }
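// f64x4 follows the same half-splitting scheme over two f64x2 vectors; the
// comparisons feed their mask64x4 results through combine_mask64x2 in the
// same way.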
2672 #[inline(always)]
2673 fn splat_f64x4(self, a: f64) -> f64x4<Self> {
2674 let half = self.splat_f64x2(a);
2675 self.combine_f64x2(half, half)
2676 }
2677 #[inline(always)]
2678 fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2679 let (a0, a1) = self.split_f64x4(a);
2680 self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1))
2681 }
2682 #[inline(always)]
2683 fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2684 let (a0, a1) = self.split_f64x4(a);
2685 self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1))
2686 }
2687 #[inline(always)]
2688 fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2689 let (a0, a1) = self.split_f64x4(a);
2690 self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
2691 }
2692 #[inline(always)]
2693 fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2694 let (a0, a1) = self.split_f64x4(a);
2695 let (b0, b1) = self.split_f64x4(b);
2696 self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1))
2697 }
2698 #[inline(always)]
2699 fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2700 let (a0, a1) = self.split_f64x4(a);
2701 let (b0, b1) = self.split_f64x4(b);
2702 self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1))
2703 }
2704 #[inline(always)]
2705 fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2706 let (a0, a1) = self.split_f64x4(a);
2707 let (b0, b1) = self.split_f64x4(b);
2708 self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1))
2709 }
2710 #[inline(always)]
2711 fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2712 let (a0, a1) = self.split_f64x4(a);
2713 let (b0, b1) = self.split_f64x4(b);
2714 self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1))
2715 }
2716 #[inline(always)]
2717 fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2718 let (a0, a1) = self.split_f64x4(a);
2719 let (b0, b1) = self.split_f64x4(b);
2720 self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1))
2721 }
2722 #[inline(always)]
2723 fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2724 let (a0, a1) = self.split_f64x4(a);
2725 let (b0, b1) = self.split_f64x4(b);
2726 self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1))
2727 }
2728 #[inline(always)]
2729 fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2730 let (a0, a1) = self.split_f64x4(a);
2731 let (b0, b1) = self.split_f64x4(b);
2732 self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1))
2733 }
2734 #[inline(always)]
2735 fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2736 let (a0, a1) = self.split_f64x4(a);
2737 let (b0, b1) = self.split_f64x4(b);
2738 self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1))
2739 }
2740 #[inline(always)]
2741 fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2742 let (a0, a1) = self.split_f64x4(a);
2743 let (b0, b1) = self.split_f64x4(b);
2744 self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1))
2745 }
2746 #[inline(always)]
2747 fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
2748 let (a0, a1) = self.split_f64x4(a);
2749 let (b0, b1) = self.split_f64x4(b);
2750 self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1))
2751 }
2752 #[inline(always)]
2753 fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2754 let (a0, _) = self.split_f64x4(a);
2755 let (b0, _) = self.split_f64x4(b);
2756 self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0))
2757 }
2758 #[inline(always)]
2759 fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2760 let (_, a1) = self.split_f64x4(a);
2761 let (_, b1) = self.split_f64x4(b);
2762 self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1))
2763 }
2764 #[inline(always)]
2765 fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2766 let (a0, a1) = self.split_f64x4(a);
2767 let (b0, b1) = self.split_f64x4(b);
2768 self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1))
2769 }
2770 #[inline(always)]
2771 fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2772 let (a0, a1) = self.split_f64x4(a);
2773 let (b0, b1) = self.split_f64x4(b);
2774 self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1))
2775 }
2776 #[inline(always)]
2777 fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2778 let (a0, a1) = self.split_f64x4(a);
2779 let (b0, b1) = self.split_f64x4(b);
2780 self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1))
2781 }
2782 #[inline(always)]
2783 fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2784 let (a0, a1) = self.split_f64x4(a);
2785 let (b0, b1) = self.split_f64x4(b);
2786 self.combine_f64x2(
2787 self.max_precise_f64x2(a0, b0),
2788 self.max_precise_f64x2(a1, b1),
2789 )
2790 }
2791 #[inline(always)]
2792 fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2793 let (a0, a1) = self.split_f64x4(a);
2794 let (b0, b1) = self.split_f64x4(b);
2795 self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1))
2796 }
2797 #[inline(always)]
2798 fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
2799 let (a0, a1) = self.split_f64x4(a);
2800 let (b0, b1) = self.split_f64x4(b);
2801 self.combine_f64x2(
2802 self.min_precise_f64x2(a0, b0),
2803 self.min_precise_f64x2(a1, b1),
2804 )
2805 }
2806 #[inline(always)]
2807 fn madd_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
2808 let (a0, a1) = self.split_f64x4(a);
2809 let (b0, b1) = self.split_f64x4(b);
2810 let (c0, c1) = self.split_f64x4(c);
2811 self.combine_f64x2(self.madd_f64x2(a0, b0, c0), self.madd_f64x2(a1, b1, c1))
2812 }
2813 #[inline(always)]
2814 fn msub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
2815 let (a0, a1) = self.split_f64x4(a);
2816 let (b0, b1) = self.split_f64x4(b);
2817 let (c0, c1) = self.split_f64x4(c);
2818 self.combine_f64x2(self.msub_f64x2(a0, b0, c0), self.msub_f64x2(a1, b1, c1))
2819 }
2820 #[inline(always)]
2821 fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2822 let (a0, a1) = self.split_f64x4(a);
2823 self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
2824 }
2825 #[inline(always)]
2826 fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2827 let (a0, a1) = self.split_f64x4(a);
2828 self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1))
2829 }
2830 #[inline(always)]
2831 fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
2832 let (a0, a1) = self.split_f64x4(a);
2833 self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1))
2834 }
2835 #[inline(always)]
2836 fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
2837 let (a0, a1) = self.split_mask64x4(a);
2838 let (b0, b1) = self.split_f64x4(b);
2839 let (c0, c1) = self.split_f64x4(c);
2840 self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1))
2841 }
2842 #[inline(always)]
2843 fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
2844 let mut result = [0.0; 8usize];
2845 result[0..4usize].copy_from_slice(&a.val);
2846 result[4usize..8usize].copy_from_slice(&b.val);
2847 result.simd_into(self)
2848 }
2849 #[inline(always)]
2850 fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
2851 let mut b0 = [0.0; 2usize];
2852 let mut b1 = [0.0; 2usize];
2853 b0.copy_from_slice(&a.val[0..2usize]);
2854 b1.copy_from_slice(&a.val[2usize..4usize]);
2855 (b0.simd_into(self), b1.simd_into(self))
2856 }
2857 #[inline(always)]
2858 fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
2859 let (a0, a1) = self.split_f64x4(a);
2860 self.combine_f32x4(
2861 self.reinterpret_f32_f64x2(a0),
2862 self.reinterpret_f32_f64x2(a1),
2863 )
2864 }
2865 #[inline(always)]
2866 fn splat_mask64x4(self, a: i64) -> mask64x4<Self> {
2867 let half = self.splat_mask64x2(a);
2868 self.combine_mask64x2(half, half)
2869 }
2870 #[inline(always)]
2871 fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
2872 let (a0, a1) = self.split_mask64x4(a);
2873 self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1))
2874 }
2875 #[inline(always)]
2876 fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
2877 let (a0, a1) = self.split_mask64x4(a);
2878 let (b0, b1) = self.split_mask64x4(b);
2879 self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1))
2880 }
2881 #[inline(always)]
2882 fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
2883 let (a0, a1) = self.split_mask64x4(a);
2884 let (b0, b1) = self.split_mask64x4(b);
2885 self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1))
2886 }
2887 #[inline(always)]
2888 fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
2889 let (a0, a1) = self.split_mask64x4(a);
2890 let (b0, b1) = self.split_mask64x4(b);
2891 self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1))
2892 }
2893 #[inline(always)]
2894 fn select_mask64x4(
2895 self,
2896 a: mask64x4<Self>,
2897 b: mask64x4<Self>,
2898 c: mask64x4<Self>,
2899 ) -> mask64x4<Self> {
2900 let (a0, a1) = self.split_mask64x4(a);
2901 let (b0, b1) = self.split_mask64x4(b);
2902 let (c0, c1) = self.split_mask64x4(c);
2903 self.combine_mask64x2(
2904 self.select_mask64x2(a0, b0, c0),
2905 self.select_mask64x2(a1, b1, c1),
2906 )
2907 }
2908 #[inline(always)]
2909 fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
2910 let (a0, a1) = self.split_mask64x4(a);
2911 let (b0, b1) = self.split_mask64x4(b);
2912 self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1))
2913 }
2914 #[inline(always)]
2915 fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
2916 let mut result = [0; 8usize];
2917 result[0..4usize].copy_from_slice(&a.val);
2918 result[4usize..8usize].copy_from_slice(&b.val);
2919 result.simd_into(self)
2920 }
2921 #[inline(always)]
2922 fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
2923 let mut b0 = [0; 2usize];
2924 let mut b1 = [0; 2usize];
2925 b0.copy_from_slice(&a.val[0..2usize]);
2926 b1.copy_from_slice(&a.val[2usize..4usize]);
2927 (b0.simd_into(self), b1.simd_into(self))
2928 }
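// The 16-lane f32 methods below follow the same pattern as the f64x4 and
// mask64x4 methods above: split the operands into halves, apply the narrower
// implementation, and recombine. The remaining 512-bit types in this impl
// (i8x64, u8x64, i16x32, u16x32, i32x16, u32x16 and their masks) repeat the
// same scheme, so only the exceptions are commented below.
//
// A minimal sketch of driving these methods directly, assuming `simd` is an
// `Sse4_2` token obtained elsewhere (the names `a`, `b`, `sum` are only
// illustrative and not part of this generated code):
//
//     let a: f32x16<Sse4_2> = [1.0_f32; 16].simd_into(simd);
//     let b: f32x16<Sse4_2> = [2.0_f32; 16].simd_into(simd);
//     let sum = simd.add_f32x16(a, b); // lanewise add, evaluated as two f32x8 halves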
2929 #[inline(always)]
2930 fn splat_f32x16(self, a: f32) -> f32x16<Self> {
2931 let half = self.splat_f32x8(a);
2932 self.combine_f32x8(half, half)
2933 }
2934 #[inline(always)]
2935 fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
2936 let (a0, a1) = self.split_f32x16(a);
2937 self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
2938 }
2939 #[inline(always)]
2940 fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
2941 let (a0, a1) = self.split_f32x16(a);
2942 self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
2943 }
2944 #[inline(always)]
2945 fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
2946 let (a0, a1) = self.split_f32x16(a);
2947 self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
2948 }
2949 #[inline(always)]
2950 fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
2951 let (a0, a1) = self.split_f32x16(a);
2952 let (b0, b1) = self.split_f32x16(b);
2953 self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
2954 }
2955 #[inline(always)]
2956 fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
2957 let (a0, a1) = self.split_f32x16(a);
2958 let (b0, b1) = self.split_f32x16(b);
2959 self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
2960 }
2961 #[inline(always)]
2962 fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
2963 let (a0, a1) = self.split_f32x16(a);
2964 let (b0, b1) = self.split_f32x16(b);
2965 self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
2966 }
2967 #[inline(always)]
2968 fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
2969 let (a0, a1) = self.split_f32x16(a);
2970 let (b0, b1) = self.split_f32x16(b);
2971 self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
2972 }
2973 #[inline(always)]
2974 fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
2975 let (a0, a1) = self.split_f32x16(a);
2976 let (b0, b1) = self.split_f32x16(b);
2977 self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
2978 }
2979 #[inline(always)]
2980 fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
2981 let (a0, a1) = self.split_f32x16(a);
2982 let (b0, b1) = self.split_f32x16(b);
2983 self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
2984 }
2985 #[inline(always)]
2986 fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
2987 let (a0, a1) = self.split_f32x16(a);
2988 let (b0, b1) = self.split_f32x16(b);
2989 self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
2990 }
2991 #[inline(always)]
2992 fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
2993 let (a0, a1) = self.split_f32x16(a);
2994 let (b0, b1) = self.split_f32x16(b);
2995 self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
2996 }
2997 #[inline(always)]
2998 fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
2999 let (a0, a1) = self.split_f32x16(a);
3000 let (b0, b1) = self.split_f32x16(b);
3001 self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
3002 }
3003 #[inline(always)]
3004 fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
3005 let (a0, a1) = self.split_f32x16(a);
3006 let (b0, b1) = self.split_f32x16(b);
3007 self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
3008 }
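// zip_low_* on the 16-lane type interleaves the low 8 lanes of `a` and `b`
// across the full result (zip_high_* does the same for the high 8 lanes),
// while unzip_low_* / unzip_high_* gather the even / odd lanes of the
// concatenation [a, b]. Both are expressed via the 8-lane versions.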
3009 #[inline(always)]
3010 fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3011 let (a0, _) = self.split_f32x16(a);
3012 let (b0, _) = self.split_f32x16(b);
3013 self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
3014 }
3015 #[inline(always)]
3016 fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3017 let (_, a1) = self.split_f32x16(a);
3018 let (_, b1) = self.split_f32x16(b);
3019 self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
3020 }
3021 #[inline(always)]
3022 fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3023 let (a0, a1) = self.split_f32x16(a);
3024 let (b0, b1) = self.split_f32x16(b);
3025 self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
3026 }
3027 #[inline(always)]
3028 fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3029 let (a0, a1) = self.split_f32x16(a);
3030 let (b0, b1) = self.split_f32x16(b);
3031 self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
3032 }
3033 #[inline(always)]
3034 fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3035 let (a0, a1) = self.split_f32x16(a);
3036 let (b0, b1) = self.split_f32x16(b);
3037 self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
3038 }
3039 #[inline(always)]
3040 fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3041 let (a0, a1) = self.split_f32x16(a);
3042 let (b0, b1) = self.split_f32x16(b);
3043 self.combine_f32x8(
3044 self.max_precise_f32x8(a0, b0),
3045 self.max_precise_f32x8(a1, b1),
3046 )
3047 }
3048 #[inline(always)]
3049 fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3050 let (a0, a1) = self.split_f32x16(a);
3051 let (b0, b1) = self.split_f32x16(b);
3052 self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
3053 }
3054 #[inline(always)]
3055 fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
3056 let (a0, a1) = self.split_f32x16(a);
3057 let (b0, b1) = self.split_f32x16(b);
3058 self.combine_f32x8(
3059 self.min_precise_f32x8(a0, b0),
3060 self.min_precise_f32x8(a1, b1),
3061 )
3062 }
3063 #[inline(always)]
3064 fn madd_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
3065 let (a0, a1) = self.split_f32x16(a);
3066 let (b0, b1) = self.split_f32x16(b);
3067 let (c0, c1) = self.split_f32x16(c);
3068 self.combine_f32x8(self.madd_f32x8(a0, b0, c0), self.madd_f32x8(a1, b1, c1))
3069 }
3070 #[inline(always)]
3071 fn msub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
3072 let (a0, a1) = self.split_f32x16(a);
3073 let (b0, b1) = self.split_f32x16(b);
3074 let (c0, c1) = self.split_f32x16(c);
3075 self.combine_f32x8(self.msub_f32x8(a0, b0, c0), self.msub_f32x8(a1, b1, c1))
3076 }
3077 #[inline(always)]
3078 fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3079 let (a0, a1) = self.split_f32x16(a);
3080 self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
3081 }
3082 #[inline(always)]
3083 fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3084 let (a0, a1) = self.split_f32x16(a);
3085 self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
3086 }
3087 #[inline(always)]
3088 fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
3089 let (a0, a1) = self.split_f32x16(a);
3090 self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
3091 }
3092 #[inline(always)]
3093 fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
3094 let (a0, a1) = self.split_mask32x16(a);
3095 let (b0, b1) = self.split_f32x16(b);
3096 let (c0, c1) = self.split_f32x16(c);
3097 self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
3098 }
3099 #[inline(always)]
3100 fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
3101 let mut b0 = [0.0; 8usize];
3102 let mut b1 = [0.0; 8usize];
3103 b0.copy_from_slice(&a.val[0..8usize]);
3104 b1.copy_from_slice(&a.val[8usize..16usize]);
3105 (b0.simd_into(self), b1.simd_into(self))
3106 }
3107 #[inline(always)]
3108 fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
3109 let (a0, a1) = self.split_f32x16(a);
3110 self.combine_f64x4(
3111 self.reinterpret_f64_f32x8(a0),
3112 self.reinterpret_f64_f32x8(a1),
3113 )
3114 }
3115 #[inline(always)]
3116 fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
3117 let (a0, a1) = self.split_f32x16(a);
3118 self.combine_i32x8(
3119 self.reinterpret_i32_f32x8(a0),
3120 self.reinterpret_i32_f32x8(a1),
3121 )
3122 }
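// There is no dedicated SSE4.2 path for the 512-bit interleaved loads and
// stores (f32x16 here, u8x64 and u16x32 further down): they round-trip through
// the scalar `Fallback` implementation via the array representation. The one
// exception is `load_interleaved_128_u32x16`, which has a native unpack-based
// path near the end of this impl.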
3123 #[inline(always)]
3124 fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
3125 crate::Fallback::new()
3126 .load_interleaved_128_f32x16(src)
3127 .val
3128 .simd_into(self)
3129 }
3130 #[inline(always)]
3131 fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
3132 let fb = crate::Fallback::new();
3133 fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest);
3134 }
3135 #[inline(always)]
3136 fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
3137 let (a0, a1) = self.split_f32x16(a);
3138 self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
3139 }
3140 #[inline(always)]
3141 fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
3142 let (a0, a1) = self.split_f32x16(a);
3143 self.combine_u32x8(
3144 self.reinterpret_u32_f32x8(a0),
3145 self.reinterpret_u32_f32x8(a1),
3146 )
3147 }
3148 #[inline(always)]
3149 fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
3150 let (a0, a1) = self.split_f32x16(a);
3151 self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
3152 }
3153 #[inline(always)]
3154 fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
3155 let (a0, a1) = self.split_f32x16(a);
3156 self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
3157 }
3158 #[inline(always)]
3159 fn splat_i8x64(self, a: i8) -> i8x64<Self> {
3160 let half = self.splat_i8x32(a);
3161 self.combine_i8x32(half, half)
3162 }
3163 #[inline(always)]
3164 fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
3165 let (a0, a1) = self.split_i8x64(a);
3166 self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
3167 }
3168 #[inline(always)]
3169 fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3170 let (a0, a1) = self.split_i8x64(a);
3171 let (b0, b1) = self.split_i8x64(b);
3172 self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
3173 }
3174 #[inline(always)]
3175 fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3176 let (a0, a1) = self.split_i8x64(a);
3177 let (b0, b1) = self.split_i8x64(b);
3178 self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
3179 }
3180 #[inline(always)]
3181 fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3182 let (a0, a1) = self.split_i8x64(a);
3183 let (b0, b1) = self.split_i8x64(b);
3184 self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
3185 }
3186 #[inline(always)]
3187 fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3188 let (a0, a1) = self.split_i8x64(a);
3189 let (b0, b1) = self.split_i8x64(b);
3190 self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
3191 }
3192 #[inline(always)]
3193 fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3194 let (a0, a1) = self.split_i8x64(a);
3195 let (b0, b1) = self.split_i8x64(b);
3196 self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
3197 }
3198 #[inline(always)]
3199 fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3200 let (a0, a1) = self.split_i8x64(a);
3201 let (b0, b1) = self.split_i8x64(b);
3202 self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
3203 }
3204 #[inline(always)]
3205 fn shr_i8x64(self, a: i8x64<Self>, b: u32) -> i8x64<Self> {
3206 let (a0, a1) = self.split_i8x64(a);
3207 self.combine_i8x32(self.shr_i8x32(a0, b), self.shr_i8x32(a1, b))
3208 }
3209 #[inline(always)]
3210 fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3211 let (a0, a1) = self.split_i8x64(a);
3212 let (b0, b1) = self.split_i8x64(b);
3213 self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
3214 }
3215 #[inline(always)]
3216 fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3217 let (a0, a1) = self.split_i8x64(a);
3218 let (b0, b1) = self.split_i8x64(b);
3219 self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
3220 }
3221 #[inline(always)]
3222 fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3223 let (a0, a1) = self.split_i8x64(a);
3224 let (b0, b1) = self.split_i8x64(b);
3225 self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
3226 }
3227 #[inline(always)]
3228 fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3229 let (a0, a1) = self.split_i8x64(a);
3230 let (b0, b1) = self.split_i8x64(b);
3231 self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
3232 }
3233 #[inline(always)]
3234 fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
3235 let (a0, a1) = self.split_i8x64(a);
3236 let (b0, b1) = self.split_i8x64(b);
3237 self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
3238 }
3239 #[inline(always)]
3240 fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3241 let (a0, _) = self.split_i8x64(a);
3242 let (b0, _) = self.split_i8x64(b);
3243 self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
3244 }
3245 #[inline(always)]
3246 fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3247 let (_, a1) = self.split_i8x64(a);
3248 let (_, b1) = self.split_i8x64(b);
3249 self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
3250 }
3251 #[inline(always)]
3252 fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3253 let (a0, a1) = self.split_i8x64(a);
3254 let (b0, b1) = self.split_i8x64(b);
3255 self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
3256 }
3257 #[inline(always)]
3258 fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3259 let (a0, a1) = self.split_i8x64(a);
3260 let (b0, b1) = self.split_i8x64(b);
3261 self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
3262 }
3263 #[inline(always)]
3264 fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
3265 let (a0, a1) = self.split_mask8x64(a);
3266 let (b0, b1) = self.split_i8x64(b);
3267 let (c0, c1) = self.split_i8x64(c);
3268 self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
3269 }
3270 #[inline(always)]
3271 fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3272 let (a0, a1) = self.split_i8x64(a);
3273 let (b0, b1) = self.split_i8x64(b);
3274 self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
3275 }
3276 #[inline(always)]
3277 fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
3278 let (a0, a1) = self.split_i8x64(a);
3279 let (b0, b1) = self.split_i8x64(b);
3280 self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
3281 }
3282 #[inline(always)]
3283 fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
3284 let mut b0 = [0; 32usize];
3285 let mut b1 = [0; 32usize];
3286 b0.copy_from_slice(&a.val[0..32usize]);
3287 b1.copy_from_slice(&a.val[32usize..64usize]);
3288 (b0.simd_into(self), b1.simd_into(self))
3289 }
3290 #[inline(always)]
3291 fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
3292 let (a0, a1) = self.split_i8x64(a);
3293 self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
3294 }
3295 #[inline(always)]
3296 fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
3297 let (a0, a1) = self.split_i8x64(a);
3298 self.combine_u32x8(
3299 self.reinterpret_u32_i8x32(a0),
3300 self.reinterpret_u32_i8x32(a1),
3301 )
3302 }
3303 #[inline(always)]
3304 fn splat_u8x64(self, a: u8) -> u8x64<Self> {
3305 let half = self.splat_u8x32(a);
3306 self.combine_u8x32(half, half)
3307 }
3308 #[inline(always)]
3309 fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
3310 let (a0, a1) = self.split_u8x64(a);
3311 self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
3312 }
3313 #[inline(always)]
3314 fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3315 let (a0, a1) = self.split_u8x64(a);
3316 let (b0, b1) = self.split_u8x64(b);
3317 self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
3318 }
3319 #[inline(always)]
3320 fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3321 let (a0, a1) = self.split_u8x64(a);
3322 let (b0, b1) = self.split_u8x64(b);
3323 self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
3324 }
3325 #[inline(always)]
3326 fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3327 let (a0, a1) = self.split_u8x64(a);
3328 let (b0, b1) = self.split_u8x64(b);
3329 self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
3330 }
3331 #[inline(always)]
3332 fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3333 let (a0, a1) = self.split_u8x64(a);
3334 let (b0, b1) = self.split_u8x64(b);
3335 self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
3336 }
3337 #[inline(always)]
3338 fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3339 let (a0, a1) = self.split_u8x64(a);
3340 let (b0, b1) = self.split_u8x64(b);
3341 self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
3342 }
3343 #[inline(always)]
3344 fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3345 let (a0, a1) = self.split_u8x64(a);
3346 let (b0, b1) = self.split_u8x64(b);
3347 self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
3348 }
3349 #[inline(always)]
3350 fn shr_u8x64(self, a: u8x64<Self>, b: u32) -> u8x64<Self> {
3351 let (a0, a1) = self.split_u8x64(a);
3352 self.combine_u8x32(self.shr_u8x32(a0, b), self.shr_u8x32(a1, b))
3353 }
3354 #[inline(always)]
3355 fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3356 let (a0, a1) = self.split_u8x64(a);
3357 let (b0, b1) = self.split_u8x64(b);
3358 self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
3359 }
3360 #[inline(always)]
3361 fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3362 let (a0, a1) = self.split_u8x64(a);
3363 let (b0, b1) = self.split_u8x64(b);
3364 self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
3365 }
3366 #[inline(always)]
3367 fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3368 let (a0, a1) = self.split_u8x64(a);
3369 let (b0, b1) = self.split_u8x64(b);
3370 self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
3371 }
3372 #[inline(always)]
3373 fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3374 let (a0, a1) = self.split_u8x64(a);
3375 let (b0, b1) = self.split_u8x64(b);
3376 self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
3377 }
3378 #[inline(always)]
3379 fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
3380 let (a0, a1) = self.split_u8x64(a);
3381 let (b0, b1) = self.split_u8x64(b);
3382 self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
3383 }
3384 #[inline(always)]
3385 fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3386 let (a0, _) = self.split_u8x64(a);
3387 let (b0, _) = self.split_u8x64(b);
3388 self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
3389 }
3390 #[inline(always)]
3391 fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3392 let (_, a1) = self.split_u8x64(a);
3393 let (_, b1) = self.split_u8x64(b);
3394 self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
3395 }
3396 #[inline(always)]
3397 fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3398 let (a0, a1) = self.split_u8x64(a);
3399 let (b0, b1) = self.split_u8x64(b);
3400 self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
3401 }
3402 #[inline(always)]
3403 fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3404 let (a0, a1) = self.split_u8x64(a);
3405 let (b0, b1) = self.split_u8x64(b);
3406 self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
3407 }
3408 #[inline(always)]
3409 fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
3410 let (a0, a1) = self.split_mask8x64(a);
3411 let (b0, b1) = self.split_u8x64(b);
3412 let (c0, c1) = self.split_u8x64(c);
3413 self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
3414 }
3415 #[inline(always)]
3416 fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3417 let (a0, a1) = self.split_u8x64(a);
3418 let (b0, b1) = self.split_u8x64(b);
3419 self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
3420 }
3421 #[inline(always)]
3422 fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
3423 let (a0, a1) = self.split_u8x64(a);
3424 let (b0, b1) = self.split_u8x64(b);
3425 self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
3426 }
3427 #[inline(always)]
3428 fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
3429 let mut b0 = [0; 32usize];
3430 let mut b1 = [0; 32usize];
3431 b0.copy_from_slice(&a.val[0..32usize]);
3432 b1.copy_from_slice(&a.val[32usize..64usize]);
3433 (b0.simd_into(self), b1.simd_into(self))
3434 }
3435 #[inline(always)]
3436 fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
3437 crate::Fallback::new()
3438 .load_interleaved_128_u8x64(src)
3439 .val
3440 .simd_into(self)
3441 }
3442 #[inline(always)]
3443 fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
3444 let fb = crate::Fallback::new();
3445 fb.store_interleaved_128_u8x64(a.val.simd_into(fb), dest);
3446 }
3447 #[inline(always)]
3448 fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
3449 let (a0, a1) = self.split_u8x64(a);
3450 self.combine_u32x8(
3451 self.reinterpret_u32_u8x32(a0),
3452 self.reinterpret_u32_u8x32(a1),
3453 )
3454 }
3455 #[inline(always)]
3456 fn splat_mask8x64(self, a: i8) -> mask8x64<Self> {
3457 let half = self.splat_mask8x32(a);
3458 self.combine_mask8x32(half, half)
3459 }
3460 #[inline(always)]
3461 fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
3462 let (a0, a1) = self.split_mask8x64(a);
3463 self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
3464 }
3465 #[inline(always)]
3466 fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3467 let (a0, a1) = self.split_mask8x64(a);
3468 let (b0, b1) = self.split_mask8x64(b);
3469 self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
3470 }
3471 #[inline(always)]
3472 fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3473 let (a0, a1) = self.split_mask8x64(a);
3474 let (b0, b1) = self.split_mask8x64(b);
3475 self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
3476 }
3477 #[inline(always)]
3478 fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3479 let (a0, a1) = self.split_mask8x64(a);
3480 let (b0, b1) = self.split_mask8x64(b);
3481 self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
3482 }
3483 #[inline(always)]
3484 fn select_mask8x64(
3485 self,
3486 a: mask8x64<Self>,
3487 b: mask8x64<Self>,
3488 c: mask8x64<Self>,
3489 ) -> mask8x64<Self> {
3490 let (a0, a1) = self.split_mask8x64(a);
3491 let (b0, b1) = self.split_mask8x64(b);
3492 let (c0, c1) = self.split_mask8x64(c);
3493 self.combine_mask8x32(
3494 self.select_mask8x32(a0, b0, c0),
3495 self.select_mask8x32(a1, b1, c1),
3496 )
3497 }
3498 #[inline(always)]
3499 fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
3500 let (a0, a1) = self.split_mask8x64(a);
3501 let (b0, b1) = self.split_mask8x64(b);
3502 self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
3503 }
3504 #[inline(always)]
3505 fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
3506 let mut b0 = [0; 32usize];
3507 let mut b1 = [0; 32usize];
3508 b0.copy_from_slice(&a.val[0..32usize]);
3509 b1.copy_from_slice(&a.val[32usize..64usize]);
3510 (b0.simd_into(self), b1.simd_into(self))
3511 }
3512 #[inline(always)]
3513 fn splat_i16x32(self, a: i16) -> i16x32<Self> {
3514 let half = self.splat_i16x16(a);
3515 self.combine_i16x16(half, half)
3516 }
3517 #[inline(always)]
3518 fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
3519 let (a0, a1) = self.split_i16x32(a);
3520 self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
3521 }
3522 #[inline(always)]
3523 fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3524 let (a0, a1) = self.split_i16x32(a);
3525 let (b0, b1) = self.split_i16x32(b);
3526 self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
3527 }
3528 #[inline(always)]
3529 fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3530 let (a0, a1) = self.split_i16x32(a);
3531 let (b0, b1) = self.split_i16x32(b);
3532 self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
3533 }
3534 #[inline(always)]
3535 fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3536 let (a0, a1) = self.split_i16x32(a);
3537 let (b0, b1) = self.split_i16x32(b);
3538 self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
3539 }
3540 #[inline(always)]
3541 fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3542 let (a0, a1) = self.split_i16x32(a);
3543 let (b0, b1) = self.split_i16x32(b);
3544 self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
3545 }
3546 #[inline(always)]
3547 fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3548 let (a0, a1) = self.split_i16x32(a);
3549 let (b0, b1) = self.split_i16x32(b);
3550 self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
3551 }
3552 #[inline(always)]
3553 fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3554 let (a0, a1) = self.split_i16x32(a);
3555 let (b0, b1) = self.split_i16x32(b);
3556 self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
3557 }
3558 #[inline(always)]
3559 fn shr_i16x32(self, a: i16x32<Self>, b: u32) -> i16x32<Self> {
3560 let (a0, a1) = self.split_i16x32(a);
3561 self.combine_i16x16(self.shr_i16x16(a0, b), self.shr_i16x16(a1, b))
3562 }
3563 #[inline(always)]
3564 fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3565 let (a0, a1) = self.split_i16x32(a);
3566 let (b0, b1) = self.split_i16x32(b);
3567 self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
3568 }
3569 #[inline(always)]
3570 fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3571 let (a0, a1) = self.split_i16x32(a);
3572 let (b0, b1) = self.split_i16x32(b);
3573 self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
3574 }
3575 #[inline(always)]
3576 fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3577 let (a0, a1) = self.split_i16x32(a);
3578 let (b0, b1) = self.split_i16x32(b);
3579 self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
3580 }
3581 #[inline(always)]
3582 fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3583 let (a0, a1) = self.split_i16x32(a);
3584 let (b0, b1) = self.split_i16x32(b);
3585 self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
3586 }
3587 #[inline(always)]
3588 fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
3589 let (a0, a1) = self.split_i16x32(a);
3590 let (b0, b1) = self.split_i16x32(b);
3591 self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
3592 }
3593 #[inline(always)]
3594 fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3595 let (a0, _) = self.split_i16x32(a);
3596 let (b0, _) = self.split_i16x32(b);
3597 self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
3598 }
3599 #[inline(always)]
3600 fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3601 let (_, a1) = self.split_i16x32(a);
3602 let (_, b1) = self.split_i16x32(b);
3603 self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
3604 }
3605 #[inline(always)]
3606 fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3607 let (a0, a1) = self.split_i16x32(a);
3608 let (b0, b1) = self.split_i16x32(b);
3609 self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
3610 }
3611 #[inline(always)]
3612 fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3613 let (a0, a1) = self.split_i16x32(a);
3614 let (b0, b1) = self.split_i16x32(b);
3615 self.combine_i16x16(
3616 self.unzip_high_i16x16(a0, a1),
3617 self.unzip_high_i16x16(b0, b1),
3618 )
3619 }
3620 #[inline(always)]
3621 fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
3622 let (a0, a1) = self.split_mask16x32(a);
3623 let (b0, b1) = self.split_i16x32(b);
3624 let (c0, c1) = self.split_i16x32(c);
3625 self.combine_i16x16(
3626 self.select_i16x16(a0, b0, c0),
3627 self.select_i16x16(a1, b1, c1),
3628 )
3629 }
3630 #[inline(always)]
3631 fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3632 let (a0, a1) = self.split_i16x32(a);
3633 let (b0, b1) = self.split_i16x32(b);
3634 self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
3635 }
3636 #[inline(always)]
3637 fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
3638 let (a0, a1) = self.split_i16x32(a);
3639 let (b0, b1) = self.split_i16x32(b);
3640 self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
3641 }
3642 #[inline(always)]
3643 fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
3644 let mut b0 = [0; 16usize];
3645 let mut b1 = [0; 16usize];
3646 b0.copy_from_slice(&a.val[0..16usize]);
3647 b1.copy_from_slice(&a.val[16usize..32usize]);
3648 (b0.simd_into(self), b1.simd_into(self))
3649 }
3650 #[inline(always)]
3651 fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
3652 let (a0, a1) = self.split_i16x32(a);
3653 self.combine_u8x32(
3654 self.reinterpret_u8_i16x16(a0),
3655 self.reinterpret_u8_i16x16(a1),
3656 )
3657 }
3658 #[inline(always)]
3659 fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
3660 let (a0, a1) = self.split_i16x32(a);
3661 self.combine_u32x8(
3662 self.reinterpret_u32_i16x16(a0),
3663 self.reinterpret_u32_i16x16(a1),
3664 )
3665 }
3666 #[inline(always)]
3667 fn splat_u16x32(self, a: u16) -> u16x32<Self> {
3668 let half = self.splat_u16x16(a);
3669 self.combine_u16x16(half, half)
3670 }
3671 #[inline(always)]
3672 fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
3673 let (a0, a1) = self.split_u16x32(a);
3674 self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
3675 }
3676 #[inline(always)]
3677 fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3678 let (a0, a1) = self.split_u16x32(a);
3679 let (b0, b1) = self.split_u16x32(b);
3680 self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
3681 }
3682 #[inline(always)]
3683 fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3684 let (a0, a1) = self.split_u16x32(a);
3685 let (b0, b1) = self.split_u16x32(b);
3686 self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
3687 }
3688 #[inline(always)]
3689 fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3690 let (a0, a1) = self.split_u16x32(a);
3691 let (b0, b1) = self.split_u16x32(b);
3692 self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
3693 }
3694 #[inline(always)]
3695 fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3696 let (a0, a1) = self.split_u16x32(a);
3697 let (b0, b1) = self.split_u16x32(b);
3698 self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
3699 }
3700 #[inline(always)]
3701 fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3702 let (a0, a1) = self.split_u16x32(a);
3703 let (b0, b1) = self.split_u16x32(b);
3704 self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
3705 }
3706 #[inline(always)]
3707 fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3708 let (a0, a1) = self.split_u16x32(a);
3709 let (b0, b1) = self.split_u16x32(b);
3710 self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
3711 }
3712 #[inline(always)]
3713 fn shr_u16x32(self, a: u16x32<Self>, b: u32) -> u16x32<Self> {
3714 let (a0, a1) = self.split_u16x32(a);
3715 self.combine_u16x16(self.shr_u16x16(a0, b), self.shr_u16x16(a1, b))
3716 }
3717 #[inline(always)]
3718 fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3719 let (a0, a1) = self.split_u16x32(a);
3720 let (b0, b1) = self.split_u16x32(b);
3721 self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
3722 }
3723 #[inline(always)]
3724 fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3725 let (a0, a1) = self.split_u16x32(a);
3726 let (b0, b1) = self.split_u16x32(b);
3727 self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
3728 }
3729 #[inline(always)]
3730 fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3731 let (a0, a1) = self.split_u16x32(a);
3732 let (b0, b1) = self.split_u16x32(b);
3733 self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
3734 }
3735 #[inline(always)]
3736 fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3737 let (a0, a1) = self.split_u16x32(a);
3738 let (b0, b1) = self.split_u16x32(b);
3739 self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
3740 }
3741 #[inline(always)]
3742 fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
3743 let (a0, a1) = self.split_u16x32(a);
3744 let (b0, b1) = self.split_u16x32(b);
3745 self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
3746 }
3747 #[inline(always)]
3748 fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3749 let (a0, _) = self.split_u16x32(a);
3750 let (b0, _) = self.split_u16x32(b);
3751 self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
3752 }
3753 #[inline(always)]
3754 fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3755 let (_, a1) = self.split_u16x32(a);
3756 let (_, b1) = self.split_u16x32(b);
3757 self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
3758 }
3759 #[inline(always)]
3760 fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3761 let (a0, a1) = self.split_u16x32(a);
3762 let (b0, b1) = self.split_u16x32(b);
3763 self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
3764 }
3765 #[inline(always)]
3766 fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3767 let (a0, a1) = self.split_u16x32(a);
3768 let (b0, b1) = self.split_u16x32(b);
3769 self.combine_u16x16(
3770 self.unzip_high_u16x16(a0, a1),
3771 self.unzip_high_u16x16(b0, b1),
3772 )
3773 }
3774 #[inline(always)]
3775 fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
3776 let (a0, a1) = self.split_mask16x32(a);
3777 let (b0, b1) = self.split_u16x32(b);
3778 let (c0, c1) = self.split_u16x32(c);
3779 self.combine_u16x16(
3780 self.select_u16x16(a0, b0, c0),
3781 self.select_u16x16(a1, b1, c1),
3782 )
3783 }
3784 #[inline(always)]
3785 fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3786 let (a0, a1) = self.split_u16x32(a);
3787 let (b0, b1) = self.split_u16x32(b);
3788 self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
3789 }
3790 #[inline(always)]
3791 fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
3792 let (a0, a1) = self.split_u16x32(a);
3793 let (b0, b1) = self.split_u16x32(b);
3794 self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
3795 }
3796 #[inline(always)]
3797 fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
3798 let mut b0 = [0; 16usize];
3799 let mut b1 = [0; 16usize];
3800 b0.copy_from_slice(&a.val[0..16usize]);
3801 b1.copy_from_slice(&a.val[16usize..32usize]);
3802 (b0.simd_into(self), b1.simd_into(self))
3803 }
3804 #[inline(always)]
3805 fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
3806 crate::Fallback::new()
3807 .load_interleaved_128_u16x32(src)
3808 .val
3809 .simd_into(self)
3810 }
3811 #[inline(always)]
3812 fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
3813 let fb = crate::Fallback::new();
3814 fb.store_interleaved_128_u16x32(a.val.simd_into(fb), dest);
3815 }
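// Narrowing a u16x32 narrows each 16-lane half to a u8x16 (via `narrow_u16x16`)
// and concatenates the two results into a u8x32.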
3816 #[inline(always)]
3817 fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
3818 let (a0, a1) = self.split_u16x32(a);
3819 self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1))
3820 }
3821 #[inline(always)]
3822 fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
3823 let (a0, a1) = self.split_u16x32(a);
3824 self.combine_u8x32(
3825 self.reinterpret_u8_u16x16(a0),
3826 self.reinterpret_u8_u16x16(a1),
3827 )
3828 }
3829 #[inline(always)]
3830 fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
3831 let (a0, a1) = self.split_u16x32(a);
3832 self.combine_u32x8(
3833 self.reinterpret_u32_u16x16(a0),
3834 self.reinterpret_u32_u16x16(a1),
3835 )
3836 }
3837 #[inline(always)]
3838 fn splat_mask16x32(self, a: i16) -> mask16x32<Self> {
3839 let half = self.splat_mask16x16(a);
3840 self.combine_mask16x16(half, half)
3841 }
3842 #[inline(always)]
3843 fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
3844 let (a0, a1) = self.split_mask16x32(a);
3845 self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
3846 }
3847 #[inline(always)]
3848 fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
3849 let (a0, a1) = self.split_mask16x32(a);
3850 let (b0, b1) = self.split_mask16x32(b);
3851 self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
3852 }
3853 #[inline(always)]
3854 fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
3855 let (a0, a1) = self.split_mask16x32(a);
3856 let (b0, b1) = self.split_mask16x32(b);
3857 self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
3858 }
3859 #[inline(always)]
3860 fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
3861 let (a0, a1) = self.split_mask16x32(a);
3862 let (b0, b1) = self.split_mask16x32(b);
3863 self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
3864 }
3865 #[inline(always)]
3866 fn select_mask16x32(
3867 self,
3868 a: mask16x32<Self>,
3869 b: mask16x32<Self>,
3870 c: mask16x32<Self>,
3871 ) -> mask16x32<Self> {
3872 let (a0, a1) = self.split_mask16x32(a);
3873 let (b0, b1) = self.split_mask16x32(b);
3874 let (c0, c1) = self.split_mask16x32(c);
3875 self.combine_mask16x16(
3876 self.select_mask16x16(a0, b0, c0),
3877 self.select_mask16x16(a1, b1, c1),
3878 )
3879 }
3880 #[inline(always)]
3881 fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
3882 let (a0, a1) = self.split_mask16x32(a);
3883 let (b0, b1) = self.split_mask16x32(b);
3884 self.combine_mask16x16(
3885 self.simd_eq_mask16x16(a0, b0),
3886 self.simd_eq_mask16x16(a1, b1),
3887 )
3888 }
3889 #[inline(always)]
3890 fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
3891 let mut b0 = [0; 16usize];
3892 let mut b1 = [0; 16usize];
3893 b0.copy_from_slice(&a.val[0..16usize]);
3894 b1.copy_from_slice(&a.val[16usize..32usize]);
3895 (b0.simd_into(self), b1.simd_into(self))
3896 }
3897 #[inline(always)]
3898 fn splat_i32x16(self, a: i32) -> i32x16<Self> {
3899 let half = self.splat_i32x8(a);
3900 self.combine_i32x8(half, half)
3901 }
3902 #[inline(always)]
3903 fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
3904 let (a0, a1) = self.split_i32x16(a);
3905 self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
3906 }
3907 #[inline(always)]
3908 fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
3909 let (a0, a1) = self.split_i32x16(a);
3910 let (b0, b1) = self.split_i32x16(b);
3911 self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
3912 }
3913 #[inline(always)]
3914 fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
3915 let (a0, a1) = self.split_i32x16(a);
3916 let (b0, b1) = self.split_i32x16(b);
3917 self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
3918 }
3919 #[inline(always)]
3920 fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
3921 let (a0, a1) = self.split_i32x16(a);
3922 let (b0, b1) = self.split_i32x16(b);
3923 self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
3924 }
3925 #[inline(always)]
3926 fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
3927 let (a0, a1) = self.split_i32x16(a);
3928 let (b0, b1) = self.split_i32x16(b);
3929 self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
3930 }
3931 #[inline(always)]
3932 fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
3933 let (a0, a1) = self.split_i32x16(a);
3934 let (b0, b1) = self.split_i32x16(b);
3935 self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
3936 }
3937 #[inline(always)]
3938 fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
3939 let (a0, a1) = self.split_i32x16(a);
3940 let (b0, b1) = self.split_i32x16(b);
3941 self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
3942 }
3943 #[inline(always)]
3944 fn shr_i32x16(self, a: i32x16<Self>, b: u32) -> i32x16<Self> {
3945 let (a0, a1) = self.split_i32x16(a);
3946 self.combine_i32x8(self.shr_i32x8(a0, b), self.shr_i32x8(a1, b))
3947 }
3948 #[inline(always)]
3949 fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
3950 let (a0, a1) = self.split_i32x16(a);
3951 let (b0, b1) = self.split_i32x16(b);
3952 self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
3953 }
3954 #[inline(always)]
3955 fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
3956 let (a0, a1) = self.split_i32x16(a);
3957 let (b0, b1) = self.split_i32x16(b);
3958 self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
3959 }
3960 #[inline(always)]
3961 fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
3962 let (a0, a1) = self.split_i32x16(a);
3963 let (b0, b1) = self.split_i32x16(b);
3964 self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
3965 }
3966 #[inline(always)]
3967 fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
3968 let (a0, a1) = self.split_i32x16(a);
3969 let (b0, b1) = self.split_i32x16(b);
3970 self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
3971 }
3972 #[inline(always)]
3973 fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
3974 let (a0, a1) = self.split_i32x16(a);
3975 let (b0, b1) = self.split_i32x16(b);
3976 self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
3977 }
3978 #[inline(always)]
3979 fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
3980 let (a0, _) = self.split_i32x16(a);
3981 let (b0, _) = self.split_i32x16(b);
3982 self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
3983 }
3984 #[inline(always)]
3985 fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
3986 let (_, a1) = self.split_i32x16(a);
3987 let (_, b1) = self.split_i32x16(b);
3988 self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
3989 }
3990 #[inline(always)]
3991 fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
3992 let (a0, a1) = self.split_i32x16(a);
3993 let (b0, b1) = self.split_i32x16(b);
3994 self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
3995 }
3996 #[inline(always)]
3997 fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
3998 let (a0, a1) = self.split_i32x16(a);
3999 let (b0, b1) = self.split_i32x16(b);
4000 self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
4001 }
4002 #[inline(always)]
4003 fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
4004 let (a0, a1) = self.split_mask32x16(a);
4005 let (b0, b1) = self.split_i32x16(b);
4006 let (c0, c1) = self.split_i32x16(c);
4007 self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
4008 }
4009 #[inline(always)]
4010 fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4011 let (a0, a1) = self.split_i32x16(a);
4012 let (b0, b1) = self.split_i32x16(b);
4013 self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
4014 }
4015 #[inline(always)]
4016 fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
4017 let (a0, a1) = self.split_i32x16(a);
4018 let (b0, b1) = self.split_i32x16(b);
4019 self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
4020 }
4021 #[inline(always)]
4022 fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
4023 let mut b0 = [0; 8usize];
4024 let mut b1 = [0; 8usize];
4025 b0.copy_from_slice(&a.val[0..8usize]);
4026 b1.copy_from_slice(&a.val[8usize..16usize]);
4027 (b0.simd_into(self), b1.simd_into(self))
4028 }
4029 #[inline(always)]
4030 fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
4031 let (a0, a1) = self.split_i32x16(a);
4032 self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
4033 }
4034 #[inline(always)]
4035 fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
4036 let (a0, a1) = self.split_i32x16(a);
4037 self.combine_u32x8(
4038 self.reinterpret_u32_i32x8(a0),
4039 self.reinterpret_u32_i32x8(a1),
4040 )
4041 }
4042 #[inline(always)]
4043 fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
4044 let (a0, a1) = self.split_i32x16(a);
4045 self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
4046 }
4047 #[inline(always)]
4048 fn splat_u32x16(self, a: u32) -> u32x16<Self> {
4049 let half = self.splat_u32x8(a);
4050 self.combine_u32x8(half, half)
4051 }
4052 #[inline(always)]
4053 fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
4054 let (a0, a1) = self.split_u32x16(a);
4055 self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
4056 }
4057 #[inline(always)]
4058 fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4059 let (a0, a1) = self.split_u32x16(a);
4060 let (b0, b1) = self.split_u32x16(b);
4061 self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
4062 }
4063 #[inline(always)]
4064 fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4065 let (a0, a1) = self.split_u32x16(a);
4066 let (b0, b1) = self.split_u32x16(b);
4067 self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
4068 }
4069 #[inline(always)]
4070 fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4071 let (a0, a1) = self.split_u32x16(a);
4072 let (b0, b1) = self.split_u32x16(b);
4073 self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
4074 }
4075 #[inline(always)]
4076 fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4077 let (a0, a1) = self.split_u32x16(a);
4078 let (b0, b1) = self.split_u32x16(b);
4079 self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
4080 }
4081 #[inline(always)]
4082 fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4083 let (a0, a1) = self.split_u32x16(a);
4084 let (b0, b1) = self.split_u32x16(b);
4085 self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
4086 }
4087 #[inline(always)]
4088 fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4089 let (a0, a1) = self.split_u32x16(a);
4090 let (b0, b1) = self.split_u32x16(b);
4091 self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
4092 }
4093 #[inline(always)]
4094 fn shr_u32x16(self, a: u32x16<Self>, b: u32) -> u32x16<Self> {
4095 let (a0, a1) = self.split_u32x16(a);
4096 self.combine_u32x8(self.shr_u32x8(a0, b), self.shr_u32x8(a1, b))
4097 }
4098 #[inline(always)]
4099 fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
4100 let (a0, a1) = self.split_u32x16(a);
4101 let (b0, b1) = self.split_u32x16(b);
4102 self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
4103 }
4104 #[inline(always)]
4105 fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
4106 let (a0, a1) = self.split_u32x16(a);
4107 let (b0, b1) = self.split_u32x16(b);
4108 self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
4109 }
4110 #[inline(always)]
4111 fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
4112 let (a0, a1) = self.split_u32x16(a);
4113 let (b0, b1) = self.split_u32x16(b);
4114 self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
4115 }
4116 #[inline(always)]
4117 fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
4118 let (a0, a1) = self.split_u32x16(a);
4119 let (b0, b1) = self.split_u32x16(b);
4120 self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
4121 }
4122 #[inline(always)]
4123 fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
4124 let (a0, a1) = self.split_u32x16(a);
4125 let (b0, b1) = self.split_u32x16(b);
4126 self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
4127 }
4128 #[inline(always)]
4129 fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4130 let (a0, _) = self.split_u32x16(a);
4131 let (b0, _) = self.split_u32x16(b);
4132 self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
4133 }
4134 #[inline(always)]
4135 fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4136 let (_, a1) = self.split_u32x16(a);
4137 let (_, b1) = self.split_u32x16(b);
4138 self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
4139 }
4140 #[inline(always)]
4141 fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4142 let (a0, a1) = self.split_u32x16(a);
4143 let (b0, b1) = self.split_u32x16(b);
4144 self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
4145 }
4146 #[inline(always)]
4147 fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4148 let (a0, a1) = self.split_u32x16(a);
4149 let (b0, b1) = self.split_u32x16(b);
4150 self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
4151 }
4152 #[inline(always)]
4153 fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
4154 let (a0, a1) = self.split_mask32x16(a);
4155 let (b0, b1) = self.split_u32x16(b);
4156 let (c0, c1) = self.split_u32x16(c);
4157 self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
4158 }
4159 #[inline(always)]
4160 fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4161 let (a0, a1) = self.split_u32x16(a);
4162 let (b0, b1) = self.split_u32x16(b);
4163 self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
4164 }
4165 #[inline(always)]
4166 fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
4167 let (a0, a1) = self.split_u32x16(a);
4168 let (b0, b1) = self.split_u32x16(b);
4169 self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
4170 }
4171 #[inline(always)]
4172 fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
4173 let mut b0 = [0; 8usize];
4174 let mut b1 = [0; 8usize];
4175 b0.copy_from_slice(&a.val[0..8usize]);
4176 b1.copy_from_slice(&a.val[8usize..16usize]);
4177 (b0.simd_into(self), b1.simd_into(self))
4178 }
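// `load_interleaved_128_u32x16` is the one interleaved access in this impl with
// a native path: it loads the four 128-bit chunks of `src` and de-interleaves
// them with the usual unpacklo/unpackhi 4x4 transpose of 32-bit lanes, i.e. for
// src = [a0, b0, c0, d0, a1, b1, c1, d1, ...] the result is
// [a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3].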
4179 #[inline(always)]
4180 fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
4181 unsafe {
4182 let v0 = _mm_loadu_si128(src.as_ptr().add(0) as *const __m128i);
4183 let v1 = _mm_loadu_si128(src.as_ptr().add(4) as *const __m128i);
4184 let v2 = _mm_loadu_si128(src.as_ptr().add(8) as *const __m128i);
4185 let v3 = _mm_loadu_si128(src.as_ptr().add(12) as *const __m128i);
4186 let tmp0 = _mm_unpacklo_epi32(v0, v1);
4187 let tmp1 = _mm_unpackhi_epi32(v0, v1);
4188 let tmp2 = _mm_unpacklo_epi32(v2, v3);
4189 let tmp3 = _mm_unpackhi_epi32(v2, v3);
4190 let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
4191 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
4192 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
4193 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
4194 self.combine_u32x8(
4195 self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)),
4196 self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)),
4197 )
4198 }
4199 }
4200 #[inline(always)]
4201 fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
4202 let fb = crate::Fallback::new();
4203 fb.store_interleaved_128_u32x16(a.val.simd_into(fb), dest);
4204 }
4205 #[inline(always)]
4206 fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
4207 let (a0, a1) = self.split_u32x16(a);
4208 self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
4209 }
4210 #[inline(always)]
4211 fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
4212 let (a0, a1) = self.split_u32x16(a);
4213 self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
4214 }
    #[inline(always)]
    fn splat_mask32x16(self, a: i32) -> mask32x16<Self> {
        let half = self.splat_mask32x8(a);
        self.combine_mask32x8(half, half)
    }
    #[inline(always)]
    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
    }
    #[inline(always)]
    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
    }
    #[inline(always)]
    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
    }
    #[inline(always)]
    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
    }
    #[inline(always)]
    fn select_mask32x16(
        self,
        a: mask32x16<Self>,
        b: mask32x16<Self>,
        c: mask32x16<Self>,
    ) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        let (c0, c1) = self.split_mask32x16(c);
        self.combine_mask32x8(
            self.select_mask32x8(a0, b0, c0),
            self.select_mask32x8(a1, b1, c1),
        )
    }
    #[inline(always)]
    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
        let (a0, a1) = self.split_mask32x16(a);
        let (b0, b1) = self.split_mask32x16(b);
        self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
    }
    #[inline(always)]
    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
        let mut b0 = [0; 8];
        let mut b1 = [0; 8];
        b0.copy_from_slice(&a.val[0..8]);
        b1.copy_from_slice(&a.val[8..16]);
        (b0.simd_into(self), b1.simd_into(self))
    }
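    // 8-lane f64 vectors are wider than any native SSE register, so every f64x8
    // operation below splits its operands into two f64x4 halves, applies the
    // 4-lane implementation, and recombines the results.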
    #[inline(always)]
    fn splat_f64x8(self, a: f64) -> f64x8<Self> {
        let half = self.splat_f64x4(a);
        self.combine_f64x4(half, half)
    }
    #[inline(always)]
    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
    }
    #[inline(always)]
    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
    }
    #[inline(always)]
    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
    }
    #[inline(always)]
    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
    }
    #[inline(always)]
    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
    }
    #[inline(always)]
    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
    }
    #[inline(always)]
    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
    }
    #[inline(always)]
    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
    }
    #[inline(always)]
    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
    }
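    // zip_low interleaves the low halves of a and b (a0, b0, a1, b1, ...), so only
    // the low four lanes of each input are consumed; zip_high does the same with
    // the high halves.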
    #[inline(always)]
    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, _) = self.split_f64x8(a);
        let (b0, _) = self.split_f64x8(b);
        self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
    }
    #[inline(always)]
    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (_, a1) = self.split_f64x8(a);
        let (_, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
    }
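    // unzip_low keeps the even-indexed lanes of the concatenation of a and b,
    // unzip_high the odd-indexed lanes; each input is de-interleaved within its
    // own halves before the results are combined.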
    #[inline(always)]
    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
    }
    #[inline(always)]
    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
    }
    #[inline(always)]
    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
    }
    #[inline(always)]
    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(
            self.max_precise_f64x4(a0, b0),
            self.max_precise_f64x4(a1, b1),
        )
    }
    #[inline(always)]
    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
    }
    #[inline(always)]
    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        self.combine_f64x4(
            self.min_precise_f64x4(a0, b0),
            self.min_precise_f64x4(a1, b1),
        )
    }
    #[inline(always)]
    fn madd_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        let (c0, c1) = self.split_f64x8(c);
        self.combine_f64x4(self.madd_f64x4(a0, b0, c0), self.madd_f64x4(a1, b1, c1))
    }
    #[inline(always)]
    fn msub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        let (c0, c1) = self.split_f64x8(c);
        self.combine_f64x4(self.msub_f64x4(a0, b0, c0), self.msub_f64x4(a1, b1, c1))
    }
    #[inline(always)]
    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
    }
    #[inline(always)]
    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
    }
    #[inline(always)]
    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
    }
    #[inline(always)]
    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_f64x8(b);
        let (c0, c1) = self.split_f64x8(c);
        self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
    }
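    // Like split_u32x16, but for f64 lanes: copy each half of the backing array
    // into its own f64x4.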
    #[inline(always)]
    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
        let mut b0 = [0.0; 4];
        let mut b1 = [0.0; 4];
        b0.copy_from_slice(&a.val[0..4]);
        b1.copy_from_slice(&a.val[4..8]);
        (b0.simd_into(self), b1.simd_into(self))
    }
    #[inline(always)]
    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
        let (a0, a1) = self.split_f64x8(a);
        self.combine_f32x8(
            self.reinterpret_f32_f64x4(a0),
            self.reinterpret_f32_f64x4(a1),
        )
    }
    #[inline(always)]
    fn splat_mask64x8(self, a: i64) -> mask64x8<Self> {
        let half = self.splat_mask64x4(a);
        self.combine_mask64x4(half, half)
    }
    #[inline(always)]
    fn not_mask64x8(self, a: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1))
    }
    #[inline(always)]
    fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn or_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn xor_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn select_mask64x8(
        self,
        a: mask64x8<Self>,
        b: mask64x8<Self>,
        c: mask64x8<Self>,
    ) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        let (c0, c1) = self.split_mask64x8(c);
        self.combine_mask64x4(
            self.select_mask64x4(a0, b0, c0),
            self.select_mask64x4(a1, b1, c1),
        )
    }
    #[inline(always)]
    fn simd_eq_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) {
        let mut b0 = [0; 4];
        let mut b1 = [0; 4];
        b0.copy_from_slice(&a.val[0..4]);
        b1.copy_from_slice(&a.val[4..8]);
        (b0.simd_into(self), b1.simd_into(self))
    }
}
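// Conversions between the 128-bit core::arch register types and the array-backed
// vector types. Both representations have the same size, so each direction is a
// by-value transmute; SimdFrom additionally carries the SIMD token along.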
impl<S: Simd> SimdFrom<__m128, S> for f32x4<S> {
    #[inline(always)]
    fn simd_from(arch: __m128, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<f32x4<S>> for __m128 {
    #[inline(always)]
    fn from(value: f32x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for i8x16<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i8x16<S>> for __m128i {
    #[inline(always)]
    fn from(value: i8x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for u8x16<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u8x16<S>> for __m128i {
    #[inline(always)]
    fn from(value: u8x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for mask8x16<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask8x16<S>> for __m128i {
    #[inline(always)]
    fn from(value: mask8x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for i16x8<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i16x8<S>> for __m128i {
    #[inline(always)]
    fn from(value: i16x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for u16x8<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u16x8<S>> for __m128i {
    #[inline(always)]
    fn from(value: u16x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for mask16x8<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask16x8<S>> for __m128i {
    #[inline(always)]
    fn from(value: mask16x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for i32x4<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i32x4<S>> for __m128i {
    #[inline(always)]
    fn from(value: i32x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for u32x4<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u32x4<S>> for __m128i {
    #[inline(always)]
    fn from(value: u32x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for mask32x4<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask32x4<S>> for __m128i {
    #[inline(always)]
    fn from(value: mask32x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128d, S> for f64x2<S> {
    #[inline(always)]
    fn simd_from(arch: __m128d, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<f64x2<S>> for __m128d {
    #[inline(always)]
    fn from(value: f64x2<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m128i, S> for mask64x2<S> {
    #[inline(always)]
    fn simd_from(arch: __m128i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask64x2<S>> for __m128i {
    #[inline(always)]
    fn from(value: mask64x2<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}