1use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
7use crate::{
8 f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
9 i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
10 mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
11 u32x4, u32x8, u32x16,
12};
13#[cfg(target_arch = "x86")]
14use core::arch::x86::*;
15#[cfg(target_arch = "x86_64")]
16use core::arch::x86_64::*;
17#[doc = "The SIMD token for the x86-64-v2 level."]
18#[derive(Clone, Copy, Debug)]
19pub struct Sse4_2 {
20 pub sse4_2: crate::core_arch::x86::Sse4_2,
21}
22impl Sse4_2 {
23 #[doc = r" Create a SIMD token."]
24 #[doc = r""]
25 #[doc = r" # Safety"]
26 #[doc = r""]
27 #[doc = r" The `sse4.2`, `cmpxchg16b`, and `popcnt` CPU features must"]
28 #[doc = r" be available."]
29 #[inline]
30 pub const unsafe fn new_unchecked() -> Self {
31 Sse4_2 {
32 sse4_2: unsafe { crate::core_arch::x86::Sse4_2::new_unchecked() },
33 }
34 }
35}
36impl Seal for Sse4_2 {}
37impl ArchTypes for Sse4_2 {
38 type f32x4 = crate::support::Aligned128<__m128>;
39 type i8x16 = crate::support::Aligned128<__m128i>;
40 type u8x16 = crate::support::Aligned128<__m128i>;
41 type mask8x16 = crate::support::Aligned128<__m128i>;
42 type i16x8 = crate::support::Aligned128<__m128i>;
43 type u16x8 = crate::support::Aligned128<__m128i>;
44 type mask16x8 = crate::support::Aligned128<__m128i>;
45 type i32x4 = crate::support::Aligned128<__m128i>;
46 type u32x4 = crate::support::Aligned128<__m128i>;
47 type mask32x4 = crate::support::Aligned128<__m128i>;
48 type f64x2 = crate::support::Aligned128<__m128d>;
49 type mask64x2 = crate::support::Aligned128<__m128i>;
50 type f32x8 = crate::support::Aligned256<[__m128; 2usize]>;
51 type i8x32 = crate::support::Aligned256<[__m128i; 2usize]>;
52 type u8x32 = crate::support::Aligned256<[__m128i; 2usize]>;
53 type mask8x32 = crate::support::Aligned256<[__m128i; 2usize]>;
54 type i16x16 = crate::support::Aligned256<[__m128i; 2usize]>;
55 type u16x16 = crate::support::Aligned256<[__m128i; 2usize]>;
56 type mask16x16 = crate::support::Aligned256<[__m128i; 2usize]>;
57 type i32x8 = crate::support::Aligned256<[__m128i; 2usize]>;
58 type u32x8 = crate::support::Aligned256<[__m128i; 2usize]>;
59 type mask32x8 = crate::support::Aligned256<[__m128i; 2usize]>;
60 type f64x4 = crate::support::Aligned256<[__m128d; 2usize]>;
61 type mask64x4 = crate::support::Aligned256<[__m128i; 2usize]>;
62 type f32x16 = crate::support::Aligned512<[__m128; 4usize]>;
63 type i8x64 = crate::support::Aligned512<[__m128i; 4usize]>;
64 type u8x64 = crate::support::Aligned512<[__m128i; 4usize]>;
65 type mask8x64 = crate::support::Aligned512<[__m128i; 4usize]>;
66 type i16x32 = crate::support::Aligned512<[__m128i; 4usize]>;
67 type u16x32 = crate::support::Aligned512<[__m128i; 4usize]>;
68 type mask16x32 = crate::support::Aligned512<[__m128i; 4usize]>;
69 type i32x16 = crate::support::Aligned512<[__m128i; 4usize]>;
70 type u32x16 = crate::support::Aligned512<[__m128i; 4usize]>;
71 type mask32x16 = crate::support::Aligned512<[__m128i; 4usize]>;
72 type f64x8 = crate::support::Aligned512<[__m128d; 4usize]>;
73 type mask64x8 = crate::support::Aligned512<[__m128i; 4usize]>;
74}
75impl Simd for Sse4_2 {
76 type f32s = f32x4<Self>;
77 type f64s = f64x2<Self>;
78 type u8s = u8x16<Self>;
79 type i8s = i8x16<Self>;
80 type u16s = u16x8<Self>;
81 type i16s = i16x8<Self>;
82 type u32s = u32x4<Self>;
83 type i32s = i32x4<Self>;
84 type mask8s = mask8x16<Self>;
85 type mask16s = mask16x8<Self>;
86 type mask32s = mask32x4<Self>;
87 type mask64s = mask64x2<Self>;
88 #[inline(always)]
89 fn level(self) -> Level {
90 #[cfg(not(all(
91 target_feature = "avx2",
92 target_feature = "bmi1",
93 target_feature = "bmi2",
94 target_feature = "cmpxchg16b",
95 target_feature = "f16c",
96 target_feature = "fma",
97 target_feature = "lzcnt",
98 target_feature = "movbe",
99 target_feature = "popcnt",
100 target_feature = "xsave"
101 )))]
102 return Level::Sse4_2(self);
103 #[cfg(all(
104 target_feature = "avx2",
105 target_feature = "bmi1",
106 target_feature = "bmi2",
107 target_feature = "cmpxchg16b",
108 target_feature = "f16c",
109 target_feature = "fma",
110 target_feature = "lzcnt",
111 target_feature = "movbe",
112 target_feature = "popcnt",
113 target_feature = "xsave"
114 ))]
115 {
116 Level::baseline()
117 }
118 }
119 #[inline]
120 fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R {
121 #[target_feature(enable = "sse4.2,cmpxchg16b,popcnt")]
122 unsafe fn vectorize_sse4_2<F: FnOnce() -> R, R>(f: F) -> R {
123 f()
124 }
125 unsafe { vectorize_sse4_2(f) }
126 }
127 #[inline(always)]
128 fn splat_f32x4(self, val: f32) -> f32x4<Self> {
129 unsafe { _mm_set1_ps(val).simd_into(self) }
130 }
131 #[inline(always)]
132 fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
133 f32x4 {
134 val: unsafe { core::mem::transmute_copy(&val) },
135 simd: self,
136 }
137 }
138 #[inline(always)]
139 fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
140 f32x4 {
141 val: unsafe { core::mem::transmute_copy(val) },
142 simd: self,
143 }
144 }
145 #[inline(always)]
146 fn as_array_f32x4(self, a: f32x4<Self>) -> [f32; 4usize] {
147 unsafe { core::mem::transmute::<__m128, [f32; 4usize]>(a.val.0) }
148 }
149 #[inline(always)]
150 fn as_array_ref_f32x4(self, a: &f32x4<Self>) -> &[f32; 4usize] {
151 unsafe { core::mem::transmute::<&__m128, &[f32; 4usize]>(&a.val.0) }
152 }
153 #[inline(always)]
154 fn as_array_mut_f32x4(self, a: &mut f32x4<Self>) -> &mut [f32; 4usize] {
155 unsafe { core::mem::transmute::<&mut __m128, &mut [f32; 4usize]>(&mut a.val.0) }
156 }
157 #[inline(always)]
158 fn store_array_f32x4(self, a: f32x4<Self>, dest: &mut [f32; 4usize]) -> () {
159 unsafe {
160 core::ptr::copy_nonoverlapping(
161 (&raw const a.val.0) as *const f32,
162 dest.as_mut_ptr(),
163 4usize,
164 );
165 }
166 }
167 #[inline(always)]
168 fn cvt_from_bytes_f32x4(self, a: u8x16<Self>) -> f32x4<Self> {
169 unsafe {
170 f32x4 {
171 val: core::mem::transmute(a.val),
172 simd: self,
173 }
174 }
175 }
176 #[inline(always)]
177 fn cvt_to_bytes_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
178 unsafe {
179 u8x16 {
180 val: core::mem::transmute(a.val),
181 simd: self,
182 }
183 }
184 }
185 #[inline(always)]
186 fn slide_f32x4<const SHIFT: usize>(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
187 unsafe {
188 if SHIFT >= 4usize {
189 return b;
190 }
191 let result = dyn_alignr_128(
192 self.cvt_to_bytes_f32x4(b).val.0,
193 self.cvt_to_bytes_f32x4(a).val.0,
194 SHIFT * 4usize,
195 );
196 self.cvt_from_bytes_f32x4(u8x16 {
197 val: crate::support::Aligned128(result),
198 simd: self,
199 })
200 }
201 }
202 #[inline(always)]
203 fn slide_within_blocks_f32x4<const SHIFT: usize>(
204 self,
205 a: f32x4<Self>,
206 b: f32x4<Self>,
207 ) -> f32x4<Self> {
208 self.slide_f32x4::<SHIFT>(a, b)
209 }
210 #[inline(always)]
211 fn abs_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
212 unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) }
213 }
214 #[inline(always)]
215 fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
216 unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) }
217 }
218 #[inline(always)]
219 fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
220 unsafe { _mm_sqrt_ps(a.into()).simd_into(self) }
221 }
222 #[inline(always)]
223 fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
224 unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) }
225 }
226 #[inline(always)]
227 fn sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
228 unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) }
229 }
230 #[inline(always)]
231 fn mul_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
232 unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) }
233 }
234 #[inline(always)]
235 fn div_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
236 unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) }
237 }
238 #[inline(always)]
239 fn copysign_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
240 unsafe {
241 let mask = _mm_set1_ps(-0.0);
242 _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self)
243 }
244 }
245 #[inline(always)]
246 fn simd_eq_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
247 unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) }
248 }
249 #[inline(always)]
250 fn simd_lt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
251 unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) }
252 }
253 #[inline(always)]
254 fn simd_le_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
255 unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) }
256 }
257 #[inline(always)]
258 fn simd_ge_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
259 unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) }
260 }
261 #[inline(always)]
262 fn simd_gt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
263 unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) }
264 }
265 #[inline(always)]
266 fn zip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
267 unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) }
268 }
269 #[inline(always)]
270 fn zip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
271 unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) }
272 }
273 #[inline(always)]
274 fn unzip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
275 unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) }
276 }
277 #[inline(always)]
278 fn unzip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
279 unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) }
280 }
281 #[inline(always)]
282 fn interleave_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> (f32x4<Self>, f32x4<Self>) {
283 (self.zip_low_f32x4(a, b), self.zip_high_f32x4(a, b))
284 }
285 #[inline(always)]
286 fn deinterleave_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> (f32x4<Self>, f32x4<Self>) {
287 (self.unzip_low_f32x4(a, b), self.unzip_high_f32x4(a, b))
288 }
289 #[inline(always)]
290 fn max_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
291 unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
292 }
293 #[inline(always)]
294 fn min_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
295 unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
296 }
297 #[inline(always)]
298 fn max_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
299 unsafe {
300 let intermediate = _mm_max_ps(a.into(), b.into());
301 let b_is_nan = _mm_cmpunord_ps(b.into(), b.into());
302 _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self)
303 }
304 }
305 #[inline(always)]
306 fn min_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
307 unsafe {
308 let intermediate = _mm_min_ps(a.into(), b.into());
309 let b_is_nan = _mm_cmpunord_ps(b.into(), b.into());
310 _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self)
311 }
312 }
313 #[inline(always)]
314 fn mul_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
315 a * b + c
316 }
317 #[inline(always)]
318 fn mul_sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
319 a * b - c
320 }
321 #[inline(always)]
322 fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
323 unsafe {
324 _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
325 }
326 }
327 #[inline(always)]
328 fn ceil_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
329 unsafe {
330 _mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
331 }
332 }
333 #[inline(always)]
334 fn round_ties_even_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
335 unsafe {
336 _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
337 .simd_into(self)
338 }
339 }
340 #[inline(always)]
341 fn fract_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
342 a - self.trunc_f32x4(a)
343 }
344 #[inline(always)]
345 fn trunc_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
346 unsafe {
347 _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
348 }
349 }
350 #[inline(always)]
351 fn select_f32x4(self, a: mask32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
352 unsafe { _mm_blendv_ps(c.into(), b.into(), _mm_castsi128_ps(a.into())).simd_into(self) }
353 }
354 #[inline(always)]
355 fn combine_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x8<Self> {
356 f32x8 {
357 val: crate::support::Aligned256([a.val.0, b.val.0]),
358 simd: self,
359 }
360 }
361 #[inline(always)]
362 fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> {
363 unsafe { _mm_castps_pd(a.into()).simd_into(self) }
364 }
365 #[inline(always)]
366 fn reinterpret_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
367 unsafe { _mm_castps_si128(a.into()).simd_into(self) }
368 }
369 #[inline(always)]
370 fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
371 unsafe { _mm_castps_si128(a.into()).simd_into(self) }
372 }
373 #[inline(always)]
374 fn reinterpret_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
375 unsafe { _mm_castps_si128(a.into()).simd_into(self) }
376 }
377 #[inline(always)]
378 fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
379 unsafe {
380 let mut converted = _mm_cvttps_epi32(a.into());
381 let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0));
382 let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
383 if !all_in_range {
384 let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0));
385 let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess));
386 converted = _mm_add_epi32(converted, excess_converted);
387 }
388 converted.simd_into(self)
389 }
390 }
391 #[inline(always)]
392 fn cvt_u32_precise_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
393 unsafe {
394 let a = _mm_max_ps(a.into(), _mm_setzero_ps());
395 let mut converted = _mm_cvttps_epi32(a);
396 let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0));
397 let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
398 if !all_in_range {
399 let exceeds_unsigned_range =
400 _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a));
401 let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0));
402 let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess));
403 converted = _mm_add_epi32(converted, excess_converted);
404 converted = _mm_blendv_epi8(
405 converted,
406 _mm_set1_epi32(u32::MAX.cast_signed()),
407 exceeds_unsigned_range,
408 );
409 }
410 converted.simd_into(self)
411 }
412 }
413 #[inline(always)]
414 fn cvt_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
415 unsafe { _mm_cvttps_epi32(a.into()).simd_into(self) }
416 }
417 #[inline(always)]
418 fn cvt_i32_precise_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
419 unsafe {
420 let a = a.into();
421 let mut converted = _mm_cvttps_epi32(a);
422 let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0));
423 let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
424 if !all_in_range {
425 converted = _mm_blendv_epi8(
426 _mm_set1_epi32(i32::MAX),
427 converted,
428 _mm_castps_si128(in_range),
429 );
430 let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a));
431 converted = _mm_and_si128(converted, is_not_nan);
432 }
433 converted.simd_into(self)
434 }
435 }
436 #[inline(always)]
437 fn splat_i8x16(self, val: i8) -> i8x16<Self> {
438 unsafe { _mm_set1_epi8(val).simd_into(self) }
439 }
440 #[inline(always)]
441 fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
442 i8x16 {
443 val: unsafe { core::mem::transmute_copy(&val) },
444 simd: self,
445 }
446 }
447 #[inline(always)]
448 fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
449 i8x16 {
450 val: unsafe { core::mem::transmute_copy(val) },
451 simd: self,
452 }
453 }
454 #[inline(always)]
455 fn as_array_i8x16(self, a: i8x16<Self>) -> [i8; 16usize] {
456 unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) }
457 }
458 #[inline(always)]
459 fn as_array_ref_i8x16(self, a: &i8x16<Self>) -> &[i8; 16usize] {
460 unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) }
461 }
462 #[inline(always)]
463 fn as_array_mut_i8x16(self, a: &mut i8x16<Self>) -> &mut [i8; 16usize] {
464 unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) }
465 }
466 #[inline(always)]
467 fn store_array_i8x16(self, a: i8x16<Self>, dest: &mut [i8; 16usize]) -> () {
468 unsafe {
469 core::ptr::copy_nonoverlapping(
470 (&raw const a.val.0) as *const i8,
471 dest.as_mut_ptr(),
472 16usize,
473 );
474 }
475 }
476 #[inline(always)]
477 fn cvt_from_bytes_i8x16(self, a: u8x16<Self>) -> i8x16<Self> {
478 unsafe {
479 i8x16 {
480 val: core::mem::transmute(a.val),
481 simd: self,
482 }
483 }
484 }
485 #[inline(always)]
486 fn cvt_to_bytes_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
487 unsafe {
488 u8x16 {
489 val: core::mem::transmute(a.val),
490 simd: self,
491 }
492 }
493 }
494 #[inline(always)]
495 fn slide_i8x16<const SHIFT: usize>(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
496 unsafe {
497 if SHIFT >= 16usize {
498 return b;
499 }
500 let result = dyn_alignr_128(
501 self.cvt_to_bytes_i8x16(b).val.0,
502 self.cvt_to_bytes_i8x16(a).val.0,
503 SHIFT,
504 );
505 self.cvt_from_bytes_i8x16(u8x16 {
506 val: crate::support::Aligned128(result),
507 simd: self,
508 })
509 }
510 }
511 #[inline(always)]
512 fn slide_within_blocks_i8x16<const SHIFT: usize>(
513 self,
514 a: i8x16<Self>,
515 b: i8x16<Self>,
516 ) -> i8x16<Self> {
517 self.slide_i8x16::<SHIFT>(a, b)
518 }
519 #[inline(always)]
520 fn add_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
521 unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
522 }
523 #[inline(always)]
524 fn sub_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
525 unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
526 }
527 #[inline(always)]
528 fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
529 unsafe {
530 let dst_even = _mm_mullo_epi16(a.into(), b.into());
531 let dst_odd =
532 _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
533 _mm_or_si128(
534 _mm_slli_epi16(dst_odd, 8),
535 _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
536 )
537 .simd_into(self)
538 }
539 }
540 #[inline(always)]
541 fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
542 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
543 }
544 #[inline(always)]
545 fn or_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
546 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
547 }
548 #[inline(always)]
549 fn xor_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
550 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
551 }
552 #[inline(always)]
553 fn not_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
554 a ^ !0
555 }
556 #[inline(always)]
557 fn shl_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
558 unsafe {
559 let val = a.into();
560 let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
561 let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
562 let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
563 let lo_shifted = _mm_sll_epi16(lo_16, shift_count);
564 let hi_shifted = _mm_sll_epi16(hi_16, shift_count);
565 _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
566 }
567 }
568 #[inline(always)]
569 fn shlv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
570 core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
571 }
572 #[inline(always)]
573 fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
574 unsafe {
575 let val = a.into();
576 let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
577 let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
578 let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
579 let lo_shifted = _mm_sra_epi16(lo_16, shift_count);
580 let hi_shifted = _mm_sra_epi16(hi_16, shift_count);
581 _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
582 }
583 }
584 #[inline(always)]
585 fn shrv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
586 core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
587 }
588 #[inline(always)]
589 fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
590 unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
591 }
592 #[inline(always)]
593 fn simd_lt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
594 unsafe { _mm_cmpgt_epi8(b.into(), a.into()).simd_into(self) }
595 }
596 #[inline(always)]
597 fn simd_le_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
598 unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) }
599 }
600 #[inline(always)]
601 fn simd_ge_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
602 unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) }
603 }
604 #[inline(always)]
605 fn simd_gt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
606 unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) }
607 }
608 #[inline(always)]
609 fn zip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
610 unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
611 }
612 #[inline(always)]
613 fn zip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
614 unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
615 }
616 #[inline(always)]
617 fn unzip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
618 unsafe {
619 let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
620 let t1 = _mm_shuffle_epi8(a.into(), mask);
621 let t2 = _mm_shuffle_epi8(b.into(), mask);
622 _mm_unpacklo_epi64(t1, t2).simd_into(self)
623 }
624 }
625 #[inline(always)]
626 fn unzip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
627 unsafe {
628 let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
629 let t1 = _mm_shuffle_epi8(a.into(), mask);
630 let t2 = _mm_shuffle_epi8(b.into(), mask);
631 _mm_unpackhi_epi64(t1, t2).simd_into(self)
632 }
633 }
634 #[inline(always)]
635 fn interleave_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> (i8x16<Self>, i8x16<Self>) {
636 (self.zip_low_i8x16(a, b), self.zip_high_i8x16(a, b))
637 }
638 #[inline(always)]
639 fn deinterleave_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> (i8x16<Self>, i8x16<Self>) {
640 (self.unzip_low_i8x16(a, b), self.unzip_high_i8x16(a, b))
641 }
642 #[inline(always)]
643 fn select_i8x16(self, a: mask8x16<Self>, b: i8x16<Self>, c: i8x16<Self>) -> i8x16<Self> {
644 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
645 }
646 #[inline(always)]
647 fn min_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
648 unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) }
649 }
650 #[inline(always)]
651 fn max_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
652 unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) }
653 }
654 #[inline(always)]
655 fn combine_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x32<Self> {
656 i8x32 {
657 val: crate::support::Aligned256([a.val.0, b.val.0]),
658 simd: self,
659 }
660 }
661 #[inline(always)]
662 fn neg_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
663 unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) }
664 }
665 #[inline(always)]
666 fn reinterpret_u8_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
667 __m128i::from(a).simd_into(self)
668 }
669 #[inline(always)]
670 fn reinterpret_u32_i8x16(self, a: i8x16<Self>) -> u32x4<Self> {
671 __m128i::from(a).simd_into(self)
672 }
673 #[inline(always)]
674 fn splat_u8x16(self, val: u8) -> u8x16<Self> {
675 unsafe { _mm_set1_epi8(val.cast_signed()).simd_into(self) }
676 }
677 #[inline(always)]
678 fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
679 u8x16 {
680 val: unsafe { core::mem::transmute_copy(&val) },
681 simd: self,
682 }
683 }
684 #[inline(always)]
685 fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
686 u8x16 {
687 val: unsafe { core::mem::transmute_copy(val) },
688 simd: self,
689 }
690 }
691 #[inline(always)]
692 fn as_array_u8x16(self, a: u8x16<Self>) -> [u8; 16usize] {
693 unsafe { core::mem::transmute::<__m128i, [u8; 16usize]>(a.val.0) }
694 }
695 #[inline(always)]
696 fn as_array_ref_u8x16(self, a: &u8x16<Self>) -> &[u8; 16usize] {
697 unsafe { core::mem::transmute::<&__m128i, &[u8; 16usize]>(&a.val.0) }
698 }
699 #[inline(always)]
700 fn as_array_mut_u8x16(self, a: &mut u8x16<Self>) -> &mut [u8; 16usize] {
701 unsafe { core::mem::transmute::<&mut __m128i, &mut [u8; 16usize]>(&mut a.val.0) }
702 }
703 #[inline(always)]
704 fn store_array_u8x16(self, a: u8x16<Self>, dest: &mut [u8; 16usize]) -> () {
705 unsafe {
706 core::ptr::copy_nonoverlapping(
707 (&raw const a.val.0) as *const u8,
708 dest.as_mut_ptr(),
709 16usize,
710 );
711 }
712 }
713 #[inline(always)]
714 fn cvt_from_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
715 unsafe {
716 u8x16 {
717 val: core::mem::transmute(a.val),
718 simd: self,
719 }
720 }
721 }
722 #[inline(always)]
723 fn cvt_to_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
724 unsafe {
725 u8x16 {
726 val: core::mem::transmute(a.val),
727 simd: self,
728 }
729 }
730 }
731 #[inline(always)]
732 fn slide_u8x16<const SHIFT: usize>(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
733 unsafe {
734 if SHIFT >= 16usize {
735 return b;
736 }
737 let result = dyn_alignr_128(
738 self.cvt_to_bytes_u8x16(b).val.0,
739 self.cvt_to_bytes_u8x16(a).val.0,
740 SHIFT,
741 );
742 self.cvt_from_bytes_u8x16(u8x16 {
743 val: crate::support::Aligned128(result),
744 simd: self,
745 })
746 }
747 }
748 #[inline(always)]
749 fn slide_within_blocks_u8x16<const SHIFT: usize>(
750 self,
751 a: u8x16<Self>,
752 b: u8x16<Self>,
753 ) -> u8x16<Self> {
754 self.slide_u8x16::<SHIFT>(a, b)
755 }
756 #[inline(always)]
757 fn add_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
758 unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
759 }
760 #[inline(always)]
761 fn sub_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
762 unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
763 }
764 #[inline(always)]
765 fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
766 unsafe {
767 let dst_even = _mm_mullo_epi16(a.into(), b.into());
768 let dst_odd =
769 _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
770 _mm_or_si128(
771 _mm_slli_epi16(dst_odd, 8),
772 _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
773 )
774 .simd_into(self)
775 }
776 }
777 #[inline(always)]
778 fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
779 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
780 }
781 #[inline(always)]
782 fn or_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
783 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
784 }
785 #[inline(always)]
786 fn xor_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
787 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
788 }
789 #[inline(always)]
790 fn not_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
791 a ^ !0
792 }
793 #[inline(always)]
794 fn shl_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
795 unsafe {
796 let val = a.into();
797 let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
798 let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
799 let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
800 let lo_shifted = _mm_sll_epi16(lo_16, shift_count);
801 let hi_shifted = _mm_sll_epi16(hi_16, shift_count);
802 _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
803 }
804 }
805 #[inline(always)]
806 fn shlv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
807 core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
808 }
809 #[inline(always)]
810 fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
811 unsafe {
812 let val = a.into();
813 let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
814 let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
815 let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
816 let lo_shifted = _mm_srl_epi16(lo_16, shift_count);
817 let hi_shifted = _mm_srl_epi16(hi_16, shift_count);
818 _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
819 }
820 }
821 #[inline(always)]
822 fn shrv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
823 core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
824 }
825 #[inline(always)]
826 fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
827 unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
828 }
829 #[inline(always)]
830 fn simd_lt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
831 unsafe {
832 let sign_bit = _mm_set1_epi8(0x80u8.cast_signed());
833 let a_signed = _mm_xor_si128(a.into(), sign_bit);
834 let b_signed = _mm_xor_si128(b.into(), sign_bit);
835 _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self)
836 }
837 }
838 #[inline(always)]
839 fn simd_le_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
840 unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) }
841 }
842 #[inline(always)]
843 fn simd_ge_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
844 unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) }
845 }
846 #[inline(always)]
847 fn simd_gt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
848 unsafe {
849 let sign_bit = _mm_set1_epi8(0x80u8.cast_signed());
850 let a_signed = _mm_xor_si128(a.into(), sign_bit);
851 let b_signed = _mm_xor_si128(b.into(), sign_bit);
852 _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self)
853 }
854 }
855 #[inline(always)]
856 fn zip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
857 unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
858 }
859 #[inline(always)]
860 fn zip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
861 unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
862 }
863 #[inline(always)]
864 fn unzip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
865 unsafe {
866 let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
867 let t1 = _mm_shuffle_epi8(a.into(), mask);
868 let t2 = _mm_shuffle_epi8(b.into(), mask);
869 _mm_unpacklo_epi64(t1, t2).simd_into(self)
870 }
871 }
872 #[inline(always)]
873 fn unzip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
874 unsafe {
875 let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
876 let t1 = _mm_shuffle_epi8(a.into(), mask);
877 let t2 = _mm_shuffle_epi8(b.into(), mask);
878 _mm_unpackhi_epi64(t1, t2).simd_into(self)
879 }
880 }
881 #[inline(always)]
882 fn interleave_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> (u8x16<Self>, u8x16<Self>) {
883 (self.zip_low_u8x16(a, b), self.zip_high_u8x16(a, b))
884 }
885 #[inline(always)]
886 fn deinterleave_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> (u8x16<Self>, u8x16<Self>) {
887 (self.unzip_low_u8x16(a, b), self.unzip_high_u8x16(a, b))
888 }
889 #[inline(always)]
890 fn select_u8x16(self, a: mask8x16<Self>, b: u8x16<Self>, c: u8x16<Self>) -> u8x16<Self> {
891 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
892 }
893 #[inline(always)]
894 fn min_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
895 unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) }
896 }
897 #[inline(always)]
898 fn max_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
899 unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) }
900 }
901 #[inline(always)]
902 fn combine_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x32<Self> {
903 u8x32 {
904 val: crate::support::Aligned256([a.val.0, b.val.0]),
905 simd: self,
906 }
907 }
908 #[inline(always)]
909 fn widen_u8x16(self, a: u8x16<Self>) -> u16x16<Self> {
910 unsafe {
911 let raw = a.into();
912 let high = _mm_cvtepu8_epi16(raw).simd_into(self);
913 let low = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(raw)).simd_into(self);
914 self.combine_u16x8(high, low)
915 }
916 }
917 #[inline(always)]
918 fn reinterpret_u32_u8x16(self, a: u8x16<Self>) -> u32x4<Self> {
919 __m128i::from(a).simd_into(self)
920 }
921 #[inline(always)]
922 fn splat_mask8x16(self, val: i8) -> mask8x16<Self> {
923 unsafe { _mm_set1_epi8(val).simd_into(self) }
924 }
925 #[inline(always)]
926 fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
927 mask8x16 {
928 val: unsafe { core::mem::transmute_copy(&val) },
929 simd: self,
930 }
931 }
932 #[inline(always)]
933 fn load_array_ref_mask8x16(self, val: &[i8; 16usize]) -> mask8x16<Self> {
934 mask8x16 {
935 val: unsafe { core::mem::transmute_copy(val) },
936 simd: self,
937 }
938 }
939 #[inline(always)]
940 fn as_array_mask8x16(self, a: mask8x16<Self>) -> [i8; 16usize] {
941 unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) }
942 }
943 #[inline(always)]
944 fn as_array_ref_mask8x16(self, a: &mask8x16<Self>) -> &[i8; 16usize] {
945 unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) }
946 }
947 #[inline(always)]
948 fn as_array_mut_mask8x16(self, a: &mut mask8x16<Self>) -> &mut [i8; 16usize] {
949 unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) }
950 }
951 #[inline(always)]
952 fn store_array_mask8x16(self, a: mask8x16<Self>, dest: &mut [i8; 16usize]) -> () {
953 unsafe {
954 core::ptr::copy_nonoverlapping(
955 (&raw const a.val.0) as *const i8,
956 dest.as_mut_ptr(),
957 16usize,
958 );
959 }
960 }
961 #[inline(always)]
962 fn cvt_from_bytes_mask8x16(self, a: u8x16<Self>) -> mask8x16<Self> {
963 unsafe {
964 mask8x16 {
965 val: core::mem::transmute(a.val),
966 simd: self,
967 }
968 }
969 }
970 #[inline(always)]
971 fn cvt_to_bytes_mask8x16(self, a: mask8x16<Self>) -> u8x16<Self> {
972 unsafe {
973 u8x16 {
974 val: core::mem::transmute(a.val),
975 simd: self,
976 }
977 }
978 }
979 #[inline(always)]
980 fn slide_mask8x16<const SHIFT: usize>(
981 self,
982 a: mask8x16<Self>,
983 b: mask8x16<Self>,
984 ) -> mask8x16<Self> {
985 unsafe {
986 if SHIFT >= 16usize {
987 return b;
988 }
989 let result = dyn_alignr_128(
990 self.cvt_to_bytes_mask8x16(b).val.0,
991 self.cvt_to_bytes_mask8x16(a).val.0,
992 SHIFT,
993 );
994 self.cvt_from_bytes_mask8x16(u8x16 {
995 val: crate::support::Aligned128(result),
996 simd: self,
997 })
998 }
999 }
1000 #[inline(always)]
1001 fn slide_within_blocks_mask8x16<const SHIFT: usize>(
1002 self,
1003 a: mask8x16<Self>,
1004 b: mask8x16<Self>,
1005 ) -> mask8x16<Self> {
1006 self.slide_mask8x16::<SHIFT>(a, b)
1007 }
1008 #[inline(always)]
1009 fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
1010 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1011 }
1012 #[inline(always)]
1013 fn or_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
1014 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1015 }
1016 #[inline(always)]
1017 fn xor_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
1018 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1019 }
1020 #[inline(always)]
1021 fn not_mask8x16(self, a: mask8x16<Self>) -> mask8x16<Self> {
1022 a ^ !0
1023 }
1024 #[inline(always)]
1025 fn select_mask8x16(
1026 self,
1027 a: mask8x16<Self>,
1028 b: mask8x16<Self>,
1029 c: mask8x16<Self>,
1030 ) -> mask8x16<Self> {
1031 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1032 }
1033 #[inline(always)]
1034 fn simd_eq_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
1035 unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
1036 }
1037 #[inline(always)]
1038 fn any_true_mask8x16(self, a: mask8x16<Self>) -> bool {
1039 unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 }
1040 }
1041 #[inline(always)]
1042 fn all_true_mask8x16(self, a: mask8x16<Self>) -> bool {
1043 unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff }
1044 }
1045 #[inline(always)]
1046 fn any_false_mask8x16(self, a: mask8x16<Self>) -> bool {
1047 unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff }
1048 }
1049 #[inline(always)]
1050 fn all_false_mask8x16(self, a: mask8x16<Self>) -> bool {
1051 unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 }
1052 }
1053 #[inline(always)]
1054 fn combine_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x32<Self> {
1055 mask8x32 {
1056 val: crate::support::Aligned256([a.val.0, b.val.0]),
1057 simd: self,
1058 }
1059 }
1060 #[inline(always)]
1061 fn splat_i16x8(self, val: i16) -> i16x8<Self> {
1062 unsafe { _mm_set1_epi16(val).simd_into(self) }
1063 }
1064 #[inline(always)]
1065 fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
1066 i16x8 {
1067 val: unsafe { core::mem::transmute_copy(&val) },
1068 simd: self,
1069 }
1070 }
1071 #[inline(always)]
1072 fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
1073 i16x8 {
1074 val: unsafe { core::mem::transmute_copy(val) },
1075 simd: self,
1076 }
1077 }
1078 #[inline(always)]
1079 fn as_array_i16x8(self, a: i16x8<Self>) -> [i16; 8usize] {
1080 unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) }
1081 }
1082 #[inline(always)]
1083 fn as_array_ref_i16x8(self, a: &i16x8<Self>) -> &[i16; 8usize] {
1084 unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) }
1085 }
1086 #[inline(always)]
1087 fn as_array_mut_i16x8(self, a: &mut i16x8<Self>) -> &mut [i16; 8usize] {
1088 unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) }
1089 }
1090 #[inline(always)]
1091 fn store_array_i16x8(self, a: i16x8<Self>, dest: &mut [i16; 8usize]) -> () {
1092 unsafe {
1093 core::ptr::copy_nonoverlapping(
1094 (&raw const a.val.0) as *const i16,
1095 dest.as_mut_ptr(),
1096 8usize,
1097 );
1098 }
1099 }
1100 #[inline(always)]
1101 fn cvt_from_bytes_i16x8(self, a: u8x16<Self>) -> i16x8<Self> {
1102 unsafe {
1103 i16x8 {
1104 val: core::mem::transmute(a.val),
1105 simd: self,
1106 }
1107 }
1108 }
1109 #[inline(always)]
1110 fn cvt_to_bytes_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
1111 unsafe {
1112 u8x16 {
1113 val: core::mem::transmute(a.val),
1114 simd: self,
1115 }
1116 }
1117 }
1118 #[inline(always)]
1119 fn slide_i16x8<const SHIFT: usize>(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1120 unsafe {
1121 if SHIFT >= 8usize {
1122 return b;
1123 }
1124 let result = dyn_alignr_128(
1125 self.cvt_to_bytes_i16x8(b).val.0,
1126 self.cvt_to_bytes_i16x8(a).val.0,
1127 SHIFT * 2usize,
1128 );
1129 self.cvt_from_bytes_i16x8(u8x16 {
1130 val: crate::support::Aligned128(result),
1131 simd: self,
1132 })
1133 }
1134 }
1135 #[inline(always)]
1136 fn slide_within_blocks_i16x8<const SHIFT: usize>(
1137 self,
1138 a: i16x8<Self>,
1139 b: i16x8<Self>,
1140 ) -> i16x8<Self> {
1141 self.slide_i16x8::<SHIFT>(a, b)
1142 }
1143 #[inline(always)]
1144 fn add_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1145 unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
1146 }
1147 #[inline(always)]
1148 fn sub_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1149 unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
1150 }
1151 #[inline(always)]
1152 fn mul_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1153 unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
1154 }
1155 #[inline(always)]
1156 fn and_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1157 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1158 }
1159 #[inline(always)]
1160 fn or_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1161 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1162 }
1163 #[inline(always)]
1164 fn xor_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1165 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1166 }
1167 #[inline(always)]
1168 fn not_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
1169 a ^ !0
1170 }
1171 #[inline(always)]
1172 fn shl_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
1173 unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1174 }
1175 #[inline(always)]
1176 fn shlv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1177 core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
1178 }
1179 #[inline(always)]
1180 fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
1181 unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1182 }
1183 #[inline(always)]
1184 fn shrv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1185 core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
1186 }
1187 #[inline(always)]
1188 fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1189 unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
1190 }
1191 #[inline(always)]
1192 fn simd_lt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1193 unsafe { _mm_cmpgt_epi16(b.into(), a.into()).simd_into(self) }
1194 }
1195 #[inline(always)]
1196 fn simd_le_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1197 unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) }
1198 }
1199 #[inline(always)]
1200 fn simd_ge_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1201 unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) }
1202 }
1203 #[inline(always)]
1204 fn simd_gt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1205 unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) }
1206 }
1207 #[inline(always)]
1208 fn zip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1209 unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
1210 }
1211 #[inline(always)]
1212 fn zip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1213 unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
1214 }
1215 #[inline(always)]
1216 fn unzip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1217 unsafe {
1218 let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1219 let t1 = _mm_shuffle_epi8(a.into(), mask);
1220 let t2 = _mm_shuffle_epi8(b.into(), mask);
1221 _mm_unpacklo_epi64(t1, t2).simd_into(self)
1222 }
1223 }
1224 #[inline(always)]
1225 fn unzip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1226 unsafe {
1227 let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1228 let t1 = _mm_shuffle_epi8(a.into(), mask);
1229 let t2 = _mm_shuffle_epi8(b.into(), mask);
1230 _mm_unpackhi_epi64(t1, t2).simd_into(self)
1231 }
1232 }
1233 #[inline(always)]
1234 fn interleave_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> (i16x8<Self>, i16x8<Self>) {
1235 (self.zip_low_i16x8(a, b), self.zip_high_i16x8(a, b))
1236 }
1237 #[inline(always)]
1238 fn deinterleave_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> (i16x8<Self>, i16x8<Self>) {
1239 (self.unzip_low_i16x8(a, b), self.unzip_high_i16x8(a, b))
1240 }
1241 #[inline(always)]
1242 fn select_i16x8(self, a: mask16x8<Self>, b: i16x8<Self>, c: i16x8<Self>) -> i16x8<Self> {
1243 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1244 }
1245 #[inline(always)]
1246 fn min_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1247 unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) }
1248 }
1249 #[inline(always)]
1250 fn max_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1251 unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) }
1252 }
1253 #[inline(always)]
1254 fn combine_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x16<Self> {
1255 i16x16 {
1256 val: crate::support::Aligned256([a.val.0, b.val.0]),
1257 simd: self,
1258 }
1259 }
1260 #[inline(always)]
1261 fn neg_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
1262 unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) }
1263 }
1264 #[inline(always)]
1265 fn reinterpret_u8_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
1266 __m128i::from(a).simd_into(self)
1267 }
1268 #[inline(always)]
1269 fn reinterpret_u32_i16x8(self, a: i16x8<Self>) -> u32x4<Self> {
1270 __m128i::from(a).simd_into(self)
1271 }
1272 #[inline(always)]
1273 fn splat_u16x8(self, val: u16) -> u16x8<Self> {
1274 unsafe { _mm_set1_epi16(val.cast_signed()).simd_into(self) }
1275 }
1276 #[inline(always)]
1277 fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
1278 u16x8 {
1279 val: unsafe { core::mem::transmute_copy(&val) },
1280 simd: self,
1281 }
1282 }
1283 #[inline(always)]
1284 fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
1285 u16x8 {
1286 val: unsafe { core::mem::transmute_copy(val) },
1287 simd: self,
1288 }
1289 }
1290 #[inline(always)]
1291 fn as_array_u16x8(self, a: u16x8<Self>) -> [u16; 8usize] {
1292 unsafe { core::mem::transmute::<__m128i, [u16; 8usize]>(a.val.0) }
1293 }
1294 #[inline(always)]
1295 fn as_array_ref_u16x8(self, a: &u16x8<Self>) -> &[u16; 8usize] {
1296 unsafe { core::mem::transmute::<&__m128i, &[u16; 8usize]>(&a.val.0) }
1297 }
1298 #[inline(always)]
1299 fn as_array_mut_u16x8(self, a: &mut u16x8<Self>) -> &mut [u16; 8usize] {
1300 unsafe { core::mem::transmute::<&mut __m128i, &mut [u16; 8usize]>(&mut a.val.0) }
1301 }
1302 #[inline(always)]
1303 fn store_array_u16x8(self, a: u16x8<Self>, dest: &mut [u16; 8usize]) -> () {
1304 unsafe {
1305 core::ptr::copy_nonoverlapping(
1306 (&raw const a.val.0) as *const u16,
1307 dest.as_mut_ptr(),
1308 8usize,
1309 );
1310 }
1311 }
1312 #[inline(always)]
1313 fn cvt_from_bytes_u16x8(self, a: u8x16<Self>) -> u16x8<Self> {
1314 unsafe {
1315 u16x8 {
1316 val: core::mem::transmute(a.val),
1317 simd: self,
1318 }
1319 }
1320 }
1321 #[inline(always)]
1322 fn cvt_to_bytes_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
1323 unsafe {
1324 u8x16 {
1325 val: core::mem::transmute(a.val),
1326 simd: self,
1327 }
1328 }
1329 }
1330 #[inline(always)]
1331 fn slide_u16x8<const SHIFT: usize>(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1332 unsafe {
1333 if SHIFT >= 8usize {
1334 return b;
1335 }
1336 let result = dyn_alignr_128(
1337 self.cvt_to_bytes_u16x8(b).val.0,
1338 self.cvt_to_bytes_u16x8(a).val.0,
1339 SHIFT * 2usize,
1340 );
1341 self.cvt_from_bytes_u16x8(u8x16 {
1342 val: crate::support::Aligned128(result),
1343 simd: self,
1344 })
1345 }
1346 }
1347 #[inline(always)]
1348 fn slide_within_blocks_u16x8<const SHIFT: usize>(
1349 self,
1350 a: u16x8<Self>,
1351 b: u16x8<Self>,
1352 ) -> u16x8<Self> {
1353 self.slide_u16x8::<SHIFT>(a, b)
1354 }
1355 #[inline(always)]
1356 fn add_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1357 unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
1358 }
1359 #[inline(always)]
1360 fn sub_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1361 unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
1362 }
1363 #[inline(always)]
1364 fn mul_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1365 unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
1366 }
1367 #[inline(always)]
1368 fn and_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1369 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1370 }
1371 #[inline(always)]
1372 fn or_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1373 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1374 }
1375 #[inline(always)]
1376 fn xor_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1377 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1378 }
1379 #[inline(always)]
1380 fn not_u16x8(self, a: u16x8<Self>) -> u16x8<Self> {
1381 a ^ !0
1382 }
1383 #[inline(always)]
1384 fn shl_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
1385 unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1386 }
1387 #[inline(always)]
1388 fn shlv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1389 core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
1390 }
1391 #[inline(always)]
1392 fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
1393 unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1394 }
1395 #[inline(always)]
1396 fn shrv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1397 core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
1398 }
1399 #[inline(always)]
1400 fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1401 unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
1402 }
1403 #[inline(always)]
1404 fn simd_lt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1405 unsafe {
1406 let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed());
1407 let a_signed = _mm_xor_si128(a.into(), sign_bit);
1408 let b_signed = _mm_xor_si128(b.into(), sign_bit);
1409 _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self)
1410 }
1411 }
1412 #[inline(always)]
1413 fn simd_le_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1414 unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) }
1415 }
1416 #[inline(always)]
1417 fn simd_ge_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1418 unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) }
1419 }
1420 #[inline(always)]
1421 fn simd_gt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1422 unsafe {
1423 let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed());
1424 let a_signed = _mm_xor_si128(a.into(), sign_bit);
1425 let b_signed = _mm_xor_si128(b.into(), sign_bit);
1426 _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self)
1427 }
1428 }
1429 #[inline(always)]
1430 fn zip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1431 unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
1432 }
1433 #[inline(always)]
1434 fn zip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1435 unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
1436 }
1437 #[inline(always)]
1438 fn unzip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1439 unsafe {
1440 let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1441 let t1 = _mm_shuffle_epi8(a.into(), mask);
1442 let t2 = _mm_shuffle_epi8(b.into(), mask);
1443 _mm_unpacklo_epi64(t1, t2).simd_into(self)
1444 }
1445 }
1446 #[inline(always)]
1447 fn unzip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1448 unsafe {
1449 let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1450 let t1 = _mm_shuffle_epi8(a.into(), mask);
1451 let t2 = _mm_shuffle_epi8(b.into(), mask);
1452 _mm_unpackhi_epi64(t1, t2).simd_into(self)
1453 }
1454 }
1455 #[inline(always)]
1456 fn interleave_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> (u16x8<Self>, u16x8<Self>) {
1457 (self.zip_low_u16x8(a, b), self.zip_high_u16x8(a, b))
1458 }
1459 #[inline(always)]
1460 fn deinterleave_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> (u16x8<Self>, u16x8<Self>) {
1461 (self.unzip_low_u16x8(a, b), self.unzip_high_u16x8(a, b))
1462 }
1463 #[inline(always)]
1464 fn select_u16x8(self, a: mask16x8<Self>, b: u16x8<Self>, c: u16x8<Self>) -> u16x8<Self> {
1465 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1466 }
1467 #[inline(always)]
1468 fn min_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1469 unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) }
1470 }
1471 #[inline(always)]
1472 fn max_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1473 unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) }
1474 }
1475 #[inline(always)]
1476 fn combine_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x16<Self> {
1477 u16x16 {
1478 val: crate::support::Aligned256([a.val.0, b.val.0]),
1479 simd: self,
1480 }
1481 }
1482 #[inline(always)]
1483 fn reinterpret_u8_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
1484 __m128i::from(a).simd_into(self)
1485 }
1486 #[inline(always)]
1487 fn reinterpret_u32_u16x8(self, a: u16x8<Self>) -> u32x4<Self> {
1488 __m128i::from(a).simd_into(self)
1489 }
1490 #[inline(always)]
1491 fn splat_mask16x8(self, val: i16) -> mask16x8<Self> {
1492 unsafe { _mm_set1_epi16(val).simd_into(self) }
1493 }
1494 #[inline(always)]
1495 fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
1496 mask16x8 {
1497 val: unsafe { core::mem::transmute_copy(&val) },
1498 simd: self,
1499 }
1500 }
1501 #[inline(always)]
1502 fn load_array_ref_mask16x8(self, val: &[i16; 8usize]) -> mask16x8<Self> {
1503 mask16x8 {
1504 val: unsafe { core::mem::transmute_copy(val) },
1505 simd: self,
1506 }
1507 }
1508 #[inline(always)]
1509 fn as_array_mask16x8(self, a: mask16x8<Self>) -> [i16; 8usize] {
1510 unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) }
1511 }
1512 #[inline(always)]
1513 fn as_array_ref_mask16x8(self, a: &mask16x8<Self>) -> &[i16; 8usize] {
1514 unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) }
1515 }
1516 #[inline(always)]
1517 fn as_array_mut_mask16x8(self, a: &mut mask16x8<Self>) -> &mut [i16; 8usize] {
1518 unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) }
1519 }
1520 #[inline(always)]
1521 fn store_array_mask16x8(self, a: mask16x8<Self>, dest: &mut [i16; 8usize]) -> () {
1522 unsafe {
1523 core::ptr::copy_nonoverlapping(
1524 (&raw const a.val.0) as *const i16,
1525 dest.as_mut_ptr(),
1526 8usize,
1527 );
1528 }
1529 }
1530 #[inline(always)]
1531 fn cvt_from_bytes_mask16x8(self, a: u8x16<Self>) -> mask16x8<Self> {
1532 unsafe {
1533 mask16x8 {
1534 val: core::mem::transmute(a.val),
1535 simd: self,
1536 }
1537 }
1538 }
1539 #[inline(always)]
1540 fn cvt_to_bytes_mask16x8(self, a: mask16x8<Self>) -> u8x16<Self> {
1541 unsafe {
1542 u8x16 {
1543 val: core::mem::transmute(a.val),
1544 simd: self,
1545 }
1546 }
1547 }
1548 #[inline(always)]
1549 fn slide_mask16x8<const SHIFT: usize>(
1550 self,
1551 a: mask16x8<Self>,
1552 b: mask16x8<Self>,
1553 ) -> mask16x8<Self> {
1554 unsafe {
1555 if SHIFT >= 8usize {
1556 return b;
1557 }
1558 let result = dyn_alignr_128(
1559 self.cvt_to_bytes_mask16x8(b).val.0,
1560 self.cvt_to_bytes_mask16x8(a).val.0,
1561 SHIFT * 2usize,
1562 );
1563 self.cvt_from_bytes_mask16x8(u8x16 {
1564 val: crate::support::Aligned128(result),
1565 simd: self,
1566 })
1567 }
1568 }
1569 #[inline(always)]
1570 fn slide_within_blocks_mask16x8<const SHIFT: usize>(
1571 self,
1572 a: mask16x8<Self>,
1573 b: mask16x8<Self>,
1574 ) -> mask16x8<Self> {
1575 self.slide_mask16x8::<SHIFT>(a, b)
1576 }
1577 #[inline(always)]
1578 fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1579 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1580 }
1581 #[inline(always)]
1582 fn or_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1583 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1584 }
1585 #[inline(always)]
1586 fn xor_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1587 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1588 }
1589 #[inline(always)]
1590 fn not_mask16x8(self, a: mask16x8<Self>) -> mask16x8<Self> {
1591 a ^ !0
1592 }
1593 #[inline(always)]
1594 fn select_mask16x8(
1595 self,
1596 a: mask16x8<Self>,
1597 b: mask16x8<Self>,
1598 c: mask16x8<Self>,
1599 ) -> mask16x8<Self> {
1600 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1601 }
1602 #[inline(always)]
1603 fn simd_eq_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1604 unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
1605 }
1606 #[inline(always)]
1607 fn any_true_mask16x8(self, a: mask16x8<Self>) -> bool {
1608 unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 }
1609 }
1610 #[inline(always)]
1611 fn all_true_mask16x8(self, a: mask16x8<Self>) -> bool {
1612 unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff }
1613 }
1614 #[inline(always)]
1615 fn any_false_mask16x8(self, a: mask16x8<Self>) -> bool {
1616 unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff }
1617 }
1618 #[inline(always)]
1619 fn all_false_mask16x8(self, a: mask16x8<Self>) -> bool {
1620 unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 }
1621 }
1622 #[inline(always)]
1623 fn combine_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x16<Self> {
1624 mask16x16 {
1625 val: crate::support::Aligned256([a.val.0, b.val.0]),
1626 simd: self,
1627 }
1628 }
1629 #[inline(always)]
1630 fn splat_i32x4(self, val: i32) -> i32x4<Self> {
1631 unsafe { _mm_set1_epi32(val).simd_into(self) }
1632 }
1633 #[inline(always)]
1634 fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
1635 i32x4 {
1636 val: unsafe { core::mem::transmute_copy(&val) },
1637 simd: self,
1638 }
1639 }
1640 #[inline(always)]
1641 fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
1642 i32x4 {
1643 val: unsafe { core::mem::transmute_copy(val) },
1644 simd: self,
1645 }
1646 }
1647 #[inline(always)]
1648 fn as_array_i32x4(self, a: i32x4<Self>) -> [i32; 4usize] {
1649 unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) }
1650 }
1651 #[inline(always)]
1652 fn as_array_ref_i32x4(self, a: &i32x4<Self>) -> &[i32; 4usize] {
1653 unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) }
1654 }
1655 #[inline(always)]
1656 fn as_array_mut_i32x4(self, a: &mut i32x4<Self>) -> &mut [i32; 4usize] {
1657 unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) }
1658 }
1659 #[inline(always)]
1660 fn store_array_i32x4(self, a: i32x4<Self>, dest: &mut [i32; 4usize]) -> () {
1661 unsafe {
1662 core::ptr::copy_nonoverlapping(
1663 (&raw const a.val.0) as *const i32,
1664 dest.as_mut_ptr(),
1665 4usize,
1666 );
1667 }
1668 }
1669 #[inline(always)]
1670 fn cvt_from_bytes_i32x4(self, a: u8x16<Self>) -> i32x4<Self> {
1671 unsafe {
1672 i32x4 {
1673 val: core::mem::transmute(a.val),
1674 simd: self,
1675 }
1676 }
1677 }
1678 #[inline(always)]
1679 fn cvt_to_bytes_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
1680 unsafe {
1681 u8x16 {
1682 val: core::mem::transmute(a.val),
1683 simd: self,
1684 }
1685 }
1686 }
1687 #[inline(always)]
1688 fn slide_i32x4<const SHIFT: usize>(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1689 unsafe {
1690 if SHIFT >= 4usize {
1691 return b;
1692 }
1693 let result = dyn_alignr_128(
1694 self.cvt_to_bytes_i32x4(b).val.0,
1695 self.cvt_to_bytes_i32x4(a).val.0,
1696 SHIFT * 4usize,
1697 );
1698 self.cvt_from_bytes_i32x4(u8x16 {
1699 val: crate::support::Aligned128(result),
1700 simd: self,
1701 })
1702 }
1703 }
1704 #[inline(always)]
1705 fn slide_within_blocks_i32x4<const SHIFT: usize>(
1706 self,
1707 a: i32x4<Self>,
1708 b: i32x4<Self>,
1709 ) -> i32x4<Self> {
1710 self.slide_i32x4::<SHIFT>(a, b)
1711 }
1712 #[inline(always)]
1713 fn add_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1714 unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
1715 }
1716 #[inline(always)]
1717 fn sub_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1718 unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
1719 }
1720 #[inline(always)]
1721 fn mul_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1722 unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
1723 }
1724 #[inline(always)]
1725 fn and_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1726 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1727 }
1728 #[inline(always)]
1729 fn or_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1730 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1731 }
1732 #[inline(always)]
1733 fn xor_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1734 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1735 }
1736 #[inline(always)]
1737 fn not_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
1738 a ^ !0
1739 }
1740 #[inline(always)]
1741 fn shl_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
1742 unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1743 }
1744 #[inline(always)]
1745 fn shlv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1746 core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
1747 }
1748 #[inline(always)]
1749 fn shr_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
1750 unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1751 }
1752 #[inline(always)]
1753 fn shrv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1754 core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
1755 }
1756 #[inline(always)]
1757 fn simd_eq_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1758 unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
1759 }
1760 #[inline(always)]
1761 fn simd_lt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1762 unsafe { _mm_cmpgt_epi32(b.into(), a.into()).simd_into(self) }
1763 }
1764 #[inline(always)]
1765 fn simd_le_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1766 unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) }
1767 }
1768 #[inline(always)]
1769 fn simd_ge_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1770 unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) }
1771 }
1772 #[inline(always)]
1773 fn simd_gt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1774 unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) }
1775 }
1776 #[inline(always)]
1777 fn zip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1778 unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
1779 }
1780 #[inline(always)]
1781 fn zip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1782 unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
1783 }
1784 #[inline(always)]
1785 fn unzip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1786 unsafe {
1787 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1788 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1789 _mm_unpacklo_epi64(t1, t2).simd_into(self)
1790 }
1791 }
1792 #[inline(always)]
1793 fn unzip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1794 unsafe {
1795 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1796 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1797 _mm_unpackhi_epi64(t1, t2).simd_into(self)
1798 }
1799 }
1800 #[inline(always)]
1801 fn interleave_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> (i32x4<Self>, i32x4<Self>) {
1802 (self.zip_low_i32x4(a, b), self.zip_high_i32x4(a, b))
1803 }
1804 #[inline(always)]
1805 fn deinterleave_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> (i32x4<Self>, i32x4<Self>) {
1806 (self.unzip_low_i32x4(a, b), self.unzip_high_i32x4(a, b))
1807 }
1808 #[inline(always)]
1809 fn select_i32x4(self, a: mask32x4<Self>, b: i32x4<Self>, c: i32x4<Self>) -> i32x4<Self> {
1810 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1811 }
1812 #[inline(always)]
1813 fn min_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1814 unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) }
1815 }
1816 #[inline(always)]
1817 fn max_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1818 unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) }
1819 }
1820 #[inline(always)]
1821 fn combine_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x8<Self> {
1822 i32x8 {
1823 val: crate::support::Aligned256([a.val.0, b.val.0]),
1824 simd: self,
1825 }
1826 }
1827 #[inline(always)]
1828 fn neg_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
1829 unsafe { _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(self) }
1830 }
1831 #[inline(always)]
1832 fn reinterpret_u8_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
1833 __m128i::from(a).simd_into(self)
1834 }
1835 #[inline(always)]
1836 fn reinterpret_u32_i32x4(self, a: i32x4<Self>) -> u32x4<Self> {
1837 __m128i::from(a).simd_into(self)
1838 }
1839 #[inline(always)]
1840 fn cvt_f32_i32x4(self, a: i32x4<Self>) -> f32x4<Self> {
1841 unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) }
1842 }
1843 #[inline(always)]
1844 fn splat_u32x4(self, val: u32) -> u32x4<Self> {
1845 unsafe { _mm_set1_epi32(val.cast_signed()).simd_into(self) }
1846 }
1847 #[inline(always)]
1848 fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
1849 u32x4 {
1850 val: unsafe { core::mem::transmute_copy(&val) },
1851 simd: self,
1852 }
1853 }
1854 #[inline(always)]
1855 fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
1856 u32x4 {
1857 val: unsafe { core::mem::transmute_copy(val) },
1858 simd: self,
1859 }
1860 }
1861 #[inline(always)]
1862 fn as_array_u32x4(self, a: u32x4<Self>) -> [u32; 4usize] {
1863 unsafe { core::mem::transmute::<__m128i, [u32; 4usize]>(a.val.0) }
1864 }
1865 #[inline(always)]
1866 fn as_array_ref_u32x4(self, a: &u32x4<Self>) -> &[u32; 4usize] {
1867 unsafe { core::mem::transmute::<&__m128i, &[u32; 4usize]>(&a.val.0) }
1868 }
1869 #[inline(always)]
1870 fn as_array_mut_u32x4(self, a: &mut u32x4<Self>) -> &mut [u32; 4usize] {
1871 unsafe { core::mem::transmute::<&mut __m128i, &mut [u32; 4usize]>(&mut a.val.0) }
1872 }
1873 #[inline(always)]
1874 fn store_array_u32x4(self, a: u32x4<Self>, dest: &mut [u32; 4usize]) -> () {
1875 unsafe {
1876 core::ptr::copy_nonoverlapping(
1877 (&raw const a.val.0) as *const u32,
1878 dest.as_mut_ptr(),
1879 4usize,
1880 );
1881 }
1882 }
1883 #[inline(always)]
1884 fn cvt_from_bytes_u32x4(self, a: u8x16<Self>) -> u32x4<Self> {
1885 unsafe {
1886 u32x4 {
1887 val: core::mem::transmute(a.val),
1888 simd: self,
1889 }
1890 }
1891 }
1892 #[inline(always)]
1893 fn cvt_to_bytes_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
1894 unsafe {
1895 u8x16 {
1896 val: core::mem::transmute(a.val),
1897 simd: self,
1898 }
1899 }
1900 }
1901 #[inline(always)]
1902 fn slide_u32x4<const SHIFT: usize>(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1903 unsafe {
1904 if SHIFT >= 4usize {
1905 return b;
1906 }
1907 let result = dyn_alignr_128(
1908 self.cvt_to_bytes_u32x4(b).val.0,
1909 self.cvt_to_bytes_u32x4(a).val.0,
1910 SHIFT * 4usize,
1911 );
1912 self.cvt_from_bytes_u32x4(u8x16 {
1913 val: crate::support::Aligned128(result),
1914 simd: self,
1915 })
1916 }
1917 }
1918 #[inline(always)]
1919 fn slide_within_blocks_u32x4<const SHIFT: usize>(
1920 self,
1921 a: u32x4<Self>,
1922 b: u32x4<Self>,
1923 ) -> u32x4<Self> {
1924 self.slide_u32x4::<SHIFT>(a, b)
1925 }
1926 #[inline(always)]
1927 fn add_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1928 unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
1929 }
1930 #[inline(always)]
1931 fn sub_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1932 unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
1933 }
1934 #[inline(always)]
1935 fn mul_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1936 unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
1937 }
1938 #[inline(always)]
1939 fn and_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1940 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1941 }
1942 #[inline(always)]
1943 fn or_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1944 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1945 }
1946 #[inline(always)]
1947 fn xor_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1948 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1949 }
1950 #[inline(always)]
1951 fn not_u32x4(self, a: u32x4<Self>) -> u32x4<Self> {
1952 a ^ !0
1953 }
1954 #[inline(always)]
1955 fn shl_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
1956 unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1957 }
1958 #[inline(always)]
1959 fn shlv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1960 core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
1961 }
1962 #[inline(always)]
1963 fn shr_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
1964 unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1965 }
1966 #[inline(always)]
1967 fn shrv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1968 core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
1969 }
1970 #[inline(always)]
1971 fn simd_eq_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1972 unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
1973 }
1974 #[inline(always)]
1975 fn simd_lt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1976 unsafe {
1977 let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed());
1978 let a_signed = _mm_xor_si128(a.into(), sign_bit);
1979 let b_signed = _mm_xor_si128(b.into(), sign_bit);
1980 _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self)
1981 }
1982 }
1983 #[inline(always)]
1984 fn simd_le_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1985 unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1986 }
1987 #[inline(always)]
1988 fn simd_ge_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1989 unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1990 }
1991 #[inline(always)]
1992 fn simd_gt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1993 unsafe {
1994 let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed());
1995 let a_signed = _mm_xor_si128(a.into(), sign_bit);
1996 let b_signed = _mm_xor_si128(b.into(), sign_bit);
1997 _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self)
1998 }
1999 }
2000 #[inline(always)]
2001 fn zip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
2002 unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
2003 }
2004 #[inline(always)]
2005 fn zip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
2006 unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
2007 }
2008 #[inline(always)]
2009 fn unzip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
2010 unsafe {
2011 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
2012 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
2013 _mm_unpacklo_epi64(t1, t2).simd_into(self)
2014 }
2015 }
2016 #[inline(always)]
2017 fn unzip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
2018 unsafe {
2019 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
2020 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
2021 _mm_unpackhi_epi64(t1, t2).simd_into(self)
2022 }
2023 }
2024 #[inline(always)]
2025 fn interleave_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> (u32x4<Self>, u32x4<Self>) {
2026 (self.zip_low_u32x4(a, b), self.zip_high_u32x4(a, b))
2027 }
2028 #[inline(always)]
2029 fn deinterleave_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> (u32x4<Self>, u32x4<Self>) {
2030 (self.unzip_low_u32x4(a, b), self.unzip_high_u32x4(a, b))
2031 }
2032 #[inline(always)]
2033 fn select_u32x4(self, a: mask32x4<Self>, b: u32x4<Self>, c: u32x4<Self>) -> u32x4<Self> {
2034 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
2035 }
2036 #[inline(always)]
2037 fn min_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
2038 unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) }
2039 }
2040 #[inline(always)]
2041 fn max_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
2042 unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) }
2043 }
2044 #[inline(always)]
2045 fn combine_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x8<Self> {
2046 u32x8 {
2047 val: crate::support::Aligned256([a.val.0, b.val.0]),
2048 simd: self,
2049 }
2050 }
2051 #[inline(always)]
2052 fn reinterpret_u8_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
2053 __m128i::from(a).simd_into(self)
2054 }
2055 #[inline(always)]
2056 fn cvt_f32_u32x4(self, a: u32x4<Self>) -> f32x4<Self> {
2057 unsafe {
2058 let a = a.into();
2059 let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000));
2060 let hi = _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000));
2061 let fhi = _mm_sub_ps(
2062 _mm_castsi128_ps(hi),
2063 _mm_set1_ps(f32::from_bits(0x53000080)),
2064 );
2065 let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi);
2066 result.simd_into(self)
2067 }
2068 }
2069 #[inline(always)]
2070 fn splat_mask32x4(self, val: i32) -> mask32x4<Self> {
2071 unsafe { _mm_set1_epi32(val).simd_into(self) }
2072 }
2073 #[inline(always)]
2074 fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
2075 mask32x4 {
2076 val: unsafe { core::mem::transmute_copy(&val) },
2077 simd: self,
2078 }
2079 }
2080 #[inline(always)]
2081 fn load_array_ref_mask32x4(self, val: &[i32; 4usize]) -> mask32x4<Self> {
2082 mask32x4 {
2083 val: unsafe { core::mem::transmute_copy(val) },
2084 simd: self,
2085 }
2086 }
2087 #[inline(always)]
2088 fn as_array_mask32x4(self, a: mask32x4<Self>) -> [i32; 4usize] {
2089 unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) }
2090 }
2091 #[inline(always)]
2092 fn as_array_ref_mask32x4(self, a: &mask32x4<Self>) -> &[i32; 4usize] {
2093 unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) }
2094 }
2095 #[inline(always)]
2096 fn as_array_mut_mask32x4(self, a: &mut mask32x4<Self>) -> &mut [i32; 4usize] {
2097 unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) }
2098 }
2099 #[inline(always)]
2100 fn store_array_mask32x4(self, a: mask32x4<Self>, dest: &mut [i32; 4usize]) -> () {
2101 unsafe {
2102 core::ptr::copy_nonoverlapping(
2103 (&raw const a.val.0) as *const i32,
2104 dest.as_mut_ptr(),
2105 4usize,
2106 );
2107 }
2108 }
2109 #[inline(always)]
2110 fn cvt_from_bytes_mask32x4(self, a: u8x16<Self>) -> mask32x4<Self> {
2111 unsafe {
2112 mask32x4 {
2113 val: core::mem::transmute(a.val),
2114 simd: self,
2115 }
2116 }
2117 }
2118 #[inline(always)]
2119 fn cvt_to_bytes_mask32x4(self, a: mask32x4<Self>) -> u8x16<Self> {
2120 unsafe {
2121 u8x16 {
2122 val: core::mem::transmute(a.val),
2123 simd: self,
2124 }
2125 }
2126 }
2127 #[inline(always)]
2128 fn slide_mask32x4<const SHIFT: usize>(
2129 self,
2130 a: mask32x4<Self>,
2131 b: mask32x4<Self>,
2132 ) -> mask32x4<Self> {
2133 unsafe {
2134 if SHIFT >= 4usize {
2135 return b;
2136 }
2137 let result = dyn_alignr_128(
2138 self.cvt_to_bytes_mask32x4(b).val.0,
2139 self.cvt_to_bytes_mask32x4(a).val.0,
2140 SHIFT * 4usize,
2141 );
2142 self.cvt_from_bytes_mask32x4(u8x16 {
2143 val: crate::support::Aligned128(result),
2144 simd: self,
2145 })
2146 }
2147 }
2148 #[inline(always)]
2149 fn slide_within_blocks_mask32x4<const SHIFT: usize>(
2150 self,
2151 a: mask32x4<Self>,
2152 b: mask32x4<Self>,
2153 ) -> mask32x4<Self> {
2154 self.slide_mask32x4::<SHIFT>(a, b)
2155 }
2156 #[inline(always)]
2157 fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2158 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
2159 }
2160 #[inline(always)]
2161 fn or_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2162 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
2163 }
2164 #[inline(always)]
2165 fn xor_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2166 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
2167 }
2168 #[inline(always)]
2169 fn not_mask32x4(self, a: mask32x4<Self>) -> mask32x4<Self> {
2170 a ^ !0
2171 }
2172 #[inline(always)]
2173 fn select_mask32x4(
2174 self,
2175 a: mask32x4<Self>,
2176 b: mask32x4<Self>,
2177 c: mask32x4<Self>,
2178 ) -> mask32x4<Self> {
2179 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
2180 }
2181 #[inline(always)]
2182 fn simd_eq_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2183 unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
2184 }
2185 #[inline(always)]
2186 fn any_true_mask32x4(self, a: mask32x4<Self>) -> bool {
2187 unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0 }
2188 }
2189 #[inline(always)]
2190 fn all_true_mask32x4(self, a: mask32x4<Self>) -> bool {
2191 unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0b1111 }
2192 }
2193 #[inline(always)]
2194 fn any_false_mask32x4(self, a: mask32x4<Self>) -> bool {
2195 unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0b1111 }
2196 }
2197 #[inline(always)]
2198 fn all_false_mask32x4(self, a: mask32x4<Self>) -> bool {
2199 unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0 }
2200 }
2201 #[inline(always)]
2202 fn combine_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x8<Self> {
2203 mask32x8 {
2204 val: crate::support::Aligned256([a.val.0, b.val.0]),
2205 simd: self,
2206 }
2207 }
2208 #[inline(always)]
2209 fn splat_f64x2(self, val: f64) -> f64x2<Self> {
2210 unsafe { _mm_set1_pd(val).simd_into(self) }
2211 }
2212 #[inline(always)]
2213 fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
2214 f64x2 {
2215 val: unsafe { core::mem::transmute_copy(&val) },
2216 simd: self,
2217 }
2218 }
2219 #[inline(always)]
2220 fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
2221 f64x2 {
2222 val: unsafe { core::mem::transmute_copy(val) },
2223 simd: self,
2224 }
2225 }
2226 #[inline(always)]
2227 fn as_array_f64x2(self, a: f64x2<Self>) -> [f64; 2usize] {
2228 unsafe { core::mem::transmute::<__m128d, [f64; 2usize]>(a.val.0) }
2229 }
2230 #[inline(always)]
2231 fn as_array_ref_f64x2(self, a: &f64x2<Self>) -> &[f64; 2usize] {
2232 unsafe { core::mem::transmute::<&__m128d, &[f64; 2usize]>(&a.val.0) }
2233 }
2234 #[inline(always)]
2235 fn as_array_mut_f64x2(self, a: &mut f64x2<Self>) -> &mut [f64; 2usize] {
2236 unsafe { core::mem::transmute::<&mut __m128d, &mut [f64; 2usize]>(&mut a.val.0) }
2237 }
2238 #[inline(always)]
2239 fn store_array_f64x2(self, a: f64x2<Self>, dest: &mut [f64; 2usize]) -> () {
2240 unsafe {
2241 core::ptr::copy_nonoverlapping(
2242 (&raw const a.val.0) as *const f64,
2243 dest.as_mut_ptr(),
2244 2usize,
2245 );
2246 }
2247 }
2248 #[inline(always)]
2249 fn cvt_from_bytes_f64x2(self, a: u8x16<Self>) -> f64x2<Self> {
2250 unsafe {
2251 f64x2 {
2252 val: core::mem::transmute(a.val),
2253 simd: self,
2254 }
2255 }
2256 }
2257 #[inline(always)]
2258 fn cvt_to_bytes_f64x2(self, a: f64x2<Self>) -> u8x16<Self> {
2259 unsafe {
2260 u8x16 {
2261 val: core::mem::transmute(a.val),
2262 simd: self,
2263 }
2264 }
2265 }
2266 #[inline(always)]
2267 fn slide_f64x2<const SHIFT: usize>(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2268 unsafe {
2269 if SHIFT >= 2usize {
2270 return b;
2271 }
2272 let result = dyn_alignr_128(
2273 self.cvt_to_bytes_f64x2(b).val.0,
2274 self.cvt_to_bytes_f64x2(a).val.0,
2275 SHIFT * 8usize,
2276 );
2277 self.cvt_from_bytes_f64x2(u8x16 {
2278 val: crate::support::Aligned128(result),
2279 simd: self,
2280 })
2281 }
2282 }
2283 #[inline(always)]
2284 fn slide_within_blocks_f64x2<const SHIFT: usize>(
2285 self,
2286 a: f64x2<Self>,
2287 b: f64x2<Self>,
2288 ) -> f64x2<Self> {
2289 self.slide_f64x2::<SHIFT>(a, b)
2290 }
2291 #[inline(always)]
2292 fn abs_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2293 unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) }
2294 }
2295 #[inline(always)]
2296 fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2297 unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) }
2298 }
2299 #[inline(always)]
2300 fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2301 unsafe { _mm_sqrt_pd(a.into()).simd_into(self) }
2302 }
2303 #[inline(always)]
2304 fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2305 unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) }
2306 }
2307 #[inline(always)]
2308 fn sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2309 unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) }
2310 }
2311 #[inline(always)]
2312 fn mul_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2313 unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) }
2314 }
2315 #[inline(always)]
2316 fn div_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2317 unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) }
2318 }
2319 #[inline(always)]
2320 fn copysign_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2321 unsafe {
2322 let mask = _mm_set1_pd(-0.0);
2323 _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self)
2324 }
2325 }
2326 #[inline(always)]
2327 fn simd_eq_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2328 unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) }
2329 }
2330 #[inline(always)]
2331 fn simd_lt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2332 unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) }
2333 }
2334 #[inline(always)]
2335 fn simd_le_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2336 unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) }
2337 }
2338 #[inline(always)]
2339 fn simd_ge_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2340 unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) }
2341 }
2342 #[inline(always)]
2343 fn simd_gt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2344 unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) }
2345 }
2346 #[inline(always)]
2347 fn zip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2348 unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) }
2349 }
2350 #[inline(always)]
2351 fn zip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2352 unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) }
2353 }
2354 #[inline(always)]
2355 fn unzip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2356 unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) }
2357 }
2358 #[inline(always)]
2359 fn unzip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2360 unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) }
2361 }
2362 #[inline(always)]
2363 fn interleave_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> (f64x2<Self>, f64x2<Self>) {
2364 (self.zip_low_f64x2(a, b), self.zip_high_f64x2(a, b))
2365 }
2366 #[inline(always)]
2367 fn deinterleave_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> (f64x2<Self>, f64x2<Self>) {
2368 (self.unzip_low_f64x2(a, b), self.unzip_high_f64x2(a, b))
2369 }
2370 #[inline(always)]
2371 fn max_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2372 unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) }
2373 }
2374 #[inline(always)]
2375 fn min_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2376 unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) }
2377 }
2378 #[inline(always)]
2379 fn max_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2380 unsafe {
2381 let intermediate = _mm_max_pd(a.into(), b.into());
2382 let b_is_nan = _mm_cmpunord_pd(b.into(), b.into());
2383 _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self)
2384 }
2385 }
2386 #[inline(always)]
2387 fn min_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2388 unsafe {
2389 let intermediate = _mm_min_pd(a.into(), b.into());
2390 let b_is_nan = _mm_cmpunord_pd(b.into(), b.into());
2391 _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self)
2392 }
2393 }
2394 #[inline(always)]
2395 fn mul_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
2396 a * b + c
2397 }
2398 #[inline(always)]
2399 fn mul_sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
2400 a * b - c
2401 }
2402 #[inline(always)]
2403 fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2404 unsafe {
2405 _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
2406 }
2407 }
2408 #[inline(always)]
2409 fn ceil_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2410 unsafe {
2411 _mm_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
2412 }
2413 }
2414 #[inline(always)]
2415 fn round_ties_even_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2416 unsafe {
2417 _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
2418 .simd_into(self)
2419 }
2420 }
2421 #[inline(always)]
2422 fn fract_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2423 a - self.trunc_f64x2(a)
2424 }
2425 #[inline(always)]
2426 fn trunc_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2427 unsafe {
2428 _mm_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
2429 }
2430 }
2431 #[inline(always)]
2432 fn select_f64x2(self, a: mask64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
2433 unsafe { _mm_blendv_pd(c.into(), b.into(), _mm_castsi128_pd(a.into())).simd_into(self) }
2434 }
2435 #[inline(always)]
2436 fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self> {
2437 f64x4 {
2438 val: crate::support::Aligned256([a.val.0, b.val.0]),
2439 simd: self,
2440 }
2441 }
2442 #[inline(always)]
2443 fn reinterpret_f32_f64x2(self, a: f64x2<Self>) -> f32x4<Self> {
2444 unsafe { _mm_castpd_ps(a.into()).simd_into(self) }
2445 }
2446 #[inline(always)]
2447 fn splat_mask64x2(self, val: i64) -> mask64x2<Self> {
2448 unsafe { _mm_set1_epi64x(val).simd_into(self) }
2449 }
2450 #[inline(always)]
2451 fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
2452 mask64x2 {
2453 val: unsafe { core::mem::transmute_copy(&val) },
2454 simd: self,
2455 }
2456 }
2457 #[inline(always)]
2458 fn load_array_ref_mask64x2(self, val: &[i64; 2usize]) -> mask64x2<Self> {
2459 mask64x2 {
2460 val: unsafe { core::mem::transmute_copy(val) },
2461 simd: self,
2462 }
2463 }
2464 #[inline(always)]
2465 fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
2466 unsafe { core::mem::transmute::<__m128i, [i64; 2usize]>(a.val.0) }
2467 }
2468 #[inline(always)]
2469 fn as_array_ref_mask64x2(self, a: &mask64x2<Self>) -> &[i64; 2usize] {
2470 unsafe { core::mem::transmute::<&__m128i, &[i64; 2usize]>(&a.val.0) }
2471 }
2472 #[inline(always)]
2473 fn as_array_mut_mask64x2(self, a: &mut mask64x2<Self>) -> &mut [i64; 2usize] {
2474 unsafe { core::mem::transmute::<&mut __m128i, &mut [i64; 2usize]>(&mut a.val.0) }
2475 }
2476 #[inline(always)]
2477 fn store_array_mask64x2(self, a: mask64x2<Self>, dest: &mut [i64; 2usize]) -> () {
2478 unsafe {
2479 core::ptr::copy_nonoverlapping(
2480 (&raw const a.val.0) as *const i64,
2481 dest.as_mut_ptr(),
2482 2usize,
2483 );
2484 }
2485 }
2486 #[inline(always)]
2487 fn cvt_from_bytes_mask64x2(self, a: u8x16<Self>) -> mask64x2<Self> {
2488 unsafe {
2489 mask64x2 {
2490 val: core::mem::transmute(a.val),
2491 simd: self,
2492 }
2493 }
2494 }
2495 #[inline(always)]
2496 fn cvt_to_bytes_mask64x2(self, a: mask64x2<Self>) -> u8x16<Self> {
2497 unsafe {
2498 u8x16 {
2499 val: core::mem::transmute(a.val),
2500 simd: self,
2501 }
2502 }
2503 }
2504 #[inline(always)]
2505 fn slide_mask64x2<const SHIFT: usize>(
2506 self,
2507 a: mask64x2<Self>,
2508 b: mask64x2<Self>,
2509 ) -> mask64x2<Self> {
2510 unsafe {
2511 if SHIFT >= 2usize {
2512 return b;
2513 }
2514 let result = dyn_alignr_128(
2515 self.cvt_to_bytes_mask64x2(b).val.0,
2516 self.cvt_to_bytes_mask64x2(a).val.0,
2517 SHIFT * 8usize,
2518 );
2519 self.cvt_from_bytes_mask64x2(u8x16 {
2520 val: crate::support::Aligned128(result),
2521 simd: self,
2522 })
2523 }
2524 }
2525 #[inline(always)]
2526 fn slide_within_blocks_mask64x2<const SHIFT: usize>(
2527 self,
2528 a: mask64x2<Self>,
2529 b: mask64x2<Self>,
2530 ) -> mask64x2<Self> {
2531 self.slide_mask64x2::<SHIFT>(a, b)
2532 }
2533 #[inline(always)]
2534 fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2535 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
2536 }
2537 #[inline(always)]
2538 fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2539 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
2540 }
2541 #[inline(always)]
2542 fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2543 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
2544 }
2545 #[inline(always)]
2546 fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
2547 a ^ !0
2548 }
2549 #[inline(always)]
2550 fn select_mask64x2(
2551 self,
2552 a: mask64x2<Self>,
2553 b: mask64x2<Self>,
2554 c: mask64x2<Self>,
2555 ) -> mask64x2<Self> {
2556 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
2557 }
2558 #[inline(always)]
2559 fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2560 unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) }
2561 }
2562 #[inline(always)]
2563 fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
2564 unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 }
2565 }
2566 #[inline(always)]
2567 fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
2568 unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 }
2569 }
2570 #[inline(always)]
2571 fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
2572 unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 }
2573 }
2574 #[inline(always)]
2575 fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
2576 unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 }
2577 }
2578 #[inline(always)]
2579 fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
2580 mask64x4 {
2581 val: crate::support::Aligned256([a.val.0, b.val.0]),
2582 simd: self,
2583 }
2584 }
2585 #[inline(always)]
2586 fn splat_f32x8(self, val: f32) -> f32x8<Self> {
2587 let half = self.splat_f32x4(val);
2588 self.combine_f32x4(half, half)
2589 }
2590 #[inline(always)]
2591 fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
2592 f32x8 {
2593 val: unsafe { core::mem::transmute_copy(&val) },
2594 simd: self,
2595 }
2596 }
2597 #[inline(always)]
2598 fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
2599 f32x8 {
2600 val: unsafe { core::mem::transmute_copy(val) },
2601 simd: self,
2602 }
2603 }
2604 #[inline(always)]
2605 fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
2606 unsafe { core::mem::transmute::<[__m128; 2usize], [f32; 8usize]>(a.val.0) }
2607 }
2608 #[inline(always)]
2609 fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
2610 unsafe { core::mem::transmute::<&[__m128; 2usize], &[f32; 8usize]>(&a.val.0) }
2611 }
2612 #[inline(always)]
2613 fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
2614 unsafe { core::mem::transmute::<&mut [__m128; 2usize], &mut [f32; 8usize]>(&mut a.val.0) }
2615 }
2616 #[inline(always)]
2617 fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
2618 unsafe {
2619 core::ptr::copy_nonoverlapping(
2620 (&raw const a.val.0) as *const f32,
2621 dest.as_mut_ptr(),
2622 8usize,
2623 );
2624 }
2625 }
2626 #[inline(always)]
2627 fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
2628 unsafe {
2629 f32x8 {
2630 val: core::mem::transmute(a.val),
2631 simd: self,
2632 }
2633 }
2634 }
2635 #[inline(always)]
2636 fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
2637 unsafe {
2638 u8x32 {
2639 val: core::mem::transmute(a.val),
2640 simd: self,
2641 }
2642 }
2643 }
2644 #[inline(always)]
2645 fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2646 unsafe {
2647 if SHIFT >= 8usize {
2648 return b;
2649 }
2650 let result = cross_block_alignr_128x2(
2651 self.cvt_to_bytes_f32x8(b).val.0,
2652 self.cvt_to_bytes_f32x8(a).val.0,
2653 SHIFT * 4usize,
2654 );
2655 self.cvt_from_bytes_f32x8(u8x32 {
2656 val: crate::support::Aligned256(result),
2657 simd: self,
2658 })
2659 }
2660 }
2661 #[inline(always)]
2662 fn slide_within_blocks_f32x8<const SHIFT: usize>(
2663 self,
2664 a: f32x8<Self>,
2665 b: f32x8<Self>,
2666 ) -> f32x8<Self> {
2667 let (a0, a1) = self.split_f32x8(a);
2668 let (b0, b1) = self.split_f32x8(b);
2669 self.combine_f32x4(
2670 self.slide_within_blocks_f32x4::<SHIFT>(a0, b0),
2671 self.slide_within_blocks_f32x4::<SHIFT>(a1, b1),
2672 )
2673 }
2674 #[inline(always)]
2675 fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2676 let (a0, a1) = self.split_f32x8(a);
2677 self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1))
2678 }
2679 #[inline(always)]
2680 fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2681 let (a0, a1) = self.split_f32x8(a);
2682 self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1))
2683 }
2684 #[inline(always)]
2685 fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2686 let (a0, a1) = self.split_f32x8(a);
2687 self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
2688 }
2689 #[inline(always)]
2690 fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2691 let (a0, a1) = self.split_f32x8(a);
2692 let (b0, b1) = self.split_f32x8(b);
2693 self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1))
2694 }
2695 #[inline(always)]
2696 fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2697 let (a0, a1) = self.split_f32x8(a);
2698 let (b0, b1) = self.split_f32x8(b);
2699 self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1))
2700 }
2701 #[inline(always)]
2702 fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2703 let (a0, a1) = self.split_f32x8(a);
2704 let (b0, b1) = self.split_f32x8(b);
2705 self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1))
2706 }
2707 #[inline(always)]
2708 fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2709 let (a0, a1) = self.split_f32x8(a);
2710 let (b0, b1) = self.split_f32x8(b);
2711 self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1))
2712 }
2713 #[inline(always)]
2714 fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2715 let (a0, a1) = self.split_f32x8(a);
2716 let (b0, b1) = self.split_f32x8(b);
2717 self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1))
2718 }
2719 #[inline(always)]
2720 fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2721 let (a0, a1) = self.split_f32x8(a);
2722 let (b0, b1) = self.split_f32x8(b);
2723 self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1))
2724 }
2725 #[inline(always)]
2726 fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2727 let (a0, a1) = self.split_f32x8(a);
2728 let (b0, b1) = self.split_f32x8(b);
2729 self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1))
2730 }
2731 #[inline(always)]
2732 fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2733 let (a0, a1) = self.split_f32x8(a);
2734 let (b0, b1) = self.split_f32x8(b);
2735 self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1))
2736 }
2737 #[inline(always)]
2738 fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2739 let (a0, a1) = self.split_f32x8(a);
2740 let (b0, b1) = self.split_f32x8(b);
2741 self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1))
2742 }
2743 #[inline(always)]
2744 fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2745 let (a0, a1) = self.split_f32x8(a);
2746 let (b0, b1) = self.split_f32x8(b);
2747 self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1))
2748 }
2749 #[inline(always)]
2750 fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2751 let (a0, _) = self.split_f32x8(a);
2752 let (b0, _) = self.split_f32x8(b);
2753 self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0))
2754 }
2755 #[inline(always)]
2756 fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2757 let (_, a1) = self.split_f32x8(a);
2758 let (_, b1) = self.split_f32x8(b);
2759 self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1))
2760 }
2761 #[inline(always)]
2762 fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2763 let (a0, a1) = self.split_f32x8(a);
2764 let (b0, b1) = self.split_f32x8(b);
2765 self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1))
2766 }
2767 #[inline(always)]
2768 fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2769 let (a0, a1) = self.split_f32x8(a);
2770 let (b0, b1) = self.split_f32x8(b);
2771 self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1))
2772 }
2773 #[inline(always)]
2774 fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
2775 let (a0, a1) = self.split_f32x8(a);
2776 let (b0, b1) = self.split_f32x8(b);
2777 let lo_lo = self.zip_low_f32x4(a0, b0);
2778 let lo_hi = self.zip_high_f32x4(a0, b0);
2779 let hi_lo = self.zip_low_f32x4(a1, b1);
2780 let hi_hi = self.zip_high_f32x4(a1, b1);
2781 (
2782 self.combine_f32x4(lo_lo, lo_hi),
2783 self.combine_f32x4(hi_lo, hi_hi),
2784 )
2785 }
2786 #[inline(always)]
2787 fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
2788 let (a0, a1) = self.split_f32x8(a);
2789 let (b0, b1) = self.split_f32x8(b);
2790 let lo_even = self.unzip_low_f32x4(a0, a1);
2791 let lo_odd = self.unzip_high_f32x4(a0, a1);
2792 let hi_even = self.unzip_low_f32x4(b0, b1);
2793 let hi_odd = self.unzip_high_f32x4(b0, b1);
2794 (
2795 self.combine_f32x4(lo_even, hi_even),
2796 self.combine_f32x4(lo_odd, hi_odd),
2797 )
2798 }
2799 #[inline(always)]
2800 fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2801 let (a0, a1) = self.split_f32x8(a);
2802 let (b0, b1) = self.split_f32x8(b);
2803 self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1))
2804 }
2805 #[inline(always)]
2806 fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2807 let (a0, a1) = self.split_f32x8(a);
2808 let (b0, b1) = self.split_f32x8(b);
2809 self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1))
2810 }
2811 #[inline(always)]
2812 fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2813 let (a0, a1) = self.split_f32x8(a);
2814 let (b0, b1) = self.split_f32x8(b);
2815 self.combine_f32x4(
2816 self.max_precise_f32x4(a0, b0),
2817 self.max_precise_f32x4(a1, b1),
2818 )
2819 }
2820 #[inline(always)]
2821 fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2822 let (a0, a1) = self.split_f32x8(a);
2823 let (b0, b1) = self.split_f32x8(b);
2824 self.combine_f32x4(
2825 self.min_precise_f32x4(a0, b0),
2826 self.min_precise_f32x4(a1, b1),
2827 )
2828 }
2829 #[inline(always)]
2830 fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
2831 let (a0, a1) = self.split_f32x8(a);
2832 let (b0, b1) = self.split_f32x8(b);
2833 let (c0, c1) = self.split_f32x8(c);
2834 self.combine_f32x4(
2835 self.mul_add_f32x4(a0, b0, c0),
2836 self.mul_add_f32x4(a1, b1, c1),
2837 )
2838 }
2839 #[inline(always)]
2840 fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
2841 let (a0, a1) = self.split_f32x8(a);
2842 let (b0, b1) = self.split_f32x8(b);
2843 let (c0, c1) = self.split_f32x8(c);
2844 self.combine_f32x4(
2845 self.mul_sub_f32x4(a0, b0, c0),
2846 self.mul_sub_f32x4(a1, b1, c1),
2847 )
2848 }
2849 #[inline(always)]
2850 fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2851 let (a0, a1) = self.split_f32x8(a);
2852 self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
2853 }
2854 #[inline(always)]
2855 fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2856 let (a0, a1) = self.split_f32x8(a);
2857 self.combine_f32x4(self.ceil_f32x4(a0), self.ceil_f32x4(a1))
2858 }
2859 #[inline(always)]
2860 fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2861 let (a0, a1) = self.split_f32x8(a);
2862 self.combine_f32x4(
2863 self.round_ties_even_f32x4(a0),
2864 self.round_ties_even_f32x4(a1),
2865 )
2866 }
2867 #[inline(always)]
2868 fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2869 let (a0, a1) = self.split_f32x8(a);
2870 self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1))
2871 }
2872 #[inline(always)]
2873 fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2874 let (a0, a1) = self.split_f32x8(a);
2875 self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1))
2876 }
2877 #[inline(always)]
2878 fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
2879 let (a0, a1) = self.split_mask32x8(a);
2880 let (b0, b1) = self.split_f32x8(b);
2881 let (c0, c1) = self.split_f32x8(c);
2882 self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1))
2883 }
2884 #[inline(always)]
2885 fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
2886 f32x16 {
2887 val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
2888 simd: self,
2889 }
2890 }
2891 #[inline(always)]
2892 fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
2893 (
2894 f32x4 {
2895 val: crate::support::Aligned128(a.val.0[0]),
2896 simd: self,
2897 },
2898 f32x4 {
2899 val: crate::support::Aligned128(a.val.0[1]),
2900 simd: self,
2901 },
2902 )
2903 }
2904 #[inline(always)]
2905 fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
2906 let (a0, a1) = self.split_f32x8(a);
2907 self.combine_f64x2(
2908 self.reinterpret_f64_f32x4(a0),
2909 self.reinterpret_f64_f32x4(a1),
2910 )
2911 }
2912 #[inline(always)]
2913 fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
2914 let (a0, a1) = self.split_f32x8(a);
2915 self.combine_i32x4(
2916 self.reinterpret_i32_f32x4(a0),
2917 self.reinterpret_i32_f32x4(a1),
2918 )
2919 }
2920 #[inline(always)]
2921 fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
2922 let (a0, a1) = self.split_f32x8(a);
2923 self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1))
2924 }
2925 #[inline(always)]
2926 fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
2927 let (a0, a1) = self.split_f32x8(a);
2928 self.combine_u32x4(
2929 self.reinterpret_u32_f32x4(a0),
2930 self.reinterpret_u32_f32x4(a1),
2931 )
2932 }
2933 #[inline(always)]
2934 fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
2935 let (a0, a1) = self.split_f32x8(a);
2936 self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1))
2937 }
2938 #[inline(always)]
2939 fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
2940 let (a0, a1) = self.split_f32x8(a);
2941 self.combine_u32x4(
2942 self.cvt_u32_precise_f32x4(a0),
2943 self.cvt_u32_precise_f32x4(a1),
2944 )
2945 }
2946 #[inline(always)]
2947 fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
2948 let (a0, a1) = self.split_f32x8(a);
2949 self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1))
2950 }
2951 #[inline(always)]
2952 fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
2953 let (a0, a1) = self.split_f32x8(a);
2954 self.combine_i32x4(
2955 self.cvt_i32_precise_f32x4(a0),
2956 self.cvt_i32_precise_f32x4(a1),
2957 )
2958 }
2959 #[inline(always)]
2960 fn splat_i8x32(self, val: i8) -> i8x32<Self> {
2961 let half = self.splat_i8x16(val);
2962 self.combine_i8x16(half, half)
2963 }
2964 #[inline(always)]
2965 fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
2966 i8x32 {
2967 val: unsafe { core::mem::transmute_copy(&val) },
2968 simd: self,
2969 }
2970 }
2971 #[inline(always)]
2972 fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
2973 i8x32 {
2974 val: unsafe { core::mem::transmute_copy(val) },
2975 simd: self,
2976 }
2977 }
2978 #[inline(always)]
2979 fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
2980 unsafe { core::mem::transmute::<[__m128i; 2usize], [i8; 32usize]>(a.val.0) }
2981 }
2982 #[inline(always)]
2983 fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
2984 unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i8; 32usize]>(&a.val.0) }
2985 }
2986 #[inline(always)]
2987 fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
2988 unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i8; 32usize]>(&mut a.val.0) }
2989 }
2990 #[inline(always)]
2991 fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
2992 unsafe {
2993 core::ptr::copy_nonoverlapping(
2994 (&raw const a.val.0) as *const i8,
2995 dest.as_mut_ptr(),
2996 32usize,
2997 );
2998 }
2999 }
3000 #[inline(always)]
3001 fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
3002 unsafe {
3003 i8x32 {
3004 val: core::mem::transmute(a.val),
3005 simd: self,
3006 }
3007 }
3008 }
3009 #[inline(always)]
3010 fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
3011 unsafe {
3012 u8x32 {
3013 val: core::mem::transmute(a.val),
3014 simd: self,
3015 }
3016 }
3017 }
3018 #[inline(always)]
3019 fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3020 unsafe {
3021 if SHIFT >= 32usize {
3022 return b;
3023 }
3024 let result = cross_block_alignr_128x2(
3025 self.cvt_to_bytes_i8x32(b).val.0,
3026 self.cvt_to_bytes_i8x32(a).val.0,
3027 SHIFT,
3028 );
3029 self.cvt_from_bytes_i8x32(u8x32 {
3030 val: crate::support::Aligned256(result),
3031 simd: self,
3032 })
3033 }
3034 }
3035 #[inline(always)]
3036 fn slide_within_blocks_i8x32<const SHIFT: usize>(
3037 self,
3038 a: i8x32<Self>,
3039 b: i8x32<Self>,
3040 ) -> i8x32<Self> {
3041 let (a0, a1) = self.split_i8x32(a);
3042 let (b0, b1) = self.split_i8x32(b);
3043 self.combine_i8x16(
3044 self.slide_within_blocks_i8x16::<SHIFT>(a0, b0),
3045 self.slide_within_blocks_i8x16::<SHIFT>(a1, b1),
3046 )
3047 }
3048 #[inline(always)]
3049 fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3050 let (a0, a1) = self.split_i8x32(a);
3051 let (b0, b1) = self.split_i8x32(b);
3052 self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1))
3053 }
3054 #[inline(always)]
3055 fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3056 let (a0, a1) = self.split_i8x32(a);
3057 let (b0, b1) = self.split_i8x32(b);
3058 self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1))
3059 }
3060 #[inline(always)]
3061 fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3062 let (a0, a1) = self.split_i8x32(a);
3063 let (b0, b1) = self.split_i8x32(b);
3064 self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1))
3065 }
3066 #[inline(always)]
3067 fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3068 let (a0, a1) = self.split_i8x32(a);
3069 let (b0, b1) = self.split_i8x32(b);
3070 self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1))
3071 }
3072 #[inline(always)]
3073 fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3074 let (a0, a1) = self.split_i8x32(a);
3075 let (b0, b1) = self.split_i8x32(b);
3076 self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1))
3077 }
3078 #[inline(always)]
3079 fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3080 let (a0, a1) = self.split_i8x32(a);
3081 let (b0, b1) = self.split_i8x32(b);
3082 self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1))
3083 }
3084 #[inline(always)]
3085 fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
3086 let (a0, a1) = self.split_i8x32(a);
3087 self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1))
3088 }
3089 #[inline(always)]
3090 fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
3091 let (a0, a1) = self.split_i8x32(a);
3092 self.combine_i8x16(self.shl_i8x16(a0, shift), self.shl_i8x16(a1, shift))
3093 }
3094 #[inline(always)]
3095 fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3096 let (a0, a1) = self.split_i8x32(a);
3097 let (b0, b1) = self.split_i8x32(b);
3098 self.combine_i8x16(self.shlv_i8x16(a0, b0), self.shlv_i8x16(a1, b1))
3099 }
3100 #[inline(always)]
3101 fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
3102 let (a0, a1) = self.split_i8x32(a);
3103 self.combine_i8x16(self.shr_i8x16(a0, shift), self.shr_i8x16(a1, shift))
3104 }
3105 #[inline(always)]
3106 fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3107 let (a0, a1) = self.split_i8x32(a);
3108 let (b0, b1) = self.split_i8x32(b);
3109 self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1))
3110 }
3111 #[inline(always)]
3112 fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3113 let (a0, a1) = self.split_i8x32(a);
3114 let (b0, b1) = self.split_i8x32(b);
3115 self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1))
3116 }
3117 #[inline(always)]
3118 fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3119 let (a0, a1) = self.split_i8x32(a);
3120 let (b0, b1) = self.split_i8x32(b);
3121 self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1))
3122 }
3123 #[inline(always)]
3124 fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3125 let (a0, a1) = self.split_i8x32(a);
3126 let (b0, b1) = self.split_i8x32(b);
3127 self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1))
3128 }
3129 #[inline(always)]
3130 fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3131 let (a0, a1) = self.split_i8x32(a);
3132 let (b0, b1) = self.split_i8x32(b);
3133 self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1))
3134 }
3135 #[inline(always)]
3136 fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3137 let (a0, a1) = self.split_i8x32(a);
3138 let (b0, b1) = self.split_i8x32(b);
3139 self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1))
3140 }
3141 #[inline(always)]
3142 fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3143 let (a0, _) = self.split_i8x32(a);
3144 let (b0, _) = self.split_i8x32(b);
3145 self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0))
3146 }
3147 #[inline(always)]
3148 fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3149 let (_, a1) = self.split_i8x32(a);
3150 let (_, b1) = self.split_i8x32(b);
3151 self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1))
3152 }
3153 #[inline(always)]
3154 fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3155 let (a0, a1) = self.split_i8x32(a);
3156 let (b0, b1) = self.split_i8x32(b);
3157 self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1))
3158 }
3159 #[inline(always)]
3160 fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3161 let (a0, a1) = self.split_i8x32(a);
3162 let (b0, b1) = self.split_i8x32(b);
3163 self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1))
3164 }
3165 #[inline(always)]
3166 fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
3167 let (a0, a1) = self.split_i8x32(a);
3168 let (b0, b1) = self.split_i8x32(b);
3169 let lo_lo = self.zip_low_i8x16(a0, b0);
3170 let lo_hi = self.zip_high_i8x16(a0, b0);
3171 let hi_lo = self.zip_low_i8x16(a1, b1);
3172 let hi_hi = self.zip_high_i8x16(a1, b1);
3173 (
3174 self.combine_i8x16(lo_lo, lo_hi),
3175 self.combine_i8x16(hi_lo, hi_hi),
3176 )
3177 }
3178 #[inline(always)]
3179 fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
3180 let (a0, a1) = self.split_i8x32(a);
3181 let (b0, b1) = self.split_i8x32(b);
3182 let lo_even = self.unzip_low_i8x16(a0, a1);
3183 let lo_odd = self.unzip_high_i8x16(a0, a1);
3184 let hi_even = self.unzip_low_i8x16(b0, b1);
3185 let hi_odd = self.unzip_high_i8x16(b0, b1);
3186 (
3187 self.combine_i8x16(lo_even, hi_even),
3188 self.combine_i8x16(lo_odd, hi_odd),
3189 )
3190 }
3191 #[inline(always)]
3192 fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
3193 let (a0, a1) = self.split_mask8x32(a);
3194 let (b0, b1) = self.split_i8x32(b);
3195 let (c0, c1) = self.split_i8x32(c);
3196 self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1))
3197 }
3198 #[inline(always)]
3199 fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3200 let (a0, a1) = self.split_i8x32(a);
3201 let (b0, b1) = self.split_i8x32(b);
3202 self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1))
3203 }
3204 #[inline(always)]
3205 fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3206 let (a0, a1) = self.split_i8x32(a);
3207 let (b0, b1) = self.split_i8x32(b);
3208 self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1))
3209 }
3210 #[inline(always)]
3211 fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
3212 i8x64 {
3213 val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
3214 simd: self,
3215 }
3216 }
3217 #[inline(always)]
3218 fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
3219 (
3220 i8x16 {
3221 val: crate::support::Aligned128(a.val.0[0]),
3222 simd: self,
3223 },
3224 i8x16 {
3225 val: crate::support::Aligned128(a.val.0[1]),
3226 simd: self,
3227 },
3228 )
3229 }
3230 #[inline(always)]
3231 fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
3232 let (a0, a1) = self.split_i8x32(a);
3233 self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1))
3234 }
3235 #[inline(always)]
3236 fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
3237 let (a0, a1) = self.split_i8x32(a);
3238 self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1))
3239 }
3240 #[inline(always)]
3241 fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
3242 let (a0, a1) = self.split_i8x32(a);
3243 self.combine_u32x4(
3244 self.reinterpret_u32_i8x16(a0),
3245 self.reinterpret_u32_i8x16(a1),
3246 )
3247 }
3248 #[inline(always)]
3249 fn splat_u8x32(self, val: u8) -> u8x32<Self> {
3250 let half = self.splat_u8x16(val);
3251 self.combine_u8x16(half, half)
3252 }
3253 #[inline(always)]
3254 fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
3255 u8x32 {
3256 val: unsafe { core::mem::transmute_copy(&val) },
3257 simd: self,
3258 }
3259 }
3260 #[inline(always)]
3261 fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
3262 u8x32 {
3263 val: unsafe { core::mem::transmute_copy(val) },
3264 simd: self,
3265 }
3266 }
3267 #[inline(always)]
3268 fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
3269 unsafe { core::mem::transmute::<[__m128i; 2usize], [u8; 32usize]>(a.val.0) }
3270 }
3271 #[inline(always)]
3272 fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
3273 unsafe { core::mem::transmute::<&[__m128i; 2usize], &[u8; 32usize]>(&a.val.0) }
3274 }
3275 #[inline(always)]
3276 fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
3277 unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [u8; 32usize]>(&mut a.val.0) }
3278 }
3279 #[inline(always)]
3280 fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
3281 unsafe {
3282 core::ptr::copy_nonoverlapping(
3283 (&raw const a.val.0) as *const u8,
3284 dest.as_mut_ptr(),
3285 32usize,
3286 );
3287 }
3288 }
3289 #[inline(always)]
3290 fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
3291 unsafe {
3292 u8x32 {
3293 val: core::mem::transmute(a.val),
3294 simd: self,
3295 }
3296 }
3297 }
3298 #[inline(always)]
3299 fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
3300 unsafe {
3301 u8x32 {
3302 val: core::mem::transmute(a.val),
3303 simd: self,
3304 }
3305 }
3306 }
3307 #[inline(always)]
3308 fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3309 unsafe {
3310 if SHIFT >= 32usize {
3311 return b;
3312 }
3313 let result = cross_block_alignr_128x2(
3314 self.cvt_to_bytes_u8x32(b).val.0,
3315 self.cvt_to_bytes_u8x32(a).val.0,
3316 SHIFT,
3317 );
3318 self.cvt_from_bytes_u8x32(u8x32 {
3319 val: crate::support::Aligned256(result),
3320 simd: self,
3321 })
3322 }
3323 }
3324 #[inline(always)]
3325 fn slide_within_blocks_u8x32<const SHIFT: usize>(
3326 self,
3327 a: u8x32<Self>,
3328 b: u8x32<Self>,
3329 ) -> u8x32<Self> {
3330 let (a0, a1) = self.split_u8x32(a);
3331 let (b0, b1) = self.split_u8x32(b);
3332 self.combine_u8x16(
3333 self.slide_within_blocks_u8x16::<SHIFT>(a0, b0),
3334 self.slide_within_blocks_u8x16::<SHIFT>(a1, b1),
3335 )
3336 }
3337 #[inline(always)]
3338 fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3339 let (a0, a1) = self.split_u8x32(a);
3340 let (b0, b1) = self.split_u8x32(b);
3341 self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1))
3342 }
3343 #[inline(always)]
3344 fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3345 let (a0, a1) = self.split_u8x32(a);
3346 let (b0, b1) = self.split_u8x32(b);
3347 self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1))
3348 }
3349 #[inline(always)]
3350 fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3351 let (a0, a1) = self.split_u8x32(a);
3352 let (b0, b1) = self.split_u8x32(b);
3353 self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1))
3354 }
3355 #[inline(always)]
3356 fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3357 let (a0, a1) = self.split_u8x32(a);
3358 let (b0, b1) = self.split_u8x32(b);
3359 self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1))
3360 }
3361 #[inline(always)]
3362 fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3363 let (a0, a1) = self.split_u8x32(a);
3364 let (b0, b1) = self.split_u8x32(b);
3365 self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1))
3366 }
3367 #[inline(always)]
3368 fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3369 let (a0, a1) = self.split_u8x32(a);
3370 let (b0, b1) = self.split_u8x32(b);
3371 self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1))
3372 }
3373 #[inline(always)]
3374 fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
3375 let (a0, a1) = self.split_u8x32(a);
3376 self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1))
3377 }
3378 #[inline(always)]
3379 fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
3380 let (a0, a1) = self.split_u8x32(a);
3381 self.combine_u8x16(self.shl_u8x16(a0, shift), self.shl_u8x16(a1, shift))
3382 }
3383 #[inline(always)]
3384 fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3385 let (a0, a1) = self.split_u8x32(a);
3386 let (b0, b1) = self.split_u8x32(b);
3387 self.combine_u8x16(self.shlv_u8x16(a0, b0), self.shlv_u8x16(a1, b1))
3388 }
3389 #[inline(always)]
3390 fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
3391 let (a0, a1) = self.split_u8x32(a);
3392 self.combine_u8x16(self.shr_u8x16(a0, shift), self.shr_u8x16(a1, shift))
3393 }
3394 #[inline(always)]
3395 fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3396 let (a0, a1) = self.split_u8x32(a);
3397 let (b0, b1) = self.split_u8x32(b);
3398 self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1))
3399 }
3400 #[inline(always)]
3401 fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3402 let (a0, a1) = self.split_u8x32(a);
3403 let (b0, b1) = self.split_u8x32(b);
3404 self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1))
3405 }
3406 #[inline(always)]
3407 fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3408 let (a0, a1) = self.split_u8x32(a);
3409 let (b0, b1) = self.split_u8x32(b);
3410 self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1))
3411 }
3412 #[inline(always)]
3413 fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3414 let (a0, a1) = self.split_u8x32(a);
3415 let (b0, b1) = self.split_u8x32(b);
3416 self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1))
3417 }
3418 #[inline(always)]
3419 fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3420 let (a0, a1) = self.split_u8x32(a);
3421 let (b0, b1) = self.split_u8x32(b);
3422 self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1))
3423 }
3424 #[inline(always)]
3425 fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3426 let (a0, a1) = self.split_u8x32(a);
3427 let (b0, b1) = self.split_u8x32(b);
3428 self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1))
3429 }
3430 #[inline(always)]
3431 fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3432 let (a0, _) = self.split_u8x32(a);
3433 let (b0, _) = self.split_u8x32(b);
3434 self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0))
3435 }
3436 #[inline(always)]
3437 fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3438 let (_, a1) = self.split_u8x32(a);
3439 let (_, b1) = self.split_u8x32(b);
3440 self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1))
3441 }
3442 #[inline(always)]
3443 fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3444 let (a0, a1) = self.split_u8x32(a);
3445 let (b0, b1) = self.split_u8x32(b);
3446 self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1))
3447 }
3448 #[inline(always)]
3449 fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3450 let (a0, a1) = self.split_u8x32(a);
3451 let (b0, b1) = self.split_u8x32(b);
3452 self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1))
3453 }
3454 #[inline(always)]
3455 fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
3456 let (a0, a1) = self.split_u8x32(a);
3457 let (b0, b1) = self.split_u8x32(b);
3458 let lo_lo = self.zip_low_u8x16(a0, b0);
3459 let lo_hi = self.zip_high_u8x16(a0, b0);
3460 let hi_lo = self.zip_low_u8x16(a1, b1);
3461 let hi_hi = self.zip_high_u8x16(a1, b1);
3462 (
3463 self.combine_u8x16(lo_lo, lo_hi),
3464 self.combine_u8x16(hi_lo, hi_hi),
3465 )
3466 }
3467 #[inline(always)]
3468 fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
3469 let (a0, a1) = self.split_u8x32(a);
3470 let (b0, b1) = self.split_u8x32(b);
3471 let lo_even = self.unzip_low_u8x16(a0, a1);
3472 let lo_odd = self.unzip_high_u8x16(a0, a1);
3473 let hi_even = self.unzip_low_u8x16(b0, b1);
3474 let hi_odd = self.unzip_high_u8x16(b0, b1);
3475 (
3476 self.combine_u8x16(lo_even, hi_even),
3477 self.combine_u8x16(lo_odd, hi_odd),
3478 )
3479 }
3480 #[inline(always)]
3481 fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
3482 let (a0, a1) = self.split_mask8x32(a);
3483 let (b0, b1) = self.split_u8x32(b);
3484 let (c0, c1) = self.split_u8x32(c);
3485 self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1))
3486 }
3487 #[inline(always)]
3488 fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3489 let (a0, a1) = self.split_u8x32(a);
3490 let (b0, b1) = self.split_u8x32(b);
3491 self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1))
3492 }
3493 #[inline(always)]
3494 fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3495 let (a0, a1) = self.split_u8x32(a);
3496 let (b0, b1) = self.split_u8x32(b);
3497 self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1))
3498 }
3499 #[inline(always)]
3500 fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
3501 u8x64 {
3502 val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
3503 simd: self,
3504 }
3505 }
3506 #[inline(always)]
3507 fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
3508 (
3509 u8x16 {
3510 val: crate::support::Aligned128(a.val.0[0]),
3511 simd: self,
3512 },
3513 u8x16 {
3514 val: crate::support::Aligned128(a.val.0[1]),
3515 simd: self,
3516 },
3517 )
3518 }
3519 #[inline(always)]
3520 fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
3521 let (a0, a1) = self.split_u8x32(a);
3522 self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1))
3523 }
3524 #[inline(always)]
3525 fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
3526 let (a0, a1) = self.split_u8x32(a);
3527 self.combine_u32x4(
3528 self.reinterpret_u32_u8x16(a0),
3529 self.reinterpret_u32_u8x16(a1),
3530 )
3531 }
3532 #[inline(always)]
3533 fn splat_mask8x32(self, val: i8) -> mask8x32<Self> {
3534 let half = self.splat_mask8x16(val);
3535 self.combine_mask8x16(half, half)
3536 }
3537 #[inline(always)]
3538 fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
3539 mask8x32 {
3540 val: unsafe { core::mem::transmute_copy(&val) },
3541 simd: self,
3542 }
3543 }
3544 #[inline(always)]
3545 fn load_array_ref_mask8x32(self, val: &[i8; 32usize]) -> mask8x32<Self> {
3546 mask8x32 {
3547 val: unsafe { core::mem::transmute_copy(val) },
3548 simd: self,
3549 }
3550 }
3551 #[inline(always)]
3552 fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
3553 unsafe { core::mem::transmute::<[__m128i; 2usize], [i8; 32usize]>(a.val.0) }
3554 }
3555 #[inline(always)]
3556 fn as_array_ref_mask8x32(self, a: &mask8x32<Self>) -> &[i8; 32usize] {
3557 unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i8; 32usize]>(&a.val.0) }
3558 }
3559 #[inline(always)]
3560 fn as_array_mut_mask8x32(self, a: &mut mask8x32<Self>) -> &mut [i8; 32usize] {
3561 unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i8; 32usize]>(&mut a.val.0) }
3562 }
3563 #[inline(always)]
3564 fn store_array_mask8x32(self, a: mask8x32<Self>, dest: &mut [i8; 32usize]) -> () {
3565 unsafe {
3566 core::ptr::copy_nonoverlapping(
3567 (&raw const a.val.0) as *const i8,
3568 dest.as_mut_ptr(),
3569 32usize,
3570 );
3571 }
3572 }
3573 #[inline(always)]
3574 fn cvt_from_bytes_mask8x32(self, a: u8x32<Self>) -> mask8x32<Self> {
3575 unsafe {
3576 mask8x32 {
3577 val: core::mem::transmute(a.val),
3578 simd: self,
3579 }
3580 }
3581 }
3582 #[inline(always)]
3583 fn cvt_to_bytes_mask8x32(self, a: mask8x32<Self>) -> u8x32<Self> {
3584 unsafe {
3585 u8x32 {
3586 val: core::mem::transmute(a.val),
3587 simd: self,
3588 }
3589 }
3590 }
3591 #[inline(always)]
3592 fn slide_mask8x32<const SHIFT: usize>(
3593 self,
3594 a: mask8x32<Self>,
3595 b: mask8x32<Self>,
3596 ) -> mask8x32<Self> {
3597 unsafe {
3598 if SHIFT >= 32usize {
3599 return b;
3600 }
3601 let result = cross_block_alignr_128x2(
3602 self.cvt_to_bytes_mask8x32(b).val.0,
3603 self.cvt_to_bytes_mask8x32(a).val.0,
3604 SHIFT,
3605 );
3606 self.cvt_from_bytes_mask8x32(u8x32 {
3607 val: crate::support::Aligned256(result),
3608 simd: self,
3609 })
3610 }
3611 }
3612 #[inline(always)]
3613 fn slide_within_blocks_mask8x32<const SHIFT: usize>(
3614 self,
3615 a: mask8x32<Self>,
3616 b: mask8x32<Self>,
3617 ) -> mask8x32<Self> {
3618 let (a0, a1) = self.split_mask8x32(a);
3619 let (b0, b1) = self.split_mask8x32(b);
3620 self.combine_mask8x16(
3621 self.slide_within_blocks_mask8x16::<SHIFT>(a0, b0),
3622 self.slide_within_blocks_mask8x16::<SHIFT>(a1, b1),
3623 )
3624 }
3625 #[inline(always)]
3626 fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3627 let (a0, a1) = self.split_mask8x32(a);
3628 let (b0, b1) = self.split_mask8x32(b);
3629 self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1))
3630 }
3631 #[inline(always)]
3632 fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3633 let (a0, a1) = self.split_mask8x32(a);
3634 let (b0, b1) = self.split_mask8x32(b);
3635 self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1))
3636 }
3637 #[inline(always)]
3638 fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3639 let (a0, a1) = self.split_mask8x32(a);
3640 let (b0, b1) = self.split_mask8x32(b);
3641 self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1))
3642 }
3643 #[inline(always)]
3644 fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
3645 let (a0, a1) = self.split_mask8x32(a);
3646 self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1))
3647 }
3648 #[inline(always)]
3649 fn select_mask8x32(
3650 self,
3651 a: mask8x32<Self>,
3652 b: mask8x32<Self>,
3653 c: mask8x32<Self>,
3654 ) -> mask8x32<Self> {
3655 let (a0, a1) = self.split_mask8x32(a);
3656 let (b0, b1) = self.split_mask8x32(b);
3657 let (c0, c1) = self.split_mask8x32(c);
3658 self.combine_mask8x16(
3659 self.select_mask8x16(a0, b0, c0),
3660 self.select_mask8x16(a1, b1, c1),
3661 )
3662 }
3663 #[inline(always)]
3664 fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3665 let (a0, a1) = self.split_mask8x32(a);
3666 let (b0, b1) = self.split_mask8x32(b);
3667 self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1))
3668 }
3669 #[inline(always)]
3670 fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
3671 let (a0, a1) = self.split_mask8x32(a);
3672 self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1)
3673 }
3674 #[inline(always)]
3675 fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
3676 let (a0, a1) = self.split_mask8x32(a);
3677 self.all_true_mask8x16(a0) && self.all_true_mask8x16(a1)
3678 }
3679 #[inline(always)]
3680 fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
3681 let (a0, a1) = self.split_mask8x32(a);
3682 self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1)
3683 }
3684 #[inline(always)]
3685 fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
3686 let (a0, a1) = self.split_mask8x32(a);
3687 self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1)
3688 }
3689 #[inline(always)]
3690 fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
3691 mask8x64 {
3692 val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
3693 simd: self,
3694 }
3695 }
3696 #[inline(always)]
3697 fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
3698 (
3699 mask8x16 {
3700 val: crate::support::Aligned128(a.val.0[0]),
3701 simd: self,
3702 },
3703 mask8x16 {
3704 val: crate::support::Aligned128(a.val.0[1]),
3705 simd: self,
3706 },
3707 )
3708 }
3709 #[inline(always)]
3710 fn splat_i16x16(self, val: i16) -> i16x16<Self> {
3711 let half = self.splat_i16x8(val);
3712 self.combine_i16x8(half, half)
3713 }
3714 #[inline(always)]
3715 fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
3716 i16x16 {
3717 val: unsafe { core::mem::transmute_copy(&val) },
3718 simd: self,
3719 }
3720 }
3721 #[inline(always)]
3722 fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
3723 i16x16 {
3724 val: unsafe { core::mem::transmute_copy(val) },
3725 simd: self,
3726 }
3727 }
3728 #[inline(always)]
3729 fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
3730 unsafe { core::mem::transmute::<[__m128i; 2usize], [i16; 16usize]>(a.val.0) }
3731 }
3732 #[inline(always)]
3733 fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
3734 unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i16; 16usize]>(&a.val.0) }
3735 }
3736 #[inline(always)]
3737 fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
3738 unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i16; 16usize]>(&mut a.val.0) }
3739 }
3740 #[inline(always)]
3741 fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
3742 unsafe {
3743 core::ptr::copy_nonoverlapping(
3744 (&raw const a.val.0) as *const i16,
3745 dest.as_mut_ptr(),
3746 16usize,
3747 );
3748 }
3749 }
3750 #[inline(always)]
3751 fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
3752 unsafe {
3753 i16x16 {
3754 val: core::mem::transmute(a.val),
3755 simd: self,
3756 }
3757 }
3758 }
3759 #[inline(always)]
3760 fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
3761 unsafe {
3762 u8x32 {
3763 val: core::mem::transmute(a.val),
3764 simd: self,
3765 }
3766 }
3767 }
3768 #[inline(always)]
3769 fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3770 unsafe {
3771 if SHIFT >= 16usize {
3772 return b;
3773 }
3774 let result = cross_block_alignr_128x2(
3775 self.cvt_to_bytes_i16x16(b).val.0,
3776 self.cvt_to_bytes_i16x16(a).val.0,
3777 SHIFT * 2usize,
3778 );
3779 self.cvt_from_bytes_i16x16(u8x32 {
3780 val: crate::support::Aligned256(result),
3781 simd: self,
3782 })
3783 }
3784 }
3785 #[inline(always)]
3786 fn slide_within_blocks_i16x16<const SHIFT: usize>(
3787 self,
3788 a: i16x16<Self>,
3789 b: i16x16<Self>,
3790 ) -> i16x16<Self> {
3791 let (a0, a1) = self.split_i16x16(a);
3792 let (b0, b1) = self.split_i16x16(b);
3793 self.combine_i16x8(
3794 self.slide_within_blocks_i16x8::<SHIFT>(a0, b0),
3795 self.slide_within_blocks_i16x8::<SHIFT>(a1, b1),
3796 )
3797 }
3798 #[inline(always)]
3799 fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3800 let (a0, a1) = self.split_i16x16(a);
3801 let (b0, b1) = self.split_i16x16(b);
3802 self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1))
3803 }
3804 #[inline(always)]
3805 fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3806 let (a0, a1) = self.split_i16x16(a);
3807 let (b0, b1) = self.split_i16x16(b);
3808 self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1))
3809 }
3810 #[inline(always)]
3811 fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3812 let (a0, a1) = self.split_i16x16(a);
3813 let (b0, b1) = self.split_i16x16(b);
3814 self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1))
3815 }
3816 #[inline(always)]
3817 fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3818 let (a0, a1) = self.split_i16x16(a);
3819 let (b0, b1) = self.split_i16x16(b);
3820 self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1))
3821 }
3822 #[inline(always)]
3823 fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3824 let (a0, a1) = self.split_i16x16(a);
3825 let (b0, b1) = self.split_i16x16(b);
3826 self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1))
3827 }
3828 #[inline(always)]
3829 fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3830 let (a0, a1) = self.split_i16x16(a);
3831 let (b0, b1) = self.split_i16x16(b);
3832 self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1))
3833 }
3834 #[inline(always)]
3835 fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
3836 let (a0, a1) = self.split_i16x16(a);
3837 self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1))
3838 }
3839 #[inline(always)]
3840 fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
3841 let (a0, a1) = self.split_i16x16(a);
3842 self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift))
3843 }
3844 #[inline(always)]
3845 fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3846 let (a0, a1) = self.split_i16x16(a);
3847 let (b0, b1) = self.split_i16x16(b);
3848 self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1))
3849 }
3850 #[inline(always)]
3851 fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
3852 let (a0, a1) = self.split_i16x16(a);
3853 self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift))
3854 }
3855 #[inline(always)]
3856 fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3857 let (a0, a1) = self.split_i16x16(a);
3858 let (b0, b1) = self.split_i16x16(b);
3859 self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1))
3860 }
3861 #[inline(always)]
3862 fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3863 let (a0, a1) = self.split_i16x16(a);
3864 let (b0, b1) = self.split_i16x16(b);
3865 self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1))
3866 }
3867 #[inline(always)]
3868 fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3869 let (a0, a1) = self.split_i16x16(a);
3870 let (b0, b1) = self.split_i16x16(b);
3871 self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1))
3872 }
3873 #[inline(always)]
3874 fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3875 let (a0, a1) = self.split_i16x16(a);
3876 let (b0, b1) = self.split_i16x16(b);
3877 self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1))
3878 }
3879 #[inline(always)]
3880 fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3881 let (a0, a1) = self.split_i16x16(a);
3882 let (b0, b1) = self.split_i16x16(b);
3883 self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1))
3884 }
3885 #[inline(always)]
3886 fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3887 let (a0, a1) = self.split_i16x16(a);
3888 let (b0, b1) = self.split_i16x16(b);
3889 self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1))
3890 }
3891 #[inline(always)]
3892 fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3893 let (a0, _) = self.split_i16x16(a);
3894 let (b0, _) = self.split_i16x16(b);
3895 self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0))
3896 }
3897 #[inline(always)]
3898 fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3899 let (_, a1) = self.split_i16x16(a);
3900 let (_, b1) = self.split_i16x16(b);
3901 self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1))
3902 }
3903 #[inline(always)]
3904 fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3905 let (a0, a1) = self.split_i16x16(a);
3906 let (b0, b1) = self.split_i16x16(b);
3907 self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1))
3908 }
3909 #[inline(always)]
3910 fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3911 let (a0, a1) = self.split_i16x16(a);
3912 let (b0, b1) = self.split_i16x16(b);
3913 self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1))
3914 }
3915 #[inline(always)]
3916 fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
3917 let (a0, a1) = self.split_i16x16(a);
3918 let (b0, b1) = self.split_i16x16(b);
3919 let lo_lo = self.zip_low_i16x8(a0, b0);
3920 let lo_hi = self.zip_high_i16x8(a0, b0);
3921 let hi_lo = self.zip_low_i16x8(a1, b1);
3922 let hi_hi = self.zip_high_i16x8(a1, b1);
3923 (
3924 self.combine_i16x8(lo_lo, lo_hi),
3925 self.combine_i16x8(hi_lo, hi_hi),
3926 )
3927 }
3928 #[inline(always)]
3929 fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
3930 let (a0, a1) = self.split_i16x16(a);
3931 let (b0, b1) = self.split_i16x16(b);
3932 let lo_even = self.unzip_low_i16x8(a0, a1);
3933 let lo_odd = self.unzip_high_i16x8(a0, a1);
3934 let hi_even = self.unzip_low_i16x8(b0, b1);
3935 let hi_odd = self.unzip_high_i16x8(b0, b1);
3936 (
3937 self.combine_i16x8(lo_even, hi_even),
3938 self.combine_i16x8(lo_odd, hi_odd),
3939 )
3940 }
3941 #[inline(always)]
3942 fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
3943 let (a0, a1) = self.split_mask16x16(a);
3944 let (b0, b1) = self.split_i16x16(b);
3945 let (c0, c1) = self.split_i16x16(c);
3946 self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1))
3947 }
3948 #[inline(always)]
3949 fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3950 let (a0, a1) = self.split_i16x16(a);
3951 let (b0, b1) = self.split_i16x16(b);
3952 self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1))
3953 }
3954 #[inline(always)]
3955 fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3956 let (a0, a1) = self.split_i16x16(a);
3957 let (b0, b1) = self.split_i16x16(b);
3958 self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1))
3959 }
3960 #[inline(always)]
3961 fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
3962 i16x32 {
3963 val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
3964 simd: self,
3965 }
3966 }
3967 #[inline(always)]
3968 fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
3969 (
3970 i16x8 {
3971 val: crate::support::Aligned128(a.val.0[0]),
3972 simd: self,
3973 },
3974 i16x8 {
3975 val: crate::support::Aligned128(a.val.0[1]),
3976 simd: self,
3977 },
3978 )
3979 }
3980 #[inline(always)]
3981 fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
3982 let (a0, a1) = self.split_i16x16(a);
3983 self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1))
3984 }
3985 #[inline(always)]
3986 fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
3987 let (a0, a1) = self.split_i16x16(a);
3988 self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1))
3989 }
3990 #[inline(always)]
3991 fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
3992 let (a0, a1) = self.split_i16x16(a);
3993 self.combine_u32x4(
3994 self.reinterpret_u32_i16x8(a0),
3995 self.reinterpret_u32_i16x8(a1),
3996 )
3997 }
3998 #[inline(always)]
3999 fn splat_u16x16(self, val: u16) -> u16x16<Self> {
4000 let half = self.splat_u16x8(val);
4001 self.combine_u16x8(half, half)
4002 }
4003 #[inline(always)]
4004 fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
4005 u16x16 {
4006 val: unsafe { core::mem::transmute_copy(&val) },
4007 simd: self,
4008 }
4009 }
4010 #[inline(always)]
4011 fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
4012 u16x16 {
4013 val: unsafe { core::mem::transmute_copy(val) },
4014 simd: self,
4015 }
4016 }
4017 #[inline(always)]
4018 fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
4019 unsafe { core::mem::transmute::<[__m128i; 2usize], [u16; 16usize]>(a.val.0) }
4020 }
4021 #[inline(always)]
4022 fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
4023 unsafe { core::mem::transmute::<&[__m128i; 2usize], &[u16; 16usize]>(&a.val.0) }
4024 }
4025 #[inline(always)]
4026 fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
4027 unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [u16; 16usize]>(&mut a.val.0) }
4028 }
4029 #[inline(always)]
4030 fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
4031 unsafe {
4032 core::ptr::copy_nonoverlapping(
4033 (&raw const a.val.0) as *const u16,
4034 dest.as_mut_ptr(),
4035 16usize,
4036 );
4037 }
4038 }
4039 #[inline(always)]
4040 fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
4041 unsafe {
4042 u16x16 {
4043 val: core::mem::transmute(a.val),
4044 simd: self,
4045 }
4046 }
4047 }
4048 #[inline(always)]
4049 fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
4050 unsafe {
4051 u8x32 {
4052 val: core::mem::transmute(a.val),
4053 simd: self,
4054 }
4055 }
4056 }
4057 #[inline(always)]
4058 fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4059 unsafe {
4060 if SHIFT >= 16usize {
4061 return b;
4062 }
4063 let result = cross_block_alignr_128x2(
4064 self.cvt_to_bytes_u16x16(b).val.0,
4065 self.cvt_to_bytes_u16x16(a).val.0,
4066 SHIFT * 2usize,
4067 );
4068 self.cvt_from_bytes_u16x16(u8x32 {
4069 val: crate::support::Aligned256(result),
4070 simd: self,
4071 })
4072 }
4073 }
4074 #[inline(always)]
4075 fn slide_within_blocks_u16x16<const SHIFT: usize>(
4076 self,
4077 a: u16x16<Self>,
4078 b: u16x16<Self>,
4079 ) -> u16x16<Self> {
4080 let (a0, a1) = self.split_u16x16(a);
4081 let (b0, b1) = self.split_u16x16(b);
4082 self.combine_u16x8(
4083 self.slide_within_blocks_u16x8::<SHIFT>(a0, b0),
4084 self.slide_within_blocks_u16x8::<SHIFT>(a1, b1),
4085 )
4086 }
4087 #[inline(always)]
4088 fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4089 let (a0, a1) = self.split_u16x16(a);
4090 let (b0, b1) = self.split_u16x16(b);
4091 self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1))
4092 }
4093 #[inline(always)]
4094 fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4095 let (a0, a1) = self.split_u16x16(a);
4096 let (b0, b1) = self.split_u16x16(b);
4097 self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1))
4098 }
4099 #[inline(always)]
4100 fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4101 let (a0, a1) = self.split_u16x16(a);
4102 let (b0, b1) = self.split_u16x16(b);
4103 self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1))
4104 }
4105 #[inline(always)]
4106 fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4107 let (a0, a1) = self.split_u16x16(a);
4108 let (b0, b1) = self.split_u16x16(b);
4109 self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1))
4110 }
4111 #[inline(always)]
4112 fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4113 let (a0, a1) = self.split_u16x16(a);
4114 let (b0, b1) = self.split_u16x16(b);
4115 self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1))
4116 }
4117 #[inline(always)]
4118 fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4119 let (a0, a1) = self.split_u16x16(a);
4120 let (b0, b1) = self.split_u16x16(b);
4121 self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1))
4122 }
4123 #[inline(always)]
4124 fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
4125 let (a0, a1) = self.split_u16x16(a);
4126 self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1))
4127 }
4128 #[inline(always)]
4129 fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
4130 let (a0, a1) = self.split_u16x16(a);
4131 self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift))
4132 }
4133 #[inline(always)]
4134 fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4135 let (a0, a1) = self.split_u16x16(a);
4136 let (b0, b1) = self.split_u16x16(b);
4137 self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1))
4138 }
4139 #[inline(always)]
4140 fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
4141 let (a0, a1) = self.split_u16x16(a);
4142 self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift))
4143 }
4144 #[inline(always)]
4145 fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4146 let (a0, a1) = self.split_u16x16(a);
4147 let (b0, b1) = self.split_u16x16(b);
4148 self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1))
4149 }
4150 #[inline(always)]
4151 fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4152 let (a0, a1) = self.split_u16x16(a);
4153 let (b0, b1) = self.split_u16x16(b);
4154 self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1))
4155 }
4156 #[inline(always)]
4157 fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4158 let (a0, a1) = self.split_u16x16(a);
4159 let (b0, b1) = self.split_u16x16(b);
4160 self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1))
4161 }
4162 #[inline(always)]
4163 fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4164 let (a0, a1) = self.split_u16x16(a);
4165 let (b0, b1) = self.split_u16x16(b);
4166 self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1))
4167 }
4168 #[inline(always)]
4169 fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4170 let (a0, a1) = self.split_u16x16(a);
4171 let (b0, b1) = self.split_u16x16(b);
4172 self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1))
4173 }
4174 #[inline(always)]
4175 fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4176 let (a0, a1) = self.split_u16x16(a);
4177 let (b0, b1) = self.split_u16x16(b);
4178 self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1))
4179 }
4180 #[inline(always)]
4181 fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4182 let (a0, _) = self.split_u16x16(a);
4183 let (b0, _) = self.split_u16x16(b);
4184 self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0))
4185 }
4186 #[inline(always)]
4187 fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4188 let (_, a1) = self.split_u16x16(a);
4189 let (_, b1) = self.split_u16x16(b);
4190 self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1))
4191 }
4192 #[inline(always)]
4193 fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4194 let (a0, a1) = self.split_u16x16(a);
4195 let (b0, b1) = self.split_u16x16(b);
4196 self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1))
4197 }
4198 #[inline(always)]
4199 fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4200 let (a0, a1) = self.split_u16x16(a);
4201 let (b0, b1) = self.split_u16x16(b);
4202 self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1))
4203 }
4204 #[inline(always)]
4205 fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
4206 let (a0, a1) = self.split_u16x16(a);
4207 let (b0, b1) = self.split_u16x16(b);
4208 let lo_lo = self.zip_low_u16x8(a0, b0);
4209 let lo_hi = self.zip_high_u16x8(a0, b0);
4210 let hi_lo = self.zip_low_u16x8(a1, b1);
4211 let hi_hi = self.zip_high_u16x8(a1, b1);
4212 (
4213 self.combine_u16x8(lo_lo, lo_hi),
4214 self.combine_u16x8(hi_lo, hi_hi),
4215 )
4216 }
4217 #[inline(always)]
4218 fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
4219 let (a0, a1) = self.split_u16x16(a);
4220 let (b0, b1) = self.split_u16x16(b);
4221 let lo_even = self.unzip_low_u16x8(a0, a1);
4222 let lo_odd = self.unzip_high_u16x8(a0, a1);
4223 let hi_even = self.unzip_low_u16x8(b0, b1);
4224 let hi_odd = self.unzip_high_u16x8(b0, b1);
4225 (
4226 self.combine_u16x8(lo_even, hi_even),
4227 self.combine_u16x8(lo_odd, hi_odd),
4228 )
4229 }
4230 #[inline(always)]
4231 fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
4232 let (a0, a1) = self.split_mask16x16(a);
4233 let (b0, b1) = self.split_u16x16(b);
4234 let (c0, c1) = self.split_u16x16(c);
4235 self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1))
4236 }
4237 #[inline(always)]
4238 fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4239 let (a0, a1) = self.split_u16x16(a);
4240 let (b0, b1) = self.split_u16x16(b);
4241 self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1))
4242 }
4243 #[inline(always)]
4244 fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4245 let (a0, a1) = self.split_u16x16(a);
4246 let (b0, b1) = self.split_u16x16(b);
4247 self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1))
4248 }
4249 #[inline(always)]
4250 fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
4251 u16x32 {
4252 val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
4253 simd: self,
4254 }
4255 }
4256 #[inline(always)]
4257 fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
4258 (
4259 u16x8 {
4260 val: crate::support::Aligned128(a.val.0[0]),
4261 simd: self,
4262 },
4263 u16x8 {
4264 val: crate::support::Aligned128(a.val.0[1]),
4265 simd: self,
4266 },
4267 )
4268 }
4269 #[inline(always)]
4270 fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
4271 let (a, b) = self.split_u16x16(a);
4272 unsafe {
4273 let mask = _mm_set1_epi16(0xFF);
4274 let lo_masked = _mm_and_si128(a.into(), mask);
4275 let hi_masked = _mm_and_si128(b.into(), mask);
4276 let result = _mm_packus_epi16(lo_masked, hi_masked);
4277 result.simd_into(self)
4278 }
4279 }
4280 #[inline(always)]
4281 fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
4282 let (a0, a1) = self.split_u16x16(a);
4283 self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1))
4284 }
4285 #[inline(always)]
4286 fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
4287 let (a0, a1) = self.split_u16x16(a);
4288 self.combine_u32x4(
4289 self.reinterpret_u32_u16x8(a0),
4290 self.reinterpret_u32_u16x8(a1),
4291 )
4292 }
4293 #[inline(always)]
4294 fn splat_mask16x16(self, val: i16) -> mask16x16<Self> {
4295 let half = self.splat_mask16x8(val);
4296 self.combine_mask16x8(half, half)
4297 }
4298 #[inline(always)]
4299 fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
4300 mask16x16 {
4301 val: unsafe { core::mem::transmute_copy(&val) },
4302 simd: self,
4303 }
4304 }
4305 #[inline(always)]
4306 fn load_array_ref_mask16x16(self, val: &[i16; 16usize]) -> mask16x16<Self> {
4307 mask16x16 {
4308 val: unsafe { core::mem::transmute_copy(val) },
4309 simd: self,
4310 }
4311 }
4312 #[inline(always)]
4313 fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
4314 unsafe { core::mem::transmute::<[__m128i; 2usize], [i16; 16usize]>(a.val.0) }
4315 }
4316 #[inline(always)]
4317 fn as_array_ref_mask16x16(self, a: &mask16x16<Self>) -> &[i16; 16usize] {
4318 unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i16; 16usize]>(&a.val.0) }
4319 }
4320 #[inline(always)]
4321 fn as_array_mut_mask16x16(self, a: &mut mask16x16<Self>) -> &mut [i16; 16usize] {
4322 unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i16; 16usize]>(&mut a.val.0) }
4323 }
4324 #[inline(always)]
4325 fn store_array_mask16x16(self, a: mask16x16<Self>, dest: &mut [i16; 16usize]) -> () {
4326 unsafe {
4327 core::ptr::copy_nonoverlapping(
4328 (&raw const a.val.0) as *const i16,
4329 dest.as_mut_ptr(),
4330 16usize,
4331 );
4332 }
4333 }
4334 #[inline(always)]
4335 fn cvt_from_bytes_mask16x16(self, a: u8x32<Self>) -> mask16x16<Self> {
4336 unsafe {
4337 mask16x16 {
4338 val: core::mem::transmute(a.val),
4339 simd: self,
4340 }
4341 }
4342 }
4343 #[inline(always)]
4344 fn cvt_to_bytes_mask16x16(self, a: mask16x16<Self>) -> u8x32<Self> {
4345 unsafe {
4346 u8x32 {
4347 val: core::mem::transmute(a.val),
4348 simd: self,
4349 }
4350 }
4351 }
4352 #[inline(always)]
4353 fn slide_mask16x16<const SHIFT: usize>(
4354 self,
4355 a: mask16x16<Self>,
4356 b: mask16x16<Self>,
4357 ) -> mask16x16<Self> {
4358 unsafe {
4359 if SHIFT >= 16usize {
4360 return b;
4361 }
4362 let result = cross_block_alignr_128x2(
4363 self.cvt_to_bytes_mask16x16(b).val.0,
4364 self.cvt_to_bytes_mask16x16(a).val.0,
4365 SHIFT * 2usize,
4366 );
4367 self.cvt_from_bytes_mask16x16(u8x32 {
4368 val: crate::support::Aligned256(result),
4369 simd: self,
4370 })
4371 }
4372 }
4373 #[inline(always)]
4374 fn slide_within_blocks_mask16x16<const SHIFT: usize>(
4375 self,
4376 a: mask16x16<Self>,
4377 b: mask16x16<Self>,
4378 ) -> mask16x16<Self> {
4379 let (a0, a1) = self.split_mask16x16(a);
4380 let (b0, b1) = self.split_mask16x16(b);
4381 self.combine_mask16x8(
4382 self.slide_within_blocks_mask16x8::<SHIFT>(a0, b0),
4383 self.slide_within_blocks_mask16x8::<SHIFT>(a1, b1),
4384 )
4385 }
4386 #[inline(always)]
4387 fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4388 let (a0, a1) = self.split_mask16x16(a);
4389 let (b0, b1) = self.split_mask16x16(b);
4390 self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1))
4391 }
4392 #[inline(always)]
4393 fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4394 let (a0, a1) = self.split_mask16x16(a);
4395 let (b0, b1) = self.split_mask16x16(b);
4396 self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1))
4397 }
4398 #[inline(always)]
4399 fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4400 let (a0, a1) = self.split_mask16x16(a);
4401 let (b0, b1) = self.split_mask16x16(b);
4402 self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1))
4403 }
4404 #[inline(always)]
4405 fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
4406 let (a0, a1) = self.split_mask16x16(a);
4407 self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1))
4408 }
4409 #[inline(always)]
4410 fn select_mask16x16(
4411 self,
4412 a: mask16x16<Self>,
4413 b: mask16x16<Self>,
4414 c: mask16x16<Self>,
4415 ) -> mask16x16<Self> {
4416 let (a0, a1) = self.split_mask16x16(a);
4417 let (b0, b1) = self.split_mask16x16(b);
4418 let (c0, c1) = self.split_mask16x16(c);
4419 self.combine_mask16x8(
4420 self.select_mask16x8(a0, b0, c0),
4421 self.select_mask16x8(a1, b1, c1),
4422 )
4423 }
4424 #[inline(always)]
4425 fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4426 let (a0, a1) = self.split_mask16x16(a);
4427 let (b0, b1) = self.split_mask16x16(b);
4428 self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1))
4429 }
4430 #[inline(always)]
4431 fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
4432 let (a0, a1) = self.split_mask16x16(a);
4433 self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1)
4434 }
4435 #[inline(always)]
4436 fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
4437 let (a0, a1) = self.split_mask16x16(a);
4438 self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1)
4439 }
4440 #[inline(always)]
4441 fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
4442 let (a0, a1) = self.split_mask16x16(a);
4443 self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1)
4444 }
4445 #[inline(always)]
4446 fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
4447 let (a0, a1) = self.split_mask16x16(a);
4448 self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1)
4449 }
4450 #[inline(always)]
4451 fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
4452 mask16x32 {
4453 val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
4454 simd: self,
4455 }
4456 }
4457 #[inline(always)]
4458 fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
4459 (
4460 mask16x8 {
4461 val: crate::support::Aligned128(a.val.0[0]),
4462 simd: self,
4463 },
4464 mask16x8 {
4465 val: crate::support::Aligned128(a.val.0[1]),
4466 simd: self,
4467 },
4468 )
4469 }
4470 #[inline(always)]
4471 fn splat_i32x8(self, val: i32) -> i32x8<Self> {
4472 let half = self.splat_i32x4(val);
4473 self.combine_i32x4(half, half)
4474 }
4475 #[inline(always)]
4476 fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
4477 i32x8 {
4478 val: unsafe { core::mem::transmute_copy(&val) },
4479 simd: self,
4480 }
4481 }
4482 #[inline(always)]
4483 fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
4484 i32x8 {
4485 val: unsafe { core::mem::transmute_copy(val) },
4486 simd: self,
4487 }
4488 }
4489 #[inline(always)]
4490 fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
4491 unsafe { core::mem::transmute::<[__m128i; 2usize], [i32; 8usize]>(a.val.0) }
4492 }
4493 #[inline(always)]
4494 fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
4495 unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i32; 8usize]>(&a.val.0) }
4496 }
4497 #[inline(always)]
4498 fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
4499 unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i32; 8usize]>(&mut a.val.0) }
4500 }
4501 #[inline(always)]
4502 fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
4503 unsafe {
4504 core::ptr::copy_nonoverlapping(
4505 (&raw const a.val.0) as *const i32,
4506 dest.as_mut_ptr(),
4507 8usize,
4508 );
4509 }
4510 }
4511 #[inline(always)]
4512 fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
4513 unsafe {
4514 i32x8 {
4515 val: core::mem::transmute(a.val),
4516 simd: self,
4517 }
4518 }
4519 }
4520 #[inline(always)]
4521 fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
4522 unsafe {
4523 u8x32 {
4524 val: core::mem::transmute(a.val),
4525 simd: self,
4526 }
4527 }
4528 }
4529 #[inline(always)]
4530 fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4531 unsafe {
4532 if SHIFT >= 8usize {
4533 return b;
4534 }
4535 let result = cross_block_alignr_128x2(
4536 self.cvt_to_bytes_i32x8(b).val.0,
4537 self.cvt_to_bytes_i32x8(a).val.0,
4538 SHIFT * 4usize,
4539 );
4540 self.cvt_from_bytes_i32x8(u8x32 {
4541 val: crate::support::Aligned256(result),
4542 simd: self,
4543 })
4544 }
4545 }
4546 #[inline(always)]
4547 fn slide_within_blocks_i32x8<const SHIFT: usize>(
4548 self,
4549 a: i32x8<Self>,
4550 b: i32x8<Self>,
4551 ) -> i32x8<Self> {
4552 let (a0, a1) = self.split_i32x8(a);
4553 let (b0, b1) = self.split_i32x8(b);
4554 self.combine_i32x4(
4555 self.slide_within_blocks_i32x4::<SHIFT>(a0, b0),
4556 self.slide_within_blocks_i32x4::<SHIFT>(a1, b1),
4557 )
4558 }
4559 #[inline(always)]
4560 fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4561 let (a0, a1) = self.split_i32x8(a);
4562 let (b0, b1) = self.split_i32x8(b);
4563 self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1))
4564 }
4565 #[inline(always)]
4566 fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4567 let (a0, a1) = self.split_i32x8(a);
4568 let (b0, b1) = self.split_i32x8(b);
4569 self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1))
4570 }
4571 #[inline(always)]
4572 fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4573 let (a0, a1) = self.split_i32x8(a);
4574 let (b0, b1) = self.split_i32x8(b);
4575 self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1))
4576 }
4577 #[inline(always)]
4578 fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4579 let (a0, a1) = self.split_i32x8(a);
4580 let (b0, b1) = self.split_i32x8(b);
4581 self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1))
4582 }
4583 #[inline(always)]
4584 fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4585 let (a0, a1) = self.split_i32x8(a);
4586 let (b0, b1) = self.split_i32x8(b);
4587 self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1))
4588 }
4589 #[inline(always)]
4590 fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4591 let (a0, a1) = self.split_i32x8(a);
4592 let (b0, b1) = self.split_i32x8(b);
4593 self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1))
4594 }
4595 #[inline(always)]
4596 fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
4597 let (a0, a1) = self.split_i32x8(a);
4598 self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1))
4599 }
4600 #[inline(always)]
4601 fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
4602 let (a0, a1) = self.split_i32x8(a);
4603 self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift))
4604 }
4605 #[inline(always)]
4606 fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4607 let (a0, a1) = self.split_i32x8(a);
4608 let (b0, b1) = self.split_i32x8(b);
4609 self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1))
4610 }
4611 #[inline(always)]
4612 fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
4613 let (a0, a1) = self.split_i32x8(a);
4614 self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift))
4615 }
4616 #[inline(always)]
4617 fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4618 let (a0, a1) = self.split_i32x8(a);
4619 let (b0, b1) = self.split_i32x8(b);
4620 self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1))
4621 }
4622 #[inline(always)]
4623 fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4624 let (a0, a1) = self.split_i32x8(a);
4625 let (b0, b1) = self.split_i32x8(b);
4626 self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1))
4627 }
4628 #[inline(always)]
4629 fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4630 let (a0, a1) = self.split_i32x8(a);
4631 let (b0, b1) = self.split_i32x8(b);
4632 self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1))
4633 }
4634 #[inline(always)]
4635 fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4636 let (a0, a1) = self.split_i32x8(a);
4637 let (b0, b1) = self.split_i32x8(b);
4638 self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1))
4639 }
4640 #[inline(always)]
4641 fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4642 let (a0, a1) = self.split_i32x8(a);
4643 let (b0, b1) = self.split_i32x8(b);
4644 self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1))
4645 }
4646 #[inline(always)]
4647 fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4648 let (a0, a1) = self.split_i32x8(a);
4649 let (b0, b1) = self.split_i32x8(b);
4650 self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1))
4651 }
4652 #[inline(always)]
4653 fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4654 let (a0, _) = self.split_i32x8(a);
4655 let (b0, _) = self.split_i32x8(b);
4656 self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0))
4657 }
4658 #[inline(always)]
4659 fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4660 let (_, a1) = self.split_i32x8(a);
4661 let (_, b1) = self.split_i32x8(b);
4662 self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1))
4663 }
4664 #[inline(always)]
4665 fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4666 let (a0, a1) = self.split_i32x8(a);
4667 let (b0, b1) = self.split_i32x8(b);
4668 self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1))
4669 }
4670 #[inline(always)]
4671 fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4672 let (a0, a1) = self.split_i32x8(a);
4673 let (b0, b1) = self.split_i32x8(b);
4674 self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1))
4675 }
4676 #[inline(always)]
4677 fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
4678 let (a0, a1) = self.split_i32x8(a);
4679 let (b0, b1) = self.split_i32x8(b);
4680 let lo_lo = self.zip_low_i32x4(a0, b0);
4681 let lo_hi = self.zip_high_i32x4(a0, b0);
4682 let hi_lo = self.zip_low_i32x4(a1, b1);
4683 let hi_hi = self.zip_high_i32x4(a1, b1);
4684 (
4685 self.combine_i32x4(lo_lo, lo_hi),
4686 self.combine_i32x4(hi_lo, hi_hi),
4687 )
4688 }
4689 #[inline(always)]
4690 fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
4691 let (a0, a1) = self.split_i32x8(a);
4692 let (b0, b1) = self.split_i32x8(b);
4693 let lo_even = self.unzip_low_i32x4(a0, a1);
4694 let lo_odd = self.unzip_high_i32x4(a0, a1);
4695 let hi_even = self.unzip_low_i32x4(b0, b1);
4696 let hi_odd = self.unzip_high_i32x4(b0, b1);
4697 (
4698 self.combine_i32x4(lo_even, hi_even),
4699 self.combine_i32x4(lo_odd, hi_odd),
4700 )
4701 }
4702 #[inline(always)]
4703 fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
4704 let (a0, a1) = self.split_mask32x8(a);
4705 let (b0, b1) = self.split_i32x8(b);
4706 let (c0, c1) = self.split_i32x8(c);
4707 self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1))
4708 }
4709 #[inline(always)]
4710 fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4711 let (a0, a1) = self.split_i32x8(a);
4712 let (b0, b1) = self.split_i32x8(b);
4713 self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1))
4714 }
4715 #[inline(always)]
4716 fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4717 let (a0, a1) = self.split_i32x8(a);
4718 let (b0, b1) = self.split_i32x8(b);
4719 self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1))
4720 }
4721 #[inline(always)]
4722 fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
4723 i32x16 {
4724 val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
4725 simd: self,
4726 }
4727 }
4728 #[inline(always)]
4729 fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
4730 (
4731 i32x4 {
4732 val: crate::support::Aligned128(a.val.0[0]),
4733 simd: self,
4734 },
4735 i32x4 {
4736 val: crate::support::Aligned128(a.val.0[1]),
4737 simd: self,
4738 },
4739 )
4740 }
4741 #[inline(always)]
4742 fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
4743 let (a0, a1) = self.split_i32x8(a);
4744 self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1))
4745 }
4746 #[inline(always)]
4747 fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
4748 let (a0, a1) = self.split_i32x8(a);
4749 self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1))
4750 }
4751 #[inline(always)]
4752 fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
4753 let (a0, a1) = self.split_i32x8(a);
4754 self.combine_u32x4(
4755 self.reinterpret_u32_i32x4(a0),
4756 self.reinterpret_u32_i32x4(a1),
4757 )
4758 }
4759 #[inline(always)]
4760 fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
4761 let (a0, a1) = self.split_i32x8(a);
4762 self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1))
4763 }
4764 #[inline(always)]
4765 fn splat_u32x8(self, val: u32) -> u32x8<Self> {
4766 let half = self.splat_u32x4(val);
4767 self.combine_u32x4(half, half)
4768 }
4769 #[inline(always)]
4770 fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
4771 u32x8 {
4772 val: unsafe { core::mem::transmute_copy(&val) },
4773 simd: self,
4774 }
4775 }
4776 #[inline(always)]
4777 fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
4778 u32x8 {
4779 val: unsafe { core::mem::transmute_copy(val) },
4780 simd: self,
4781 }
4782 }
4783 #[inline(always)]
4784 fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
4785 unsafe { core::mem::transmute::<[__m128i; 2usize], [u32; 8usize]>(a.val.0) }
4786 }
4787 #[inline(always)]
4788 fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
4789 unsafe { core::mem::transmute::<&[__m128i; 2usize], &[u32; 8usize]>(&a.val.0) }
4790 }
4791 #[inline(always)]
4792 fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
4793 unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [u32; 8usize]>(&mut a.val.0) }
4794 }
4795 #[inline(always)]
4796 fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
4797 unsafe {
4798 core::ptr::copy_nonoverlapping(
4799 (&raw const a.val.0) as *const u32,
4800 dest.as_mut_ptr(),
4801 8usize,
4802 );
4803 }
4804 }
4805 #[inline(always)]
4806 fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
4807 unsafe {
4808 u32x8 {
4809 val: core::mem::transmute(a.val),
4810 simd: self,
4811 }
4812 }
4813 }
4814 #[inline(always)]
4815 fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
4816 unsafe {
4817 u8x32 {
4818 val: core::mem::transmute(a.val),
4819 simd: self,
4820 }
4821 }
4822 }
4823 #[inline(always)]
4824 fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4825 unsafe {
4826 if SHIFT >= 8usize {
4827 return b;
4828 }
4829 let result = cross_block_alignr_128x2(
4830 self.cvt_to_bytes_u32x8(b).val.0,
4831 self.cvt_to_bytes_u32x8(a).val.0,
4832 SHIFT * 4usize,
4833 );
4834 self.cvt_from_bytes_u32x8(u8x32 {
4835 val: crate::support::Aligned256(result),
4836 simd: self,
4837 })
4838 }
4839 }
4840 #[inline(always)]
4841 fn slide_within_blocks_u32x8<const SHIFT: usize>(
4842 self,
4843 a: u32x8<Self>,
4844 b: u32x8<Self>,
4845 ) -> u32x8<Self> {
4846 let (a0, a1) = self.split_u32x8(a);
4847 let (b0, b1) = self.split_u32x8(b);
4848 self.combine_u32x4(
4849 self.slide_within_blocks_u32x4::<SHIFT>(a0, b0),
4850 self.slide_within_blocks_u32x4::<SHIFT>(a1, b1),
4851 )
4852 }
4853 #[inline(always)]
4854 fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4855 let (a0, a1) = self.split_u32x8(a);
4856 let (b0, b1) = self.split_u32x8(b);
4857 self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1))
4858 }
4859 #[inline(always)]
4860 fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4861 let (a0, a1) = self.split_u32x8(a);
4862 let (b0, b1) = self.split_u32x8(b);
4863 self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1))
4864 }
4865 #[inline(always)]
4866 fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4867 let (a0, a1) = self.split_u32x8(a);
4868 let (b0, b1) = self.split_u32x8(b);
4869 self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1))
4870 }
4871 #[inline(always)]
4872 fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4873 let (a0, a1) = self.split_u32x8(a);
4874 let (b0, b1) = self.split_u32x8(b);
4875 self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1))
4876 }
4877 #[inline(always)]
4878 fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4879 let (a0, a1) = self.split_u32x8(a);
4880 let (b0, b1) = self.split_u32x8(b);
4881 self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1))
4882 }
4883 #[inline(always)]
4884 fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4885 let (a0, a1) = self.split_u32x8(a);
4886 let (b0, b1) = self.split_u32x8(b);
4887 self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1))
4888 }
4889 #[inline(always)]
4890 fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
4891 let (a0, a1) = self.split_u32x8(a);
4892 self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1))
4893 }
4894 #[inline(always)]
4895 fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
4896 let (a0, a1) = self.split_u32x8(a);
4897 self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift))
4898 }
4899 #[inline(always)]
4900 fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4901 let (a0, a1) = self.split_u32x8(a);
4902 let (b0, b1) = self.split_u32x8(b);
4903 self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1))
4904 }
4905 #[inline(always)]
4906 fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
4907 let (a0, a1) = self.split_u32x8(a);
4908 self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift))
4909 }
4910 #[inline(always)]
4911 fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4912 let (a0, a1) = self.split_u32x8(a);
4913 let (b0, b1) = self.split_u32x8(b);
4914 self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1))
4915 }
4916 #[inline(always)]
4917 fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4918 let (a0, a1) = self.split_u32x8(a);
4919 let (b0, b1) = self.split_u32x8(b);
4920 self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1))
4921 }
4922 #[inline(always)]
4923 fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4924 let (a0, a1) = self.split_u32x8(a);
4925 let (b0, b1) = self.split_u32x8(b);
4926 self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1))
4927 }
4928 #[inline(always)]
4929 fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4930 let (a0, a1) = self.split_u32x8(a);
4931 let (b0, b1) = self.split_u32x8(b);
4932 self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1))
4933 }
4934 #[inline(always)]
4935 fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4936 let (a0, a1) = self.split_u32x8(a);
4937 let (b0, b1) = self.split_u32x8(b);
4938 self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1))
4939 }
4940 #[inline(always)]
4941 fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4942 let (a0, a1) = self.split_u32x8(a);
4943 let (b0, b1) = self.split_u32x8(b);
4944 self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1))
4945 }
4946 #[inline(always)]
4947 fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4948 let (a0, _) = self.split_u32x8(a);
4949 let (b0, _) = self.split_u32x8(b);
4950 self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0))
4951 }
4952 #[inline(always)]
4953 fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4954 let (_, a1) = self.split_u32x8(a);
4955 let (_, b1) = self.split_u32x8(b);
4956 self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1))
4957 }
4958 #[inline(always)]
4959 fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4960 let (a0, a1) = self.split_u32x8(a);
4961 let (b0, b1) = self.split_u32x8(b);
4962 self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1))
4963 }
4964 #[inline(always)]
4965 fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4966 let (a0, a1) = self.split_u32x8(a);
4967 let (b0, b1) = self.split_u32x8(b);
4968 self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1))
4969 }
4970 #[inline(always)]
4971 fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
4972 let (a0, a1) = self.split_u32x8(a);
4973 let (b0, b1) = self.split_u32x8(b);
4974 let lo_lo = self.zip_low_u32x4(a0, b0);
4975 let lo_hi = self.zip_high_u32x4(a0, b0);
4976 let hi_lo = self.zip_low_u32x4(a1, b1);
4977 let hi_hi = self.zip_high_u32x4(a1, b1);
4978 (
4979 self.combine_u32x4(lo_lo, lo_hi),
4980 self.combine_u32x4(hi_lo, hi_hi),
4981 )
4982 }
4983 #[inline(always)]
4984 fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
4985 let (a0, a1) = self.split_u32x8(a);
4986 let (b0, b1) = self.split_u32x8(b);
4987 let lo_even = self.unzip_low_u32x4(a0, a1);
4988 let lo_odd = self.unzip_high_u32x4(a0, a1);
4989 let hi_even = self.unzip_low_u32x4(b0, b1);
4990 let hi_odd = self.unzip_high_u32x4(b0, b1);
4991 (
4992 self.combine_u32x4(lo_even, hi_even),
4993 self.combine_u32x4(lo_odd, hi_odd),
4994 )
4995 }
4996 #[inline(always)]
4997 fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
4998 let (a0, a1) = self.split_mask32x8(a);
4999 let (b0, b1) = self.split_u32x8(b);
5000 let (c0, c1) = self.split_u32x8(c);
5001 self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1))
5002 }
5003 #[inline(always)]
5004 fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
5005 let (a0, a1) = self.split_u32x8(a);
5006 let (b0, b1) = self.split_u32x8(b);
5007 self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1))
5008 }
5009 #[inline(always)]
5010 fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
5011 let (a0, a1) = self.split_u32x8(a);
5012 let (b0, b1) = self.split_u32x8(b);
5013 self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1))
5014 }
5015 #[inline(always)]
5016 fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
5017 u32x16 {
5018 val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
5019 simd: self,
5020 }
5021 }
5022 #[inline(always)]
5023 fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
5024 (
5025 u32x4 {
5026 val: crate::support::Aligned128(a.val.0[0]),
5027 simd: self,
5028 },
5029 u32x4 {
5030 val: crate::support::Aligned128(a.val.0[1]),
5031 simd: self,
5032 },
5033 )
5034 }
5035 #[inline(always)]
5036 fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
5037 let (a0, a1) = self.split_u32x8(a);
5038 self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1))
5039 }
5040 #[inline(always)]
5041 fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
5042 let (a0, a1) = self.split_u32x8(a);
5043 self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1))
5044 }
5045 #[inline(always)]
5046 fn splat_mask32x8(self, val: i32) -> mask32x8<Self> {
5047 let half = self.splat_mask32x4(val);
5048 self.combine_mask32x4(half, half)
5049 }
5050 #[inline(always)]
5051 fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
5052 mask32x8 {
5053 val: unsafe { core::mem::transmute_copy(&val) },
5054 simd: self,
5055 }
5056 }
5057 #[inline(always)]
5058 fn load_array_ref_mask32x8(self, val: &[i32; 8usize]) -> mask32x8<Self> {
5059 mask32x8 {
5060 val: unsafe { core::mem::transmute_copy(val) },
5061 simd: self,
5062 }
5063 }
5064 #[inline(always)]
5065 fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
5066 unsafe { core::mem::transmute::<[__m128i; 2usize], [i32; 8usize]>(a.val.0) }
5067 }
5068 #[inline(always)]
5069 fn as_array_ref_mask32x8(self, a: &mask32x8<Self>) -> &[i32; 8usize] {
5070 unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i32; 8usize]>(&a.val.0) }
5071 }
5072 #[inline(always)]
5073 fn as_array_mut_mask32x8(self, a: &mut mask32x8<Self>) -> &mut [i32; 8usize] {
5074 unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i32; 8usize]>(&mut a.val.0) }
5075 }
5076 #[inline(always)]
5077 fn store_array_mask32x8(self, a: mask32x8<Self>, dest: &mut [i32; 8usize]) -> () {
5078 unsafe {
5079 core::ptr::copy_nonoverlapping(
5080 (&raw const a.val.0) as *const i32,
5081 dest.as_mut_ptr(),
5082 8usize,
5083 );
5084 }
5085 }
5086 #[inline(always)]
5087 fn cvt_from_bytes_mask32x8(self, a: u8x32<Self>) -> mask32x8<Self> {
5088 unsafe {
5089 mask32x8 {
5090 val: core::mem::transmute(a.val),
5091 simd: self,
5092 }
5093 }
5094 }
5095 #[inline(always)]
5096 fn cvt_to_bytes_mask32x8(self, a: mask32x8<Self>) -> u8x32<Self> {
5097 unsafe {
5098 u8x32 {
5099 val: core::mem::transmute(a.val),
5100 simd: self,
5101 }
5102 }
5103 }
5104 #[inline(always)]
5105 fn slide_mask32x8<const SHIFT: usize>(
5106 self,
5107 a: mask32x8<Self>,
5108 b: mask32x8<Self>,
5109 ) -> mask32x8<Self> {
5110 unsafe {
5111 if SHIFT >= 8usize {
5112 return b;
5113 }
5114 let result = cross_block_alignr_128x2(
5115 self.cvt_to_bytes_mask32x8(b).val.0,
5116 self.cvt_to_bytes_mask32x8(a).val.0,
5117 SHIFT * 4usize,
5118 );
5119 self.cvt_from_bytes_mask32x8(u8x32 {
5120 val: crate::support::Aligned256(result),
5121 simd: self,
5122 })
5123 }
5124 }
5125 #[inline(always)]
5126 fn slide_within_blocks_mask32x8<const SHIFT: usize>(
5127 self,
5128 a: mask32x8<Self>,
5129 b: mask32x8<Self>,
5130 ) -> mask32x8<Self> {
5131 let (a0, a1) = self.split_mask32x8(a);
5132 let (b0, b1) = self.split_mask32x8(b);
5133 self.combine_mask32x4(
5134 self.slide_within_blocks_mask32x4::<SHIFT>(a0, b0),
5135 self.slide_within_blocks_mask32x4::<SHIFT>(a1, b1),
5136 )
5137 }
5138 #[inline(always)]
5139 fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5140 let (a0, a1) = self.split_mask32x8(a);
5141 let (b0, b1) = self.split_mask32x8(b);
5142 self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1))
5143 }
5144 #[inline(always)]
5145 fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5146 let (a0, a1) = self.split_mask32x8(a);
5147 let (b0, b1) = self.split_mask32x8(b);
5148 self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1))
5149 }
5150 #[inline(always)]
5151 fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5152 let (a0, a1) = self.split_mask32x8(a);
5153 let (b0, b1) = self.split_mask32x8(b);
5154 self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1))
5155 }
5156 #[inline(always)]
5157 fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
5158 let (a0, a1) = self.split_mask32x8(a);
5159 self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1))
5160 }
5161 #[inline(always)]
5162 fn select_mask32x8(
5163 self,
5164 a: mask32x8<Self>,
5165 b: mask32x8<Self>,
5166 c: mask32x8<Self>,
5167 ) -> mask32x8<Self> {
5168 let (a0, a1) = self.split_mask32x8(a);
5169 let (b0, b1) = self.split_mask32x8(b);
5170 let (c0, c1) = self.split_mask32x8(c);
5171 self.combine_mask32x4(
5172 self.select_mask32x4(a0, b0, c0),
5173 self.select_mask32x4(a1, b1, c1),
5174 )
5175 }
5176 #[inline(always)]
5177 fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5178 let (a0, a1) = self.split_mask32x8(a);
5179 let (b0, b1) = self.split_mask32x8(b);
5180 self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1))
5181 }
5182 #[inline(always)]
5183 fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
5184 let (a0, a1) = self.split_mask32x8(a);
5185 self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1)
5186 }
5187 #[inline(always)]
5188 fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
5189 let (a0, a1) = self.split_mask32x8(a);
5190 self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1)
5191 }
5192 #[inline(always)]
5193 fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
5194 let (a0, a1) = self.split_mask32x8(a);
5195 self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1)
5196 }
5197 #[inline(always)]
5198 fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
5199 let (a0, a1) = self.split_mask32x8(a);
5200 self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1)
5201 }
5202 #[inline(always)]
5203 fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
5204 mask32x16 {
5205 val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
5206 simd: self,
5207 }
5208 }
5209 #[inline(always)]
5210 fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
5211 (
5212 mask32x4 {
5213 val: crate::support::Aligned128(a.val.0[0]),
5214 simd: self,
5215 },
5216 mask32x4 {
5217 val: crate::support::Aligned128(a.val.0[1]),
5218 simd: self,
5219 },
5220 )
5221 }
5222 #[inline(always)]
5223 fn splat_f64x4(self, val: f64) -> f64x4<Self> {
5224 let half = self.splat_f64x2(val);
5225 self.combine_f64x2(half, half)
5226 }
5227 #[inline(always)]
5228 fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
5229 f64x4 {
5230 val: unsafe { core::mem::transmute_copy(&val) },
5231 simd: self,
5232 }
5233 }
5234 #[inline(always)]
5235 fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
5236 f64x4 {
5237 val: unsafe { core::mem::transmute_copy(val) },
5238 simd: self,
5239 }
5240 }
5241 #[inline(always)]
5242 fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
5243 unsafe { core::mem::transmute::<[__m128d; 2usize], [f64; 4usize]>(a.val.0) }
5244 }
5245 #[inline(always)]
5246 fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
5247 unsafe { core::mem::transmute::<&[__m128d; 2usize], &[f64; 4usize]>(&a.val.0) }
5248 }
5249 #[inline(always)]
5250 fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
5251 unsafe { core::mem::transmute::<&mut [__m128d; 2usize], &mut [f64; 4usize]>(&mut a.val.0) }
5252 }
5253 #[inline(always)]
5254 fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
5255 unsafe {
5256 core::ptr::copy_nonoverlapping(
5257 (&raw const a.val.0) as *const f64,
5258 dest.as_mut_ptr(),
5259 4usize,
5260 );
5261 }
5262 }
5263 #[inline(always)]
5264 fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
5265 unsafe {
5266 f64x4 {
5267 val: core::mem::transmute(a.val),
5268 simd: self,
5269 }
5270 }
5271 }
5272 #[inline(always)]
5273 fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
5274 unsafe {
5275 u8x32 {
5276 val: core::mem::transmute(a.val),
5277 simd: self,
5278 }
5279 }
5280 }
5281 #[inline(always)]
5282 fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5283 unsafe {
5284 if SHIFT >= 4usize {
5285 return b;
5286 }
5287 let result = cross_block_alignr_128x2(
5288 self.cvt_to_bytes_f64x4(b).val.0,
5289 self.cvt_to_bytes_f64x4(a).val.0,
5290 SHIFT * 8usize,
5291 );
5292 self.cvt_from_bytes_f64x4(u8x32 {
5293 val: crate::support::Aligned256(result),
5294 simd: self,
5295 })
5296 }
5297 }
5298 #[inline(always)]
5299 fn slide_within_blocks_f64x4<const SHIFT: usize>(
5300 self,
5301 a: f64x4<Self>,
5302 b: f64x4<Self>,
5303 ) -> f64x4<Self> {
5304 let (a0, a1) = self.split_f64x4(a);
5305 let (b0, b1) = self.split_f64x4(b);
5306 self.combine_f64x2(
5307 self.slide_within_blocks_f64x2::<SHIFT>(a0, b0),
5308 self.slide_within_blocks_f64x2::<SHIFT>(a1, b1),
5309 )
5310 }
5311 #[inline(always)]
5312 fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5313 let (a0, a1) = self.split_f64x4(a);
5314 self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1))
5315 }
5316 #[inline(always)]
5317 fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5318 let (a0, a1) = self.split_f64x4(a);
5319 self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1))
5320 }
5321 #[inline(always)]
5322 fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5323 let (a0, a1) = self.split_f64x4(a);
5324 self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
5325 }
5326 #[inline(always)]
5327 fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5328 let (a0, a1) = self.split_f64x4(a);
5329 let (b0, b1) = self.split_f64x4(b);
5330 self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1))
5331 }
5332 #[inline(always)]
5333 fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5334 let (a0, a1) = self.split_f64x4(a);
5335 let (b0, b1) = self.split_f64x4(b);
5336 self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1))
5337 }
5338 #[inline(always)]
5339 fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5340 let (a0, a1) = self.split_f64x4(a);
5341 let (b0, b1) = self.split_f64x4(b);
5342 self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1))
5343 }
5344 #[inline(always)]
5345 fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5346 let (a0, a1) = self.split_f64x4(a);
5347 let (b0, b1) = self.split_f64x4(b);
5348 self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1))
5349 }
5350 #[inline(always)]
5351 fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5352 let (a0, a1) = self.split_f64x4(a);
5353 let (b0, b1) = self.split_f64x4(b);
5354 self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1))
5355 }
5356 #[inline(always)]
5357 fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5358 let (a0, a1) = self.split_f64x4(a);
5359 let (b0, b1) = self.split_f64x4(b);
5360 self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1))
5361 }
5362 #[inline(always)]
5363 fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5364 let (a0, a1) = self.split_f64x4(a);
5365 let (b0, b1) = self.split_f64x4(b);
5366 self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1))
5367 }
5368 #[inline(always)]
5369 fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5370 let (a0, a1) = self.split_f64x4(a);
5371 let (b0, b1) = self.split_f64x4(b);
5372 self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1))
5373 }
5374 #[inline(always)]
5375 fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5376 let (a0, a1) = self.split_f64x4(a);
5377 let (b0, b1) = self.split_f64x4(b);
5378 self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1))
5379 }
5380 #[inline(always)]
5381 fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5382 let (a0, a1) = self.split_f64x4(a);
5383 let (b0, b1) = self.split_f64x4(b);
5384 self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1))
5385 }
5386 #[inline(always)]
5387 fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5388 let (a0, _) = self.split_f64x4(a);
5389 let (b0, _) = self.split_f64x4(b);
5390 self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0))
5391 }
5392 #[inline(always)]
5393 fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5394 let (_, a1) = self.split_f64x4(a);
5395 let (_, b1) = self.split_f64x4(b);
5396 self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1))
5397 }
5398 #[inline(always)]
5399 fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5400 let (a0, a1) = self.split_f64x4(a);
5401 let (b0, b1) = self.split_f64x4(b);
5402 self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1))
5403 }
5404 #[inline(always)]
5405 fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5406 let (a0, a1) = self.split_f64x4(a);
5407 let (b0, b1) = self.split_f64x4(b);
5408 self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1))
5409 }
5410 #[inline(always)]
5411 fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
5412 let (a0, a1) = self.split_f64x4(a);
5413 let (b0, b1) = self.split_f64x4(b);
5414 let lo_lo = self.zip_low_f64x2(a0, b0);
5415 let lo_hi = self.zip_high_f64x2(a0, b0);
5416 let hi_lo = self.zip_low_f64x2(a1, b1);
5417 let hi_hi = self.zip_high_f64x2(a1, b1);
5418 (
5419 self.combine_f64x2(lo_lo, lo_hi),
5420 self.combine_f64x2(hi_lo, hi_hi),
5421 )
5422 }
5423 #[inline(always)]
5424 fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
5425 let (a0, a1) = self.split_f64x4(a);
5426 let (b0, b1) = self.split_f64x4(b);
5427 let lo_even = self.unzip_low_f64x2(a0, a1);
5428 let lo_odd = self.unzip_high_f64x2(a0, a1);
5429 let hi_even = self.unzip_low_f64x2(b0, b1);
5430 let hi_odd = self.unzip_high_f64x2(b0, b1);
5431 (
5432 self.combine_f64x2(lo_even, hi_even),
5433 self.combine_f64x2(lo_odd, hi_odd),
5434 )
5435 }
5436 #[inline(always)]
5437 fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5438 let (a0, a1) = self.split_f64x4(a);
5439 let (b0, b1) = self.split_f64x4(b);
5440 self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1))
5441 }
5442 #[inline(always)]
5443 fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5444 let (a0, a1) = self.split_f64x4(a);
5445 let (b0, b1) = self.split_f64x4(b);
5446 self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1))
5447 }
5448 #[inline(always)]
5449 fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5450 let (a0, a1) = self.split_f64x4(a);
5451 let (b0, b1) = self.split_f64x4(b);
5452 self.combine_f64x2(
5453 self.max_precise_f64x2(a0, b0),
5454 self.max_precise_f64x2(a1, b1),
5455 )
5456 }
5457 #[inline(always)]
5458 fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5459 let (a0, a1) = self.split_f64x4(a);
5460 let (b0, b1) = self.split_f64x4(b);
5461 self.combine_f64x2(
5462 self.min_precise_f64x2(a0, b0),
5463 self.min_precise_f64x2(a1, b1),
5464 )
5465 }
5466 #[inline(always)]
5467 fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
5468 let (a0, a1) = self.split_f64x4(a);
5469 let (b0, b1) = self.split_f64x4(b);
5470 let (c0, c1) = self.split_f64x4(c);
5471 self.combine_f64x2(
5472 self.mul_add_f64x2(a0, b0, c0),
5473 self.mul_add_f64x2(a1, b1, c1),
5474 )
5475 }
5476 #[inline(always)]
5477 fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
5478 let (a0, a1) = self.split_f64x4(a);
5479 let (b0, b1) = self.split_f64x4(b);
5480 let (c0, c1) = self.split_f64x4(c);
5481 self.combine_f64x2(
5482 self.mul_sub_f64x2(a0, b0, c0),
5483 self.mul_sub_f64x2(a1, b1, c1),
5484 )
5485 }
5486 #[inline(always)]
5487 fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5488 let (a0, a1) = self.split_f64x4(a);
5489 self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
5490 }
5491 #[inline(always)]
5492 fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5493 let (a0, a1) = self.split_f64x4(a);
5494 self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1))
5495 }
5496 #[inline(always)]
5497 fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5498 let (a0, a1) = self.split_f64x4(a);
5499 self.combine_f64x2(
5500 self.round_ties_even_f64x2(a0),
5501 self.round_ties_even_f64x2(a1),
5502 )
5503 }
5504 #[inline(always)]
5505 fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5506 let (a0, a1) = self.split_f64x4(a);
5507 self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1))
5508 }
5509 #[inline(always)]
5510 fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5511 let (a0, a1) = self.split_f64x4(a);
5512 self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1))
5513 }
5514 #[inline(always)]
5515 fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
5516 let (a0, a1) = self.split_mask64x4(a);
5517 let (b0, b1) = self.split_f64x4(b);
5518 let (c0, c1) = self.split_f64x4(c);
5519 self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1))
5520 }
5521 #[inline(always)]
5522 fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
5523 f64x8 {
5524 val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
5525 simd: self,
5526 }
5527 }
5528 #[inline(always)]
5529 fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
5530 (
5531 f64x2 {
5532 val: crate::support::Aligned128(a.val.0[0]),
5533 simd: self,
5534 },
5535 f64x2 {
5536 val: crate::support::Aligned128(a.val.0[1]),
5537 simd: self,
5538 },
5539 )
5540 }
5541 #[inline(always)]
5542 fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
5543 let (a0, a1) = self.split_f64x4(a);
5544 self.combine_f32x4(
5545 self.reinterpret_f32_f64x2(a0),
5546 self.reinterpret_f32_f64x2(a1),
5547 )
5548 }
5549 #[inline(always)]
5550 fn splat_mask64x4(self, val: i64) -> mask64x4<Self> {
5551 let half = self.splat_mask64x2(val);
5552 self.combine_mask64x2(half, half)
5553 }
5554 #[inline(always)]
5555 fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
5556 mask64x4 {
5557 val: unsafe { core::mem::transmute_copy(&val) },
5558 simd: self,
5559 }
5560 }
5561 #[inline(always)]
5562 fn load_array_ref_mask64x4(self, val: &[i64; 4usize]) -> mask64x4<Self> {
5563 mask64x4 {
5564 val: unsafe { core::mem::transmute_copy(val) },
5565 simd: self,
5566 }
5567 }
5568 #[inline(always)]
5569 fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
5570 unsafe { core::mem::transmute::<[__m128i; 2usize], [i64; 4usize]>(a.val.0) }
5571 }
5572 #[inline(always)]
5573 fn as_array_ref_mask64x4(self, a: &mask64x4<Self>) -> &[i64; 4usize] {
5574 unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i64; 4usize]>(&a.val.0) }
5575 }
5576 #[inline(always)]
5577 fn as_array_mut_mask64x4(self, a: &mut mask64x4<Self>) -> &mut [i64; 4usize] {
5578 unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i64; 4usize]>(&mut a.val.0) }
5579 }
5580 #[inline(always)]
5581 fn store_array_mask64x4(self, a: mask64x4<Self>, dest: &mut [i64; 4usize]) -> () {
5582 unsafe {
5583 core::ptr::copy_nonoverlapping(
5584 (&raw const a.val.0) as *const i64,
5585 dest.as_mut_ptr(),
5586 4usize,
5587 );
5588 }
5589 }
5590 #[inline(always)]
5591 fn cvt_from_bytes_mask64x4(self, a: u8x32<Self>) -> mask64x4<Self> {
5592 unsafe {
5593 mask64x4 {
5594 val: core::mem::transmute(a.val),
5595 simd: self,
5596 }
5597 }
5598 }
5599 #[inline(always)]
5600 fn cvt_to_bytes_mask64x4(self, a: mask64x4<Self>) -> u8x32<Self> {
5601 unsafe {
5602 u8x32 {
5603 val: core::mem::transmute(a.val),
5604 simd: self,
5605 }
5606 }
5607 }
5608 #[inline(always)]
5609 fn slide_mask64x4<const SHIFT: usize>(
5610 self,
5611 a: mask64x4<Self>,
5612 b: mask64x4<Self>,
5613 ) -> mask64x4<Self> {
5614 unsafe {
5615 if SHIFT >= 4usize {
5616 return b;
5617 }
5618 let result = cross_block_alignr_128x2(
5619 self.cvt_to_bytes_mask64x4(b).val.0,
5620 self.cvt_to_bytes_mask64x4(a).val.0,
5621 SHIFT * 8usize,
5622 );
5623 self.cvt_from_bytes_mask64x4(u8x32 {
5624 val: crate::support::Aligned256(result),
5625 simd: self,
5626 })
5627 }
5628 }
5629 #[inline(always)]
5630 fn slide_within_blocks_mask64x4<const SHIFT: usize>(
5631 self,
5632 a: mask64x4<Self>,
5633 b: mask64x4<Self>,
5634 ) -> mask64x4<Self> {
5635 let (a0, a1) = self.split_mask64x4(a);
5636 let (b0, b1) = self.split_mask64x4(b);
5637 self.combine_mask64x2(
5638 self.slide_within_blocks_mask64x2::<SHIFT>(a0, b0),
5639 self.slide_within_blocks_mask64x2::<SHIFT>(a1, b1),
5640 )
5641 }
5642 #[inline(always)]
5643 fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5644 let (a0, a1) = self.split_mask64x4(a);
5645 let (b0, b1) = self.split_mask64x4(b);
5646 self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1))
5647 }
5648 #[inline(always)]
5649 fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5650 let (a0, a1) = self.split_mask64x4(a);
5651 let (b0, b1) = self.split_mask64x4(b);
5652 self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1))
5653 }
5654 #[inline(always)]
5655 fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5656 let (a0, a1) = self.split_mask64x4(a);
5657 let (b0, b1) = self.split_mask64x4(b);
5658 self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1))
5659 }
5660 #[inline(always)]
5661 fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
5662 let (a0, a1) = self.split_mask64x4(a);
5663 self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1))
5664 }
5665 #[inline(always)]
5666 fn select_mask64x4(
5667 self,
5668 a: mask64x4<Self>,
5669 b: mask64x4<Self>,
5670 c: mask64x4<Self>,
5671 ) -> mask64x4<Self> {
5672 let (a0, a1) = self.split_mask64x4(a);
5673 let (b0, b1) = self.split_mask64x4(b);
5674 let (c0, c1) = self.split_mask64x4(c);
5675 self.combine_mask64x2(
5676 self.select_mask64x2(a0, b0, c0),
5677 self.select_mask64x2(a1, b1, c1),
5678 )
5679 }
5680 #[inline(always)]
5681 fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5682 let (a0, a1) = self.split_mask64x4(a);
5683 let (b0, b1) = self.split_mask64x4(b);
5684 self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1))
5685 }
5686 #[inline(always)]
5687 fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
5688 let (a0, a1) = self.split_mask64x4(a);
5689 self.any_true_mask64x2(a0) || self.any_true_mask64x2(a1)
5690 }
5691 #[inline(always)]
5692 fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
5693 let (a0, a1) = self.split_mask64x4(a);
5694 self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1)
5695 }
5696 #[inline(always)]
5697 fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
5698 let (a0, a1) = self.split_mask64x4(a);
5699 self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1)
5700 }
5701 #[inline(always)]
5702 fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
5703 let (a0, a1) = self.split_mask64x4(a);
5704 self.all_false_mask64x2(a0) && self.all_false_mask64x2(a1)
5705 }
5706 #[inline(always)]
5707 fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
5708 mask64x8 {
5709 val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
5710 simd: self,
5711 }
5712 }
5713 #[inline(always)]
5714 fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
5715 (
5716 mask64x2 {
5717 val: crate::support::Aligned128(a.val.0[0]),
5718 simd: self,
5719 },
5720 mask64x2 {
5721 val: crate::support::Aligned128(a.val.0[1]),
5722 simd: self,
5723 },
5724 )
5725 }
5726 #[inline(always)]
5727 fn splat_f32x16(self, val: f32) -> f32x16<Self> {
5728 let half = self.splat_f32x8(val);
5729 self.combine_f32x8(half, half)
5730 }
5731 #[inline(always)]
5732 fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
5733 f32x16 {
5734 val: unsafe { core::mem::transmute_copy(&val) },
5735 simd: self,
5736 }
5737 }
5738 #[inline(always)]
5739 fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
5740 f32x16 {
5741 val: unsafe { core::mem::transmute_copy(val) },
5742 simd: self,
5743 }
5744 }
5745 #[inline(always)]
5746 fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
5747 unsafe { core::mem::transmute::<[__m128; 4usize], [f32; 16usize]>(a.val.0) }
5748 }
5749 #[inline(always)]
5750 fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
5751 unsafe { core::mem::transmute::<&[__m128; 4usize], &[f32; 16usize]>(&a.val.0) }
5752 }
5753 #[inline(always)]
5754 fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
5755 unsafe { core::mem::transmute::<&mut [__m128; 4usize], &mut [f32; 16usize]>(&mut a.val.0) }
5756 }
5757 #[inline(always)]
5758 fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
5759 unsafe {
5760 core::ptr::copy_nonoverlapping(
5761 (&raw const a.val.0) as *const f32,
5762 dest.as_mut_ptr(),
5763 16usize,
5764 );
5765 }
5766 }
5767 #[inline(always)]
5768 fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
5769 unsafe {
5770 f32x16 {
5771 val: core::mem::transmute(a.val),
5772 simd: self,
5773 }
5774 }
5775 }
5776 #[inline(always)]
5777 fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
5778 unsafe {
5779 u8x64 {
5780 val: core::mem::transmute(a.val),
5781 simd: self,
5782 }
5783 }
5784 }
5785 #[inline(always)]
5786 fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5787 unsafe {
5788 if SHIFT >= 16usize {
5789 return b;
5790 }
5791 let result = cross_block_alignr_128x4(
5792 self.cvt_to_bytes_f32x16(b).val.0,
5793 self.cvt_to_bytes_f32x16(a).val.0,
5794 SHIFT * 4usize,
5795 );
5796 self.cvt_from_bytes_f32x16(u8x64 {
5797 val: crate::support::Aligned512(result),
5798 simd: self,
5799 })
5800 }
5801 }
5802 #[inline(always)]
5803 fn slide_within_blocks_f32x16<const SHIFT: usize>(
5804 self,
5805 a: f32x16<Self>,
5806 b: f32x16<Self>,
5807 ) -> f32x16<Self> {
5808 let (a0, a1) = self.split_f32x16(a);
5809 let (b0, b1) = self.split_f32x16(b);
5810 self.combine_f32x8(
5811 self.slide_within_blocks_f32x8::<SHIFT>(a0, b0),
5812 self.slide_within_blocks_f32x8::<SHIFT>(a1, b1),
5813 )
5814 }
5815 #[inline(always)]
5816 fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5817 let (a0, a1) = self.split_f32x16(a);
5818 self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
5819 }
5820 #[inline(always)]
5821 fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5822 let (a0, a1) = self.split_f32x16(a);
5823 self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
5824 }
5825 #[inline(always)]
5826 fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5827 let (a0, a1) = self.split_f32x16(a);
5828 self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
5829 }
5830 #[inline(always)]
5831 fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5832 let (a0, a1) = self.split_f32x16(a);
5833 let (b0, b1) = self.split_f32x16(b);
5834 self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
5835 }
5836 #[inline(always)]
5837 fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5838 let (a0, a1) = self.split_f32x16(a);
5839 let (b0, b1) = self.split_f32x16(b);
5840 self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
5841 }
5842 #[inline(always)]
5843 fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5844 let (a0, a1) = self.split_f32x16(a);
5845 let (b0, b1) = self.split_f32x16(b);
5846 self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
5847 }
5848 #[inline(always)]
5849 fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5850 let (a0, a1) = self.split_f32x16(a);
5851 let (b0, b1) = self.split_f32x16(b);
5852 self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
5853 }
5854 #[inline(always)]
5855 fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5856 let (a0, a1) = self.split_f32x16(a);
5857 let (b0, b1) = self.split_f32x16(b);
5858 self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
5859 }
5860 #[inline(always)]
5861 fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5862 let (a0, a1) = self.split_f32x16(a);
5863 let (b0, b1) = self.split_f32x16(b);
5864 self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
5865 }
5866 #[inline(always)]
5867 fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5868 let (a0, a1) = self.split_f32x16(a);
5869 let (b0, b1) = self.split_f32x16(b);
5870 self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
5871 }
5872 #[inline(always)]
5873 fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5874 let (a0, a1) = self.split_f32x16(a);
5875 let (b0, b1) = self.split_f32x16(b);
5876 self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
5877 }
5878 #[inline(always)]
5879 fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5880 let (a0, a1) = self.split_f32x16(a);
5881 let (b0, b1) = self.split_f32x16(b);
5882 self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
5883 }
5884 #[inline(always)]
5885 fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5886 let (a0, a1) = self.split_f32x16(a);
5887 let (b0, b1) = self.split_f32x16(b);
5888 self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
5889 }
5890 #[inline(always)]
5891 fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5892 let (a0, _) = self.split_f32x16(a);
5893 let (b0, _) = self.split_f32x16(b);
5894 self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
5895 }
5896 #[inline(always)]
5897 fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5898 let (_, a1) = self.split_f32x16(a);
5899 let (_, b1) = self.split_f32x16(b);
5900 self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
5901 }
5902 #[inline(always)]
5903 fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5904 let (a0, a1) = self.split_f32x16(a);
5905 let (b0, b1) = self.split_f32x16(b);
5906 self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
5907 }
5908 #[inline(always)]
5909 fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5910 let (a0, a1) = self.split_f32x16(a);
5911 let (b0, b1) = self.split_f32x16(b);
5912 self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
5913 }
5914 #[inline(always)]
5915 fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
5916 let (a0, a1) = self.split_f32x16(a);
5917 let (b0, b1) = self.split_f32x16(b);
5918 let lo_lo = self.zip_low_f32x8(a0, b0);
5919 let lo_hi = self.zip_high_f32x8(a0, b0);
5920 let hi_lo = self.zip_low_f32x8(a1, b1);
5921 let hi_hi = self.zip_high_f32x8(a1, b1);
5922 (
5923 self.combine_f32x8(lo_lo, lo_hi),
5924 self.combine_f32x8(hi_lo, hi_hi),
5925 )
5926 }
5927 #[inline(always)]
5928 fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
5929 let (a0, a1) = self.split_f32x16(a);
5930 let (b0, b1) = self.split_f32x16(b);
5931 let lo_even = self.unzip_low_f32x8(a0, a1);
5932 let lo_odd = self.unzip_high_f32x8(a0, a1);
5933 let hi_even = self.unzip_low_f32x8(b0, b1);
5934 let hi_odd = self.unzip_high_f32x8(b0, b1);
5935 (
5936 self.combine_f32x8(lo_even, hi_even),
5937 self.combine_f32x8(lo_odd, hi_odd),
5938 )
5939 }
5940 #[inline(always)]
5941 fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5942 let (a0, a1) = self.split_f32x16(a);
5943 let (b0, b1) = self.split_f32x16(b);
5944 self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
5945 }
5946 #[inline(always)]
5947 fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5948 let (a0, a1) = self.split_f32x16(a);
5949 let (b0, b1) = self.split_f32x16(b);
5950 self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
5951 }
5952 #[inline(always)]
5953 fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5954 let (a0, a1) = self.split_f32x16(a);
5955 let (b0, b1) = self.split_f32x16(b);
5956 self.combine_f32x8(
5957 self.max_precise_f32x8(a0, b0),
5958 self.max_precise_f32x8(a1, b1),
5959 )
5960 }
5961 #[inline(always)]
5962 fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5963 let (a0, a1) = self.split_f32x16(a);
5964 let (b0, b1) = self.split_f32x16(b);
5965 self.combine_f32x8(
5966 self.min_precise_f32x8(a0, b0),
5967 self.min_precise_f32x8(a1, b1),
5968 )
5969 }
5970 #[inline(always)]
5971 fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
5972 let (a0, a1) = self.split_f32x16(a);
5973 let (b0, b1) = self.split_f32x16(b);
5974 let (c0, c1) = self.split_f32x16(c);
5975 self.combine_f32x8(
5976 self.mul_add_f32x8(a0, b0, c0),
5977 self.mul_add_f32x8(a1, b1, c1),
5978 )
5979 }
5980 #[inline(always)]
5981 fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
5982 let (a0, a1) = self.split_f32x16(a);
5983 let (b0, b1) = self.split_f32x16(b);
5984 let (c0, c1) = self.split_f32x16(c);
5985 self.combine_f32x8(
5986 self.mul_sub_f32x8(a0, b0, c0),
5987 self.mul_sub_f32x8(a1, b1, c1),
5988 )
5989 }
5990 #[inline(always)]
5991 fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5992 let (a0, a1) = self.split_f32x16(a);
5993 self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
5994 }
5995 #[inline(always)]
5996 fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5997 let (a0, a1) = self.split_f32x16(a);
5998 self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1))
5999 }
6000 #[inline(always)]
6001 fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
6002 let (a0, a1) = self.split_f32x16(a);
6003 self.combine_f32x8(
6004 self.round_ties_even_f32x8(a0),
6005 self.round_ties_even_f32x8(a1),
6006 )
6007 }
6008 #[inline(always)]
6009 fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
6010 let (a0, a1) = self.split_f32x16(a);
6011 self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
6012 }
6013 #[inline(always)]
6014 fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
6015 let (a0, a1) = self.split_f32x16(a);
6016 self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
6017 }
6018 #[inline(always)]
6019 fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
6020 let (a0, a1) = self.split_mask32x16(a);
6021 let (b0, b1) = self.split_f32x16(b);
6022 let (c0, c1) = self.split_f32x16(c);
6023 self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
6024 }
6025 #[inline(always)]
6026 fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
6027 (
6028 f32x8 {
6029 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
6030 simd: self,
6031 },
6032 f32x8 {
6033 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
6034 simd: self,
6035 },
6036 )
6037 }
6038 #[inline(always)]
6039 fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
6040 let (a0, a1) = self.split_f32x16(a);
6041 self.combine_f64x4(
6042 self.reinterpret_f64_f32x8(a0),
6043 self.reinterpret_f64_f32x8(a1),
6044 )
6045 }
6046 #[inline(always)]
6047 fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
6048 let (a0, a1) = self.split_f32x16(a);
6049 self.combine_i32x8(
6050 self.reinterpret_i32_f32x8(a0),
6051 self.reinterpret_i32_f32x8(a1),
6052 )
6053 }
6054 #[inline(always)]
6055 fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
6056 unsafe {
6057 let v0 = _mm_loadu_ps(src.as_ptr() as *const _);
6058 let v1 = _mm_loadu_ps(src.as_ptr().add(4usize) as *const _);
6059 let v2 = _mm_loadu_ps(src.as_ptr().add(2 * 4usize) as *const _);
6060 let v3 = _mm_loadu_ps(src.as_ptr().add(3 * 4usize) as *const _);
6061 let tmp0 = _mm_unpacklo_ps(v0, v1);
6062 let tmp1 = _mm_unpackhi_ps(v0, v1);
6063 let tmp2 = _mm_unpacklo_ps(v2, v3);
6064 let tmp3 = _mm_unpackhi_ps(v2, v3);
6065 let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
6066 let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
6067 let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
6068 let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
6069 self.combine_f32x8(
6070 self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)),
6071 self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)),
6072 )
6073 }
6074 }
6075 #[inline(always)]
6076 fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
6077 let (v01, v23) = self.split_f32x16(a);
6078 let (v0, v1) = self.split_f32x8(v01);
6079 let (v2, v3) = self.split_f32x8(v23);
6080 let v0 = v0.into();
6081 let v1 = v1.into();
6082 let v2 = v2.into();
6083 let v3 = v3.into();
6084 unsafe {
6085 let tmp0 = _mm_unpacklo_ps(v0, v1);
6086 let tmp1 = _mm_unpackhi_ps(v0, v1);
6087 let tmp2 = _mm_unpacklo_ps(v2, v3);
6088 let tmp3 = _mm_unpackhi_ps(v2, v3);
6089 let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
6090 let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
6091 let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
6092 let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
6093 _mm_storeu_ps(dest.as_mut_ptr() as *mut _, out0);
6094 _mm_storeu_ps(dest.as_mut_ptr().add(4usize) as *mut _, out1);
6095 _mm_storeu_ps(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
6096 _mm_storeu_ps(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
6097 }
6098 }
6099 #[inline(always)]
6100 fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
6101 let (a0, a1) = self.split_f32x16(a);
6102 self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
6103 }
6104 #[inline(always)]
6105 fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
6106 let (a0, a1) = self.split_f32x16(a);
6107 self.combine_u32x8(
6108 self.reinterpret_u32_f32x8(a0),
6109 self.reinterpret_u32_f32x8(a1),
6110 )
6111 }
6112 #[inline(always)]
6113 fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
6114 let (a0, a1) = self.split_f32x16(a);
6115 self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
6116 }
6117 #[inline(always)]
6118 fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
6119 let (a0, a1) = self.split_f32x16(a);
6120 self.combine_u32x8(
6121 self.cvt_u32_precise_f32x8(a0),
6122 self.cvt_u32_precise_f32x8(a1),
6123 )
6124 }
6125 #[inline(always)]
6126 fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
6127 let (a0, a1) = self.split_f32x16(a);
6128 self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
6129 }
6130 #[inline(always)]
6131 fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
6132 let (a0, a1) = self.split_f32x16(a);
6133 self.combine_i32x8(
6134 self.cvt_i32_precise_f32x8(a0),
6135 self.cvt_i32_precise_f32x8(a1),
6136 )
6137 }
6138 #[inline(always)]
6139 fn splat_i8x64(self, val: i8) -> i8x64<Self> {
6140 let half = self.splat_i8x32(val);
6141 self.combine_i8x32(half, half)
6142 }
6143 #[inline(always)]
6144 fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
6145 i8x64 {
6146 val: unsafe { core::mem::transmute_copy(&val) },
6147 simd: self,
6148 }
6149 }
6150 #[inline(always)]
6151 fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
6152 i8x64 {
6153 val: unsafe { core::mem::transmute_copy(val) },
6154 simd: self,
6155 }
6156 }
6157 #[inline(always)]
6158 fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
6159 unsafe { core::mem::transmute::<[__m128i; 4usize], [i8; 64usize]>(a.val.0) }
6160 }
6161 #[inline(always)]
6162 fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
6163 unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i8; 64usize]>(&a.val.0) }
6164 }
6165 #[inline(always)]
6166 fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
6167 unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i8; 64usize]>(&mut a.val.0) }
6168 }
6169 #[inline(always)]
6170 fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
6171 unsafe {
6172 core::ptr::copy_nonoverlapping(
6173 (&raw const a.val.0) as *const i8,
6174 dest.as_mut_ptr(),
6175 64usize,
6176 );
6177 }
6178 }
6179 #[inline(always)]
6180 fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
6181 unsafe {
6182 i8x64 {
6183 val: core::mem::transmute(a.val),
6184 simd: self,
6185 }
6186 }
6187 }
6188 #[inline(always)]
6189 fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
6190 unsafe {
6191 u8x64 {
6192 val: core::mem::transmute(a.val),
6193 simd: self,
6194 }
6195 }
6196 }
6197 #[inline(always)]
6198 fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6199 unsafe {
6200 if SHIFT >= 64usize {
6201 return b;
6202 }
6203 let result = cross_block_alignr_128x4(
6204 self.cvt_to_bytes_i8x64(b).val.0,
6205 self.cvt_to_bytes_i8x64(a).val.0,
6206 SHIFT,
6207 );
6208 self.cvt_from_bytes_i8x64(u8x64 {
6209 val: crate::support::Aligned512(result),
6210 simd: self,
6211 })
6212 }
6213 }
6214 #[inline(always)]
6215 fn slide_within_blocks_i8x64<const SHIFT: usize>(
6216 self,
6217 a: i8x64<Self>,
6218 b: i8x64<Self>,
6219 ) -> i8x64<Self> {
6220 let (a0, a1) = self.split_i8x64(a);
6221 let (b0, b1) = self.split_i8x64(b);
6222 self.combine_i8x32(
6223 self.slide_within_blocks_i8x32::<SHIFT>(a0, b0),
6224 self.slide_within_blocks_i8x32::<SHIFT>(a1, b1),
6225 )
6226 }
6227 #[inline(always)]
6228 fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6229 let (a0, a1) = self.split_i8x64(a);
6230 let (b0, b1) = self.split_i8x64(b);
6231 self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
6232 }
6233 #[inline(always)]
6234 fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6235 let (a0, a1) = self.split_i8x64(a);
6236 let (b0, b1) = self.split_i8x64(b);
6237 self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
6238 }
6239 #[inline(always)]
6240 fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6241 let (a0, a1) = self.split_i8x64(a);
6242 let (b0, b1) = self.split_i8x64(b);
6243 self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
6244 }
6245 #[inline(always)]
6246 fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6247 let (a0, a1) = self.split_i8x64(a);
6248 let (b0, b1) = self.split_i8x64(b);
6249 self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
6250 }
6251 #[inline(always)]
6252 fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6253 let (a0, a1) = self.split_i8x64(a);
6254 let (b0, b1) = self.split_i8x64(b);
6255 self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
6256 }
6257 #[inline(always)]
6258 fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6259 let (a0, a1) = self.split_i8x64(a);
6260 let (b0, b1) = self.split_i8x64(b);
6261 self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
6262 }
6263 #[inline(always)]
6264 fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
6265 let (a0, a1) = self.split_i8x64(a);
6266 self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
6267 }
6268 #[inline(always)]
6269 fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
6270 let (a0, a1) = self.split_i8x64(a);
6271 self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift))
6272 }
6273 #[inline(always)]
6274 fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6275 let (a0, a1) = self.split_i8x64(a);
6276 let (b0, b1) = self.split_i8x64(b);
6277 self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1))
6278 }
6279 #[inline(always)]
6280 fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
6281 let (a0, a1) = self.split_i8x64(a);
6282 self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift))
6283 }
6284 #[inline(always)]
6285 fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6286 let (a0, a1) = self.split_i8x64(a);
6287 let (b0, b1) = self.split_i8x64(b);
6288 self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
6289 }
6290 #[inline(always)]
6291 fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6292 let (a0, a1) = self.split_i8x64(a);
6293 let (b0, b1) = self.split_i8x64(b);
6294 self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
6295 }
6296 #[inline(always)]
6297 fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6298 let (a0, a1) = self.split_i8x64(a);
6299 let (b0, b1) = self.split_i8x64(b);
6300 self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
6301 }
6302 #[inline(always)]
6303 fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6304 let (a0, a1) = self.split_i8x64(a);
6305 let (b0, b1) = self.split_i8x64(b);
6306 self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
6307 }
6308 #[inline(always)]
6309 fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6310 let (a0, a1) = self.split_i8x64(a);
6311 let (b0, b1) = self.split_i8x64(b);
6312 self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
6313 }
6314 #[inline(always)]
6315 fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6316 let (a0, a1) = self.split_i8x64(a);
6317 let (b0, b1) = self.split_i8x64(b);
6318 self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
6319 }
6320 #[inline(always)]
6321 fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6322 let (a0, _) = self.split_i8x64(a);
6323 let (b0, _) = self.split_i8x64(b);
6324 self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
6325 }
6326 #[inline(always)]
6327 fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6328 let (_, a1) = self.split_i8x64(a);
6329 let (_, b1) = self.split_i8x64(b);
6330 self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
6331 }
6332 #[inline(always)]
6333 fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6334 let (a0, a1) = self.split_i8x64(a);
6335 let (b0, b1) = self.split_i8x64(b);
6336 self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
6337 }
6338 #[inline(always)]
6339 fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6340 let (a0, a1) = self.split_i8x64(a);
6341 let (b0, b1) = self.split_i8x64(b);
6342 self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
6343 }
6344 #[inline(always)]
6345 fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
6346 let (a0, a1) = self.split_i8x64(a);
6347 let (b0, b1) = self.split_i8x64(b);
6348 let lo_lo = self.zip_low_i8x32(a0, b0);
6349 let lo_hi = self.zip_high_i8x32(a0, b0);
6350 let hi_lo = self.zip_low_i8x32(a1, b1);
6351 let hi_hi = self.zip_high_i8x32(a1, b1);
6352 (
6353 self.combine_i8x32(lo_lo, lo_hi),
6354 self.combine_i8x32(hi_lo, hi_hi),
6355 )
6356 }
6357 #[inline(always)]
6358 fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
6359 let (a0, a1) = self.split_i8x64(a);
6360 let (b0, b1) = self.split_i8x64(b);
6361 let lo_even = self.unzip_low_i8x32(a0, a1);
6362 let lo_odd = self.unzip_high_i8x32(a0, a1);
6363 let hi_even = self.unzip_low_i8x32(b0, b1);
6364 let hi_odd = self.unzip_high_i8x32(b0, b1);
6365 (
6366 self.combine_i8x32(lo_even, hi_even),
6367 self.combine_i8x32(lo_odd, hi_odd),
6368 )
6369 }
6370 #[inline(always)]
6371 fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
6372 let (a0, a1) = self.split_mask8x64(a);
6373 let (b0, b1) = self.split_i8x64(b);
6374 let (c0, c1) = self.split_i8x64(c);
6375 self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
6376 }
6377 #[inline(always)]
6378 fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6379 let (a0, a1) = self.split_i8x64(a);
6380 let (b0, b1) = self.split_i8x64(b);
6381 self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
6382 }
6383 #[inline(always)]
6384 fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6385 let (a0, a1) = self.split_i8x64(a);
6386 let (b0, b1) = self.split_i8x64(b);
6387 self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
6388 }
6389 #[inline(always)]
6390 fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
6391 (
6392 i8x32 {
6393 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
6394 simd: self,
6395 },
6396 i8x32 {
6397 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
6398 simd: self,
6399 },
6400 )
6401 }
6402 #[inline(always)]
6403 fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
6404 let (a0, a1) = self.split_i8x64(a);
6405 self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
6406 }
6407 #[inline(always)]
6408 fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
6409 let (a0, a1) = self.split_i8x64(a);
6410 self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
6411 }
6412 #[inline(always)]
6413 fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
6414 let (a0, a1) = self.split_i8x64(a);
6415 self.combine_u32x8(
6416 self.reinterpret_u32_i8x32(a0),
6417 self.reinterpret_u32_i8x32(a1),
6418 )
6419 }
6420 #[inline(always)]
6421 fn splat_u8x64(self, val: u8) -> u8x64<Self> {
6422 let half = self.splat_u8x32(val);
6423 self.combine_u8x32(half, half)
6424 }
6425 #[inline(always)]
6426 fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
6427 u8x64 {
6428 val: unsafe { core::mem::transmute_copy(&val) },
6429 simd: self,
6430 }
6431 }
6432 #[inline(always)]
6433 fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
6434 u8x64 {
6435 val: unsafe { core::mem::transmute_copy(val) },
6436 simd: self,
6437 }
6438 }
6439 #[inline(always)]
6440 fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
6441 unsafe { core::mem::transmute::<[__m128i; 4usize], [u8; 64usize]>(a.val.0) }
6442 }
6443 #[inline(always)]
6444 fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
6445 unsafe { core::mem::transmute::<&[__m128i; 4usize], &[u8; 64usize]>(&a.val.0) }
6446 }
6447 #[inline(always)]
6448 fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
6449 unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [u8; 64usize]>(&mut a.val.0) }
6450 }
6451 #[inline(always)]
6452 fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
6453 unsafe {
6454 core::ptr::copy_nonoverlapping(
6455 (&raw const a.val.0) as *const u8,
6456 dest.as_mut_ptr(),
6457 64usize,
6458 );
6459 }
6460 }
6461 #[inline(always)]
6462 fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
6463 unsafe {
6464 u8x64 {
6465 val: core::mem::transmute(a.val),
6466 simd: self,
6467 }
6468 }
6469 }
6470 #[inline(always)]
6471 fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
6472 unsafe {
6473 u8x64 {
6474 val: core::mem::transmute(a.val),
6475 simd: self,
6476 }
6477 }
6478 }
6479 #[inline(always)]
6480 fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6481 unsafe {
6482 if SHIFT >= 64usize {
6483 return b;
6484 }
6485 let result = cross_block_alignr_128x4(
6486 self.cvt_to_bytes_u8x64(b).val.0,
6487 self.cvt_to_bytes_u8x64(a).val.0,
6488 SHIFT,
6489 );
6490 self.cvt_from_bytes_u8x64(u8x64 {
6491 val: crate::support::Aligned512(result),
6492 simd: self,
6493 })
6494 }
6495 }
6496 #[inline(always)]
6497 fn slide_within_blocks_u8x64<const SHIFT: usize>(
6498 self,
6499 a: u8x64<Self>,
6500 b: u8x64<Self>,
6501 ) -> u8x64<Self> {
6502 let (a0, a1) = self.split_u8x64(a);
6503 let (b0, b1) = self.split_u8x64(b);
6504 self.combine_u8x32(
6505 self.slide_within_blocks_u8x32::<SHIFT>(a0, b0),
6506 self.slide_within_blocks_u8x32::<SHIFT>(a1, b1),
6507 )
6508 }
6509 #[inline(always)]
6510 fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6511 let (a0, a1) = self.split_u8x64(a);
6512 let (b0, b1) = self.split_u8x64(b);
6513 self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
6514 }
6515 #[inline(always)]
6516 fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6517 let (a0, a1) = self.split_u8x64(a);
6518 let (b0, b1) = self.split_u8x64(b);
6519 self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
6520 }
6521 #[inline(always)]
6522 fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6523 let (a0, a1) = self.split_u8x64(a);
6524 let (b0, b1) = self.split_u8x64(b);
6525 self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
6526 }
6527 #[inline(always)]
6528 fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6529 let (a0, a1) = self.split_u8x64(a);
6530 let (b0, b1) = self.split_u8x64(b);
6531 self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
6532 }
6533 #[inline(always)]
6534 fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6535 let (a0, a1) = self.split_u8x64(a);
6536 let (b0, b1) = self.split_u8x64(b);
6537 self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
6538 }
6539 #[inline(always)]
6540 fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6541 let (a0, a1) = self.split_u8x64(a);
6542 let (b0, b1) = self.split_u8x64(b);
6543 self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
6544 }
6545 #[inline(always)]
6546 fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
6547 let (a0, a1) = self.split_u8x64(a);
6548 self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
6549 }
6550 #[inline(always)]
6551 fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
6552 let (a0, a1) = self.split_u8x64(a);
6553 self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift))
6554 }
6555 #[inline(always)]
6556 fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6557 let (a0, a1) = self.split_u8x64(a);
6558 let (b0, b1) = self.split_u8x64(b);
6559 self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1))
6560 }
6561 #[inline(always)]
6562 fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
6563 let (a0, a1) = self.split_u8x64(a);
6564 self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift))
6565 }
6566 #[inline(always)]
6567 fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6568 let (a0, a1) = self.split_u8x64(a);
6569 let (b0, b1) = self.split_u8x64(b);
6570 self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
6571 }
6572 #[inline(always)]
6573 fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6574 let (a0, a1) = self.split_u8x64(a);
6575 let (b0, b1) = self.split_u8x64(b);
6576 self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
6577 }
6578 #[inline(always)]
6579 fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6580 let (a0, a1) = self.split_u8x64(a);
6581 let (b0, b1) = self.split_u8x64(b);
6582 self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
6583 }
6584 #[inline(always)]
6585 fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6586 let (a0, a1) = self.split_u8x64(a);
6587 let (b0, b1) = self.split_u8x64(b);
6588 self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
6589 }
6590 #[inline(always)]
6591 fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6592 let (a0, a1) = self.split_u8x64(a);
6593 let (b0, b1) = self.split_u8x64(b);
6594 self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
6595 }
6596 #[inline(always)]
6597 fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6598 let (a0, a1) = self.split_u8x64(a);
6599 let (b0, b1) = self.split_u8x64(b);
6600 self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
6601 }
6602 #[inline(always)]
6603 fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6604 let (a0, _) = self.split_u8x64(a);
6605 let (b0, _) = self.split_u8x64(b);
6606 self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
6607 }
6608 #[inline(always)]
6609 fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6610 let (_, a1) = self.split_u8x64(a);
6611 let (_, b1) = self.split_u8x64(b);
6612 self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
6613 }
6614 #[inline(always)]
6615 fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6616 let (a0, a1) = self.split_u8x64(a);
6617 let (b0, b1) = self.split_u8x64(b);
6618 self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
6619 }
6620 #[inline(always)]
6621 fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6622 let (a0, a1) = self.split_u8x64(a);
6623 let (b0, b1) = self.split_u8x64(b);
6624 self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
6625 }
6626 #[inline(always)]
6627 fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
6628 let (a0, a1) = self.split_u8x64(a);
6629 let (b0, b1) = self.split_u8x64(b);
6630 let lo_lo = self.zip_low_u8x32(a0, b0);
6631 let lo_hi = self.zip_high_u8x32(a0, b0);
6632 let hi_lo = self.zip_low_u8x32(a1, b1);
6633 let hi_hi = self.zip_high_u8x32(a1, b1);
6634 (
6635 self.combine_u8x32(lo_lo, lo_hi),
6636 self.combine_u8x32(hi_lo, hi_hi),
6637 )
6638 }
6639 #[inline(always)]
6640 fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
6641 let (a0, a1) = self.split_u8x64(a);
6642 let (b0, b1) = self.split_u8x64(b);
6643 let lo_even = self.unzip_low_u8x32(a0, a1);
6644 let lo_odd = self.unzip_high_u8x32(a0, a1);
6645 let hi_even = self.unzip_low_u8x32(b0, b1);
6646 let hi_odd = self.unzip_high_u8x32(b0, b1);
6647 (
6648 self.combine_u8x32(lo_even, hi_even),
6649 self.combine_u8x32(lo_odd, hi_odd),
6650 )
6651 }
6652 #[inline(always)]
6653 fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
6654 let (a0, a1) = self.split_mask8x64(a);
6655 let (b0, b1) = self.split_u8x64(b);
6656 let (c0, c1) = self.split_u8x64(c);
6657 self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
6658 }
6659 #[inline(always)]
6660 fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6661 let (a0, a1) = self.split_u8x64(a);
6662 let (b0, b1) = self.split_u8x64(b);
6663 self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
6664 }
6665 #[inline(always)]
6666 fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6667 let (a0, a1) = self.split_u8x64(a);
6668 let (b0, b1) = self.split_u8x64(b);
6669 self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
6670 }
6671 #[inline(always)]
6672 fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
6673 (
6674 u8x32 {
6675 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
6676 simd: self,
6677 },
6678 u8x32 {
6679 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
6680 simd: self,
6681 },
6682 )
6683 }
6684 #[inline(always)]
6685 fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
6686 unsafe {
6687 let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
6688 let v1 = _mm_loadu_si128(src.as_ptr().add(16usize) as *const _);
6689 let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 16usize) as *const _);
6690 let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 16usize) as *const _);
6691 let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
6692 let v0 = _mm_shuffle_epi8(v0, mask);
6693 let v1 = _mm_shuffle_epi8(v1, mask);
6694 let v2 = _mm_shuffle_epi8(v2, mask);
6695 let v3 = _mm_shuffle_epi8(v3, mask);
6696 let tmp0 = _mm_unpacklo_epi32(v0, v1);
6697 let tmp1 = _mm_unpackhi_epi32(v0, v1);
6698 let tmp2 = _mm_unpacklo_epi32(v2, v3);
6699 let tmp3 = _mm_unpackhi_epi32(v2, v3);
6700 let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
6701 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
6702 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
6703 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
6704 self.combine_u8x32(
6705 self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)),
6706 self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)),
6707 )
6708 }
6709 }
6710 #[inline(always)]
6711 fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
6712 let (v01, v23) = self.split_u8x64(a);
6713 let (v0, v1) = self.split_u8x32(v01);
6714 let (v2, v3) = self.split_u8x32(v23);
6715 let v0 = v0.into();
6716 let v1 = v1.into();
6717 let v2 = v2.into();
6718 let v3 = v3.into();
6719 unsafe {
6720 let tmp0 = _mm_unpacklo_epi32(v0, v1);
6721 let tmp1 = _mm_unpackhi_epi32(v0, v1);
6722 let tmp2 = _mm_unpacklo_epi32(v2, v3);
6723 let tmp3 = _mm_unpackhi_epi32(v2, v3);
6724 let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
6725 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
6726 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
6727 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
6728 let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
6729 let out0 = _mm_shuffle_epi8(out0, mask);
6730 let out1 = _mm_shuffle_epi8(out1, mask);
6731 let out2 = _mm_shuffle_epi8(out2, mask);
6732 let out3 = _mm_shuffle_epi8(out3, mask);
6733 _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
6734 _mm_storeu_si128(dest.as_mut_ptr().add(16usize) as *mut _, out1);
6735 _mm_storeu_si128(dest.as_mut_ptr().add(2 * 16usize) as *mut _, out2);
6736 _mm_storeu_si128(dest.as_mut_ptr().add(3 * 16usize) as *mut _, out3);
6737 }
6738 }
6739 #[inline(always)]
6740 fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
6741 let (a0, a1) = self.split_u8x64(a);
6742 self.combine_u32x8(
6743 self.reinterpret_u32_u8x32(a0),
6744 self.reinterpret_u32_u8x32(a1),
6745 )
6746 }
6747 #[inline(always)]
6748 fn splat_mask8x64(self, val: i8) -> mask8x64<Self> {
6749 let half = self.splat_mask8x32(val);
6750 self.combine_mask8x32(half, half)
6751 }
6752 #[inline(always)]
6753 fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
6754 mask8x64 {
6755 val: unsafe { core::mem::transmute_copy(&val) },
6756 simd: self,
6757 }
6758 }
6759 #[inline(always)]
6760 fn load_array_ref_mask8x64(self, val: &[i8; 64usize]) -> mask8x64<Self> {
6761 mask8x64 {
6762 val: unsafe { core::mem::transmute_copy(val) },
6763 simd: self,
6764 }
6765 }
6766 #[inline(always)]
6767 fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
6768 unsafe { core::mem::transmute::<[__m128i; 4usize], [i8; 64usize]>(a.val.0) }
6769 }
6770 #[inline(always)]
6771 fn as_array_ref_mask8x64(self, a: &mask8x64<Self>) -> &[i8; 64usize] {
6772 unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i8; 64usize]>(&a.val.0) }
6773 }
6774 #[inline(always)]
6775 fn as_array_mut_mask8x64(self, a: &mut mask8x64<Self>) -> &mut [i8; 64usize] {
6776 unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i8; 64usize]>(&mut a.val.0) }
6777 }
6778 #[inline(always)]
6779 fn store_array_mask8x64(self, a: mask8x64<Self>, dest: &mut [i8; 64usize]) -> () {
6780 unsafe {
6781 core::ptr::copy_nonoverlapping(
6782 (&raw const a.val.0) as *const i8,
6783 dest.as_mut_ptr(),
6784 64usize,
6785 );
6786 }
6787 }
6788 #[inline(always)]
6789 fn cvt_from_bytes_mask8x64(self, a: u8x64<Self>) -> mask8x64<Self> {
6790 unsafe {
6791 mask8x64 {
6792 val: core::mem::transmute(a.val),
6793 simd: self,
6794 }
6795 }
6796 }
6797 #[inline(always)]
6798 fn cvt_to_bytes_mask8x64(self, a: mask8x64<Self>) -> u8x64<Self> {
6799 unsafe {
6800 u8x64 {
6801 val: core::mem::transmute(a.val),
6802 simd: self,
6803 }
6804 }
6805 }
6806 #[inline(always)]
6807 fn slide_mask8x64<const SHIFT: usize>(
6808 self,
6809 a: mask8x64<Self>,
6810 b: mask8x64<Self>,
6811 ) -> mask8x64<Self> {
6812 unsafe {
6813 if SHIFT >= 64usize {
6814 return b;
6815 }
6816 let result = cross_block_alignr_128x4(
6817 self.cvt_to_bytes_mask8x64(b).val.0,
6818 self.cvt_to_bytes_mask8x64(a).val.0,
6819 SHIFT,
6820 );
6821 self.cvt_from_bytes_mask8x64(u8x64 {
6822 val: crate::support::Aligned512(result),
6823 simd: self,
6824 })
6825 }
6826 }
6827 #[inline(always)]
6828 fn slide_within_blocks_mask8x64<const SHIFT: usize>(
6829 self,
6830 a: mask8x64<Self>,
6831 b: mask8x64<Self>,
6832 ) -> mask8x64<Self> {
6833 let (a0, a1) = self.split_mask8x64(a);
6834 let (b0, b1) = self.split_mask8x64(b);
6835 self.combine_mask8x32(
6836 self.slide_within_blocks_mask8x32::<SHIFT>(a0, b0),
6837 self.slide_within_blocks_mask8x32::<SHIFT>(a1, b1),
6838 )
6839 }
6840 #[inline(always)]
6841 fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6842 let (a0, a1) = self.split_mask8x64(a);
6843 let (b0, b1) = self.split_mask8x64(b);
6844 self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
6845 }
6846 #[inline(always)]
6847 fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6848 let (a0, a1) = self.split_mask8x64(a);
6849 let (b0, b1) = self.split_mask8x64(b);
6850 self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
6851 }
6852 #[inline(always)]
6853 fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6854 let (a0, a1) = self.split_mask8x64(a);
6855 let (b0, b1) = self.split_mask8x64(b);
6856 self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
6857 }
6858 #[inline(always)]
6859 fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
6860 let (a0, a1) = self.split_mask8x64(a);
6861 self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
6862 }
6863 #[inline(always)]
6864 fn select_mask8x64(
6865 self,
6866 a: mask8x64<Self>,
6867 b: mask8x64<Self>,
6868 c: mask8x64<Self>,
6869 ) -> mask8x64<Self> {
6870 let (a0, a1) = self.split_mask8x64(a);
6871 let (b0, b1) = self.split_mask8x64(b);
6872 let (c0, c1) = self.split_mask8x64(c);
6873 self.combine_mask8x32(
6874 self.select_mask8x32(a0, b0, c0),
6875 self.select_mask8x32(a1, b1, c1),
6876 )
6877 }
6878 #[inline(always)]
6879 fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6880 let (a0, a1) = self.split_mask8x64(a);
6881 let (b0, b1) = self.split_mask8x64(b);
6882 self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
6883 }
6884 #[inline(always)]
6885 fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
6886 let (a0, a1) = self.split_mask8x64(a);
6887 self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1)
6888 }
6889 #[inline(always)]
6890 fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
6891 let (a0, a1) = self.split_mask8x64(a);
6892 self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1)
6893 }
6894 #[inline(always)]
6895 fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
6896 let (a0, a1) = self.split_mask8x64(a);
6897 self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1)
6898 }
6899 #[inline(always)]
6900 fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
6901 let (a0, a1) = self.split_mask8x64(a);
6902 self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1)
6903 }
6904 #[inline(always)]
6905 fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
6906 (
6907 mask8x32 {
6908 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
6909 simd: self,
6910 },
6911 mask8x32 {
6912 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
6913 simd: self,
6914 },
6915 )
6916 }
6917 #[inline(always)]
6918 fn splat_i16x32(self, val: i16) -> i16x32<Self> {
6919 let half = self.splat_i16x16(val);
6920 self.combine_i16x16(half, half)
6921 }
6922 #[inline(always)]
6923 fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
6924 i16x32 {
6925 val: unsafe { core::mem::transmute_copy(&val) },
6926 simd: self,
6927 }
6928 }
6929 #[inline(always)]
6930 fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
6931 i16x32 {
6932 val: unsafe { core::mem::transmute_copy(val) },
6933 simd: self,
6934 }
6935 }
6936 #[inline(always)]
6937 fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
6938 unsafe { core::mem::transmute::<[__m128i; 4usize], [i16; 32usize]>(a.val.0) }
6939 }
6940 #[inline(always)]
6941 fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
6942 unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i16; 32usize]>(&a.val.0) }
6943 }
6944 #[inline(always)]
6945 fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
6946 unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i16; 32usize]>(&mut a.val.0) }
6947 }
6948 #[inline(always)]
6949 fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
6950 unsafe {
6951 core::ptr::copy_nonoverlapping(
6952 (&raw const a.val.0) as *const i16,
6953 dest.as_mut_ptr(),
6954 32usize,
6955 );
6956 }
6957 }
6958 #[inline(always)]
6959 fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
6960 unsafe {
6961 i16x32 {
6962 val: core::mem::transmute(a.val),
6963 simd: self,
6964 }
6965 }
6966 }
6967 #[inline(always)]
6968 fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
6969 unsafe {
6970 u8x64 {
6971 val: core::mem::transmute(a.val),
6972 simd: self,
6973 }
6974 }
6975 }
6976 #[inline(always)]
6977 fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6978 unsafe {
6979 if SHIFT >= 32usize {
6980 return b;
6981 }
6982 let result = cross_block_alignr_128x4(
6983 self.cvt_to_bytes_i16x32(b).val.0,
6984 self.cvt_to_bytes_i16x32(a).val.0,
6985 SHIFT * 2usize,
6986 );
6987 self.cvt_from_bytes_i16x32(u8x64 {
6988 val: crate::support::Aligned512(result),
6989 simd: self,
6990 })
6991 }
6992 }
6993 #[inline(always)]
6994 fn slide_within_blocks_i16x32<const SHIFT: usize>(
6995 self,
6996 a: i16x32<Self>,
6997 b: i16x32<Self>,
6998 ) -> i16x32<Self> {
6999 let (a0, a1) = self.split_i16x32(a);
7000 let (b0, b1) = self.split_i16x32(b);
7001 self.combine_i16x16(
7002 self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
7003 self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
7004 )
7005 }
7006 #[inline(always)]
7007 fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7008 let (a0, a1) = self.split_i16x32(a);
7009 let (b0, b1) = self.split_i16x32(b);
7010 self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
7011 }
7012 #[inline(always)]
7013 fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7014 let (a0, a1) = self.split_i16x32(a);
7015 let (b0, b1) = self.split_i16x32(b);
7016 self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
7017 }
7018 #[inline(always)]
7019 fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7020 let (a0, a1) = self.split_i16x32(a);
7021 let (b0, b1) = self.split_i16x32(b);
7022 self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
7023 }
7024 #[inline(always)]
7025 fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7026 let (a0, a1) = self.split_i16x32(a);
7027 let (b0, b1) = self.split_i16x32(b);
7028 self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
7029 }
7030 #[inline(always)]
7031 fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7032 let (a0, a1) = self.split_i16x32(a);
7033 let (b0, b1) = self.split_i16x32(b);
7034 self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
7035 }
7036 #[inline(always)]
7037 fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7038 let (a0, a1) = self.split_i16x32(a);
7039 let (b0, b1) = self.split_i16x32(b);
7040 self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
7041 }
7042 #[inline(always)]
7043 fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
7044 let (a0, a1) = self.split_i16x32(a);
7045 self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
7046 }
7047 #[inline(always)]
7048 fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
7049 let (a0, a1) = self.split_i16x32(a);
7050 self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift))
7051 }
7052 #[inline(always)]
7053 fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7054 let (a0, a1) = self.split_i16x32(a);
7055 let (b0, b1) = self.split_i16x32(b);
7056 self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1))
7057 }
7058 #[inline(always)]
7059 fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
7060 let (a0, a1) = self.split_i16x32(a);
7061 self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift))
7062 }
7063 #[inline(always)]
7064 fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7065 let (a0, a1) = self.split_i16x32(a);
7066 let (b0, b1) = self.split_i16x32(b);
7067 self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
7068 }
7069 #[inline(always)]
7070 fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7071 let (a0, a1) = self.split_i16x32(a);
7072 let (b0, b1) = self.split_i16x32(b);
7073 self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
7074 }
7075 #[inline(always)]
7076 fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7077 let (a0, a1) = self.split_i16x32(a);
7078 let (b0, b1) = self.split_i16x32(b);
7079 self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
7080 }
7081 #[inline(always)]
7082 fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7083 let (a0, a1) = self.split_i16x32(a);
7084 let (b0, b1) = self.split_i16x32(b);
7085 self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
7086 }
7087 #[inline(always)]
7088 fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7089 let (a0, a1) = self.split_i16x32(a);
7090 let (b0, b1) = self.split_i16x32(b);
7091 self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
7092 }
7093 #[inline(always)]
7094 fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7095 let (a0, a1) = self.split_i16x32(a);
7096 let (b0, b1) = self.split_i16x32(b);
7097 self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
7098 }
7099 #[inline(always)]
7100 fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7101 let (a0, _) = self.split_i16x32(a);
7102 let (b0, _) = self.split_i16x32(b);
7103 self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
7104 }
7105 #[inline(always)]
7106 fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7107 let (_, a1) = self.split_i16x32(a);
7108 let (_, b1) = self.split_i16x32(b);
7109 self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
7110 }
7111 #[inline(always)]
7112 fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7113 let (a0, a1) = self.split_i16x32(a);
7114 let (b0, b1) = self.split_i16x32(b);
7115 self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
7116 }
7117 #[inline(always)]
7118 fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7119 let (a0, a1) = self.split_i16x32(a);
7120 let (b0, b1) = self.split_i16x32(b);
7121 self.combine_i16x16(
7122 self.unzip_high_i16x16(a0, a1),
7123 self.unzip_high_i16x16(b0, b1),
7124 )
7125 }
7126 #[inline(always)]
7127 fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
7128 let (a0, a1) = self.split_i16x32(a);
7129 let (b0, b1) = self.split_i16x32(b);
7130 let lo_lo = self.zip_low_i16x16(a0, b0);
7131 let lo_hi = self.zip_high_i16x16(a0, b0);
7132 let hi_lo = self.zip_low_i16x16(a1, b1);
7133 let hi_hi = self.zip_high_i16x16(a1, b1);
7134 (
7135 self.combine_i16x16(lo_lo, lo_hi),
7136 self.combine_i16x16(hi_lo, hi_hi),
7137 )
7138 }
7139 #[inline(always)]
7140 fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
7141 let (a0, a1) = self.split_i16x32(a);
7142 let (b0, b1) = self.split_i16x32(b);
7143 let lo_even = self.unzip_low_i16x16(a0, a1);
7144 let lo_odd = self.unzip_high_i16x16(a0, a1);
7145 let hi_even = self.unzip_low_i16x16(b0, b1);
7146 let hi_odd = self.unzip_high_i16x16(b0, b1);
7147 (
7148 self.combine_i16x16(lo_even, hi_even),
7149 self.combine_i16x16(lo_odd, hi_odd),
7150 )
7151 }
7152 #[inline(always)]
7153 fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
7154 let (a0, a1) = self.split_mask16x32(a);
7155 let (b0, b1) = self.split_i16x32(b);
7156 let (c0, c1) = self.split_i16x32(c);
7157 self.combine_i16x16(
7158 self.select_i16x16(a0, b0, c0),
7159 self.select_i16x16(a1, b1, c1),
7160 )
7161 }
7162 #[inline(always)]
7163 fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7164 let (a0, a1) = self.split_i16x32(a);
7165 let (b0, b1) = self.split_i16x32(b);
7166 self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
7167 }
7168 #[inline(always)]
7169 fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7170 let (a0, a1) = self.split_i16x32(a);
7171 let (b0, b1) = self.split_i16x32(b);
7172 self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
7173 }
7174 #[inline(always)]
7175 fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
7176 (
7177 i16x16 {
7178 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
7179 simd: self,
7180 },
7181 i16x16 {
7182 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
7183 simd: self,
7184 },
7185 )
7186 }
7187 #[inline(always)]
7188 fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
7189 let (a0, a1) = self.split_i16x32(a);
7190 self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
7191 }
7192 #[inline(always)]
7193 fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
7194 let (a0, a1) = self.split_i16x32(a);
7195 self.combine_u8x32(
7196 self.reinterpret_u8_i16x16(a0),
7197 self.reinterpret_u8_i16x16(a1),
7198 )
7199 }
7200 #[inline(always)]
7201 fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
7202 let (a0, a1) = self.split_i16x32(a);
7203 self.combine_u32x8(
7204 self.reinterpret_u32_i16x16(a0),
7205 self.reinterpret_u32_i16x16(a1),
7206 )
7207 }
7208 #[inline(always)]
7209 fn splat_u16x32(self, val: u16) -> u16x32<Self> {
7210 let half = self.splat_u16x16(val);
7211 self.combine_u16x16(half, half)
7212 }
7213 #[inline(always)]
7214 fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
7215 u16x32 {
7216 val: unsafe { core::mem::transmute_copy(&val) },
7217 simd: self,
7218 }
7219 }
7220 #[inline(always)]
7221 fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
7222 u16x32 {
7223 val: unsafe { core::mem::transmute_copy(val) },
7224 simd: self,
7225 }
7226 }
7227 #[inline(always)]
7228 fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
7229 unsafe { core::mem::transmute::<[__m128i; 4usize], [u16; 32usize]>(a.val.0) }
7230 }
7231 #[inline(always)]
7232 fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
7233 unsafe { core::mem::transmute::<&[__m128i; 4usize], &[u16; 32usize]>(&a.val.0) }
7234 }
7235 #[inline(always)]
7236 fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
7237 unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [u16; 32usize]>(&mut a.val.0) }
7238 }
7239 #[inline(always)]
7240 fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
7241 unsafe {
7242 core::ptr::copy_nonoverlapping(
7243 (&raw const a.val.0) as *const u16,
7244 dest.as_mut_ptr(),
7245 32usize,
7246 );
7247 }
7248 }
7249 #[inline(always)]
7250 fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
7251 unsafe {
7252 u16x32 {
7253 val: core::mem::transmute(a.val),
7254 simd: self,
7255 }
7256 }
7257 }
7258 #[inline(always)]
7259 fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
7260 unsafe {
7261 u8x64 {
7262 val: core::mem::transmute(a.val),
7263 simd: self,
7264 }
7265 }
7266 }
7267 #[inline(always)]
7268 fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7269 unsafe {
7270 if SHIFT >= 32usize {
7271 return b;
7272 }
7273 let result = cross_block_alignr_128x4(
7274 self.cvt_to_bytes_u16x32(b).val.0,
7275 self.cvt_to_bytes_u16x32(a).val.0,
7276 SHIFT * 2usize,
7277 );
7278 self.cvt_from_bytes_u16x32(u8x64 {
7279 val: crate::support::Aligned512(result),
7280 simd: self,
7281 })
7282 }
7283 }
7284 #[inline(always)]
7285 fn slide_within_blocks_u16x32<const SHIFT: usize>(
7286 self,
7287 a: u16x32<Self>,
7288 b: u16x32<Self>,
7289 ) -> u16x32<Self> {
7290 let (a0, a1) = self.split_u16x32(a);
7291 let (b0, b1) = self.split_u16x32(b);
7292 self.combine_u16x16(
7293 self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
7294 self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
7295 )
7296 }
7297 #[inline(always)]
7298 fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7299 let (a0, a1) = self.split_u16x32(a);
7300 let (b0, b1) = self.split_u16x32(b);
7301 self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
7302 }
7303 #[inline(always)]
7304 fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7305 let (a0, a1) = self.split_u16x32(a);
7306 let (b0, b1) = self.split_u16x32(b);
7307 self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
7308 }
7309 #[inline(always)]
7310 fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7311 let (a0, a1) = self.split_u16x32(a);
7312 let (b0, b1) = self.split_u16x32(b);
7313 self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
7314 }
7315 #[inline(always)]
7316 fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7317 let (a0, a1) = self.split_u16x32(a);
7318 let (b0, b1) = self.split_u16x32(b);
7319 self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
7320 }
7321 #[inline(always)]
7322 fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7323 let (a0, a1) = self.split_u16x32(a);
7324 let (b0, b1) = self.split_u16x32(b);
7325 self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
7326 }
7327 #[inline(always)]
7328 fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7329 let (a0, a1) = self.split_u16x32(a);
7330 let (b0, b1) = self.split_u16x32(b);
7331 self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
7332 }
7333 #[inline(always)]
7334 fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
7335 let (a0, a1) = self.split_u16x32(a);
7336 self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
7337 }
7338 #[inline(always)]
7339 fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
7340 let (a0, a1) = self.split_u16x32(a);
7341 self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift))
7342 }
7343 #[inline(always)]
7344 fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7345 let (a0, a1) = self.split_u16x32(a);
7346 let (b0, b1) = self.split_u16x32(b);
7347 self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1))
7348 }
7349 #[inline(always)]
7350 fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
7351 let (a0, a1) = self.split_u16x32(a);
7352 self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift))
7353 }
7354 #[inline(always)]
7355 fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7356 let (a0, a1) = self.split_u16x32(a);
7357 let (b0, b1) = self.split_u16x32(b);
7358 self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
7359 }
7360 #[inline(always)]
7361 fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7362 let (a0, a1) = self.split_u16x32(a);
7363 let (b0, b1) = self.split_u16x32(b);
7364 self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
7365 }
7366 #[inline(always)]
7367 fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7368 let (a0, a1) = self.split_u16x32(a);
7369 let (b0, b1) = self.split_u16x32(b);
7370 self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
7371 }
7372 #[inline(always)]
7373 fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7374 let (a0, a1) = self.split_u16x32(a);
7375 let (b0, b1) = self.split_u16x32(b);
7376 self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
7377 }
7378 #[inline(always)]
7379 fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7380 let (a0, a1) = self.split_u16x32(a);
7381 let (b0, b1) = self.split_u16x32(b);
7382 self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
7383 }
7384 #[inline(always)]
7385 fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7386 let (a0, a1) = self.split_u16x32(a);
7387 let (b0, b1) = self.split_u16x32(b);
7388 self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
7389 }
7390 #[inline(always)]
7391 fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7392 let (a0, _) = self.split_u16x32(a);
7393 let (b0, _) = self.split_u16x32(b);
7394 self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
7395 }
7396 #[inline(always)]
7397 fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7398 let (_, a1) = self.split_u16x32(a);
7399 let (_, b1) = self.split_u16x32(b);
7400 self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
7401 }
7402 #[inline(always)]
7403 fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7404 let (a0, a1) = self.split_u16x32(a);
7405 let (b0, b1) = self.split_u16x32(b);
7406 self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
7407 }
7408 #[inline(always)]
7409 fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7410 let (a0, a1) = self.split_u16x32(a);
7411 let (b0, b1) = self.split_u16x32(b);
7412 self.combine_u16x16(
7413 self.unzip_high_u16x16(a0, a1),
7414 self.unzip_high_u16x16(b0, b1),
7415 )
7416 }
7417 #[inline(always)]
7418 fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
7419 let (a0, a1) = self.split_u16x32(a);
7420 let (b0, b1) = self.split_u16x32(b);
7421 let lo_lo = self.zip_low_u16x16(a0, b0);
7422 let lo_hi = self.zip_high_u16x16(a0, b0);
7423 let hi_lo = self.zip_low_u16x16(a1, b1);
7424 let hi_hi = self.zip_high_u16x16(a1, b1);
7425 (
7426 self.combine_u16x16(lo_lo, lo_hi),
7427 self.combine_u16x16(hi_lo, hi_hi),
7428 )
7429 }
7430 #[inline(always)]
7431 fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
7432 let (a0, a1) = self.split_u16x32(a);
7433 let (b0, b1) = self.split_u16x32(b);
7434 let lo_even = self.unzip_low_u16x16(a0, a1);
7435 let lo_odd = self.unzip_high_u16x16(a0, a1);
7436 let hi_even = self.unzip_low_u16x16(b0, b1);
7437 let hi_odd = self.unzip_high_u16x16(b0, b1);
7438 (
7439 self.combine_u16x16(lo_even, hi_even),
7440 self.combine_u16x16(lo_odd, hi_odd),
7441 )
7442 }
7443 #[inline(always)]
7444 fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
7445 let (a0, a1) = self.split_mask16x32(a);
7446 let (b0, b1) = self.split_u16x32(b);
7447 let (c0, c1) = self.split_u16x32(c);
7448 self.combine_u16x16(
7449 self.select_u16x16(a0, b0, c0),
7450 self.select_u16x16(a1, b1, c1),
7451 )
7452 }
7453 #[inline(always)]
7454 fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7455 let (a0, a1) = self.split_u16x32(a);
7456 let (b0, b1) = self.split_u16x32(b);
7457 self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
7458 }
7459 #[inline(always)]
7460 fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7461 let (a0, a1) = self.split_u16x32(a);
7462 let (b0, b1) = self.split_u16x32(b);
7463 self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
7464 }
7465 #[inline(always)]
7466 fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
7467 (
7468 u16x16 {
7469 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
7470 simd: self,
7471 },
7472 u16x16 {
7473 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
7474 simd: self,
7475 },
7476 )
7477 }
7478 #[inline(always)]
7479 fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
7480 unsafe {
7481 let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
7482 let v1 = _mm_loadu_si128(src.as_ptr().add(8usize) as *const _);
7483 let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 8usize) as *const _);
7484 let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 8usize) as *const _);
7485 let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
7486 let v0 = _mm_shuffle_epi8(v0, mask);
7487 let v1 = _mm_shuffle_epi8(v1, mask);
7488 let v2 = _mm_shuffle_epi8(v2, mask);
7489 let v3 = _mm_shuffle_epi8(v3, mask);
7490 let tmp0 = _mm_unpacklo_epi32(v0, v1);
7491 let tmp1 = _mm_unpackhi_epi32(v0, v1);
7492 let tmp2 = _mm_unpacklo_epi32(v2, v3);
7493 let tmp3 = _mm_unpackhi_epi32(v2, v3);
7494 let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
7495 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
7496 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
7497 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
7498 self.combine_u16x16(
7499 self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)),
7500 self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)),
7501 )
7502 }
7503 }
7504 #[inline(always)]
7505 fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
7506 let (v01, v23) = self.split_u16x32(a);
7507 let (v0, v1) = self.split_u16x16(v01);
7508 let (v2, v3) = self.split_u16x16(v23);
7509 let v0 = v0.into();
7510 let v1 = v1.into();
7511 let v2 = v2.into();
7512 let v3 = v3.into();
7513 unsafe {
7514 let tmp0 = _mm_unpacklo_epi32(v0, v1);
7515 let tmp1 = _mm_unpackhi_epi32(v0, v1);
7516 let tmp2 = _mm_unpacklo_epi32(v2, v3);
7517 let tmp3 = _mm_unpackhi_epi32(v2, v3);
7518 let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
7519 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
7520 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
7521 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
7522 let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
7523 let out0 = _mm_shuffle_epi8(out0, mask);
7524 let out1 = _mm_shuffle_epi8(out1, mask);
7525 let out2 = _mm_shuffle_epi8(out2, mask);
7526 let out3 = _mm_shuffle_epi8(out3, mask);
7527 _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
7528 _mm_storeu_si128(dest.as_mut_ptr().add(8usize) as *mut _, out1);
7529 _mm_storeu_si128(dest.as_mut_ptr().add(2 * 8usize) as *mut _, out2);
7530 _mm_storeu_si128(dest.as_mut_ptr().add(3 * 8usize) as *mut _, out3);
7531 }
7532 }
7533 #[inline(always)]
7534 fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
7535 let (a0, a1) = self.split_u16x32(a);
7536 self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1))
7537 }
7538 #[inline(always)]
7539 fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
7540 let (a0, a1) = self.split_u16x32(a);
7541 self.combine_u8x32(
7542 self.reinterpret_u8_u16x16(a0),
7543 self.reinterpret_u8_u16x16(a1),
7544 )
7545 }
7546 #[inline(always)]
7547 fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
7548 let (a0, a1) = self.split_u16x32(a);
7549 self.combine_u32x8(
7550 self.reinterpret_u32_u16x16(a0),
7551 self.reinterpret_u32_u16x16(a1),
7552 )
7553 }
7554 #[inline(always)]
7555 fn splat_mask16x32(self, val: i16) -> mask16x32<Self> {
7556 let half = self.splat_mask16x16(val);
7557 self.combine_mask16x16(half, half)
7558 }
7559 #[inline(always)]
7560 fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
7561 mask16x32 {
7562 val: unsafe { core::mem::transmute_copy(&val) },
7563 simd: self,
7564 }
7565 }
7566 #[inline(always)]
7567 fn load_array_ref_mask16x32(self, val: &[i16; 32usize]) -> mask16x32<Self> {
7568 mask16x32 {
7569 val: unsafe { core::mem::transmute_copy(val) },
7570 simd: self,
7571 }
7572 }
7573 #[inline(always)]
7574 fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
7575 unsafe { core::mem::transmute::<[__m128i; 4usize], [i16; 32usize]>(a.val.0) }
7576 }
7577 #[inline(always)]
7578 fn as_array_ref_mask16x32(self, a: &mask16x32<Self>) -> &[i16; 32usize] {
7579 unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i16; 32usize]>(&a.val.0) }
7580 }
7581 #[inline(always)]
7582 fn as_array_mut_mask16x32(self, a: &mut mask16x32<Self>) -> &mut [i16; 32usize] {
7583 unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i16; 32usize]>(&mut a.val.0) }
7584 }
7585 #[inline(always)]
7586 fn store_array_mask16x32(self, a: mask16x32<Self>, dest: &mut [i16; 32usize]) -> () {
7587 unsafe {
7588 core::ptr::copy_nonoverlapping(
7589 (&raw const a.val.0) as *const i16,
7590 dest.as_mut_ptr(),
7591 32usize,
7592 );
7593 }
7594 }
7595 #[inline(always)]
7596 fn cvt_from_bytes_mask16x32(self, a: u8x64<Self>) -> mask16x32<Self> {
7597 unsafe {
7598 mask16x32 {
7599 val: core::mem::transmute(a.val),
7600 simd: self,
7601 }
7602 }
7603 }
7604 #[inline(always)]
7605 fn cvt_to_bytes_mask16x32(self, a: mask16x32<Self>) -> u8x64<Self> {
7606 unsafe {
7607 u8x64 {
7608 val: core::mem::transmute(a.val),
7609 simd: self,
7610 }
7611 }
7612 }
7613 #[inline(always)]
7614 fn slide_mask16x32<const SHIFT: usize>(
7615 self,
7616 a: mask16x32<Self>,
7617 b: mask16x32<Self>,
7618 ) -> mask16x32<Self> {
7619 unsafe {
7620 if SHIFT >= 32usize {
7621 return b;
7622 }
7623 let result = cross_block_alignr_128x4(
7624 self.cvt_to_bytes_mask16x32(b).val.0,
7625 self.cvt_to_bytes_mask16x32(a).val.0,
7626 SHIFT * 2usize,
7627 );
7628 self.cvt_from_bytes_mask16x32(u8x64 {
7629 val: crate::support::Aligned512(result),
7630 simd: self,
7631 })
7632 }
7633 }
7634 #[inline(always)]
7635 fn slide_within_blocks_mask16x32<const SHIFT: usize>(
7636 self,
7637 a: mask16x32<Self>,
7638 b: mask16x32<Self>,
7639 ) -> mask16x32<Self> {
7640 let (a0, a1) = self.split_mask16x32(a);
7641 let (b0, b1) = self.split_mask16x32(b);
7642 self.combine_mask16x16(
7643 self.slide_within_blocks_mask16x16::<SHIFT>(a0, b0),
7644 self.slide_within_blocks_mask16x16::<SHIFT>(a1, b1),
7645 )
7646 }
7647 #[inline(always)]
7648 fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7649 let (a0, a1) = self.split_mask16x32(a);
7650 let (b0, b1) = self.split_mask16x32(b);
7651 self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
7652 }
7653 #[inline(always)]
7654 fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7655 let (a0, a1) = self.split_mask16x32(a);
7656 let (b0, b1) = self.split_mask16x32(b);
7657 self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
7658 }
7659 #[inline(always)]
7660 fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7661 let (a0, a1) = self.split_mask16x32(a);
7662 let (b0, b1) = self.split_mask16x32(b);
7663 self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
7664 }
7665 #[inline(always)]
7666 fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
7667 let (a0, a1) = self.split_mask16x32(a);
7668 self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
7669 }
7670 #[inline(always)]
7671 fn select_mask16x32(
7672 self,
7673 a: mask16x32<Self>,
7674 b: mask16x32<Self>,
7675 c: mask16x32<Self>,
7676 ) -> mask16x32<Self> {
7677 let (a0, a1) = self.split_mask16x32(a);
7678 let (b0, b1) = self.split_mask16x32(b);
7679 let (c0, c1) = self.split_mask16x32(c);
7680 self.combine_mask16x16(
7681 self.select_mask16x16(a0, b0, c0),
7682 self.select_mask16x16(a1, b1, c1),
7683 )
7684 }
7685 #[inline(always)]
7686 fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7687 let (a0, a1) = self.split_mask16x32(a);
7688 let (b0, b1) = self.split_mask16x32(b);
7689 self.combine_mask16x16(
7690 self.simd_eq_mask16x16(a0, b0),
7691 self.simd_eq_mask16x16(a1, b1),
7692 )
7693 }
7694 #[inline(always)]
7695 fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
7696 let (a0, a1) = self.split_mask16x32(a);
7697 self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1)
7698 }
7699 #[inline(always)]
7700 fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
7701 let (a0, a1) = self.split_mask16x32(a);
7702 self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1)
7703 }
7704 #[inline(always)]
7705 fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
7706 let (a0, a1) = self.split_mask16x32(a);
7707 self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1)
7708 }
7709 #[inline(always)]
7710 fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
7711 let (a0, a1) = self.split_mask16x32(a);
7712 self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1)
7713 }
7714 #[inline(always)]
7715 fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
7716 (
7717 mask16x16 {
7718 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
7719 simd: self,
7720 },
7721 mask16x16 {
7722 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
7723 simd: self,
7724 },
7725 )
7726 }
7727 #[inline(always)]
7728 fn splat_i32x16(self, val: i32) -> i32x16<Self> {
7729 let half = self.splat_i32x8(val);
7730 self.combine_i32x8(half, half)
7731 }
7732 #[inline(always)]
7733 fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
7734 i32x16 {
7735 val: unsafe { core::mem::transmute_copy(&val) },
7736 simd: self,
7737 }
7738 }
7739 #[inline(always)]
7740 fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
7741 i32x16 {
7742 val: unsafe { core::mem::transmute_copy(val) },
7743 simd: self,
7744 }
7745 }
7746 #[inline(always)]
7747 fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
7748 unsafe { core::mem::transmute::<[__m128i; 4usize], [i32; 16usize]>(a.val.0) }
7749 }
7750 #[inline(always)]
7751 fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
7752 unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i32; 16usize]>(&a.val.0) }
7753 }
7754 #[inline(always)]
7755 fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
7756 unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i32; 16usize]>(&mut a.val.0) }
7757 }
7758 #[inline(always)]
7759 fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
7760 unsafe {
7761 core::ptr::copy_nonoverlapping(
7762 (&raw const a.val.0) as *const i32,
7763 dest.as_mut_ptr(),
7764 16usize,
7765 );
7766 }
7767 }
7768 #[inline(always)]
7769 fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
7770 unsafe {
7771 i32x16 {
7772 val: core::mem::transmute(a.val),
7773 simd: self,
7774 }
7775 }
7776 }
7777 #[inline(always)]
7778 fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
7779 unsafe {
7780 u8x64 {
7781 val: core::mem::transmute(a.val),
7782 simd: self,
7783 }
7784 }
7785 }
7786 #[inline(always)]
7787 fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7788 unsafe {
7789 if SHIFT >= 16usize {
7790 return b;
7791 }
7792 let result = cross_block_alignr_128x4(
7793 self.cvt_to_bytes_i32x16(b).val.0,
7794 self.cvt_to_bytes_i32x16(a).val.0,
7795 SHIFT * 4usize,
7796 );
7797 self.cvt_from_bytes_i32x16(u8x64 {
7798 val: crate::support::Aligned512(result),
7799 simd: self,
7800 })
7801 }
7802 }
7803 #[inline(always)]
7804 fn slide_within_blocks_i32x16<const SHIFT: usize>(
7805 self,
7806 a: i32x16<Self>,
7807 b: i32x16<Self>,
7808 ) -> i32x16<Self> {
7809 let (a0, a1) = self.split_i32x16(a);
7810 let (b0, b1) = self.split_i32x16(b);
7811 self.combine_i32x8(
7812 self.slide_within_blocks_i32x8::<SHIFT>(a0, b0),
7813 self.slide_within_blocks_i32x8::<SHIFT>(a1, b1),
7814 )
7815 }
7816 #[inline(always)]
7817 fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7818 let (a0, a1) = self.split_i32x16(a);
7819 let (b0, b1) = self.split_i32x16(b);
7820 self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
7821 }
7822 #[inline(always)]
7823 fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7824 let (a0, a1) = self.split_i32x16(a);
7825 let (b0, b1) = self.split_i32x16(b);
7826 self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
7827 }
7828 #[inline(always)]
7829 fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7830 let (a0, a1) = self.split_i32x16(a);
7831 let (b0, b1) = self.split_i32x16(b);
7832 self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
7833 }
7834 #[inline(always)]
7835 fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7836 let (a0, a1) = self.split_i32x16(a);
7837 let (b0, b1) = self.split_i32x16(b);
7838 self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
7839 }
7840 #[inline(always)]
7841 fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7842 let (a0, a1) = self.split_i32x16(a);
7843 let (b0, b1) = self.split_i32x16(b);
7844 self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
7845 }
7846 #[inline(always)]
7847 fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7848 let (a0, a1) = self.split_i32x16(a);
7849 let (b0, b1) = self.split_i32x16(b);
7850 self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
7851 }
7852 #[inline(always)]
7853 fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
7854 let (a0, a1) = self.split_i32x16(a);
7855 self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
7856 }
7857 #[inline(always)]
7858 fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
7859 let (a0, a1) = self.split_i32x16(a);
7860 self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift))
7861 }
7862 #[inline(always)]
7863 fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7864 let (a0, a1) = self.split_i32x16(a);
7865 let (b0, b1) = self.split_i32x16(b);
7866 self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1))
7867 }
7868 #[inline(always)]
7869 fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
7870 let (a0, a1) = self.split_i32x16(a);
7871 self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift))
7872 }
7873 #[inline(always)]
7874 fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7875 let (a0, a1) = self.split_i32x16(a);
7876 let (b0, b1) = self.split_i32x16(b);
7877 self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1))
7878 }
7879 #[inline(always)]
7880 fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7881 let (a0, a1) = self.split_i32x16(a);
7882 let (b0, b1) = self.split_i32x16(b);
7883 self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
7884 }
7885 #[inline(always)]
7886 fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7887 let (a0, a1) = self.split_i32x16(a);
7888 let (b0, b1) = self.split_i32x16(b);
7889 self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
7890 }
7891 #[inline(always)]
7892 fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7893 let (a0, a1) = self.split_i32x16(a);
7894 let (b0, b1) = self.split_i32x16(b);
7895 self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
7896 }
7897 #[inline(always)]
7898 fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7899 let (a0, a1) = self.split_i32x16(a);
7900 let (b0, b1) = self.split_i32x16(b);
7901 self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
7902 }
7903 #[inline(always)]
7904 fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7905 let (a0, a1) = self.split_i32x16(a);
7906 let (b0, b1) = self.split_i32x16(b);
7907 self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
7908 }
7909 #[inline(always)]
7910 fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7911 let (a0, _) = self.split_i32x16(a);
7912 let (b0, _) = self.split_i32x16(b);
7913 self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
7914 }
7915 #[inline(always)]
7916 fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7917 let (_, a1) = self.split_i32x16(a);
7918 let (_, b1) = self.split_i32x16(b);
7919 self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
7920 }
7921 #[inline(always)]
7922 fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7923 let (a0, a1) = self.split_i32x16(a);
7924 let (b0, b1) = self.split_i32x16(b);
7925 self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
7926 }
7927 #[inline(always)]
7928 fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7929 let (a0, a1) = self.split_i32x16(a);
7930 let (b0, b1) = self.split_i32x16(b);
7931 self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
7932 }
7933 #[inline(always)]
7934 fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
7935 let (a0, a1) = self.split_i32x16(a);
7936 let (b0, b1) = self.split_i32x16(b);
7937 let lo_lo = self.zip_low_i32x8(a0, b0);
7938 let lo_hi = self.zip_high_i32x8(a0, b0);
7939 let hi_lo = self.zip_low_i32x8(a1, b1);
7940 let hi_hi = self.zip_high_i32x8(a1, b1);
7941 (
7942 self.combine_i32x8(lo_lo, lo_hi),
7943 self.combine_i32x8(hi_lo, hi_hi),
7944 )
7945 }
7946 #[inline(always)]
7947 fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
7948 let (a0, a1) = self.split_i32x16(a);
7949 let (b0, b1) = self.split_i32x16(b);
7950 let lo_even = self.unzip_low_i32x8(a0, a1);
7951 let lo_odd = self.unzip_high_i32x8(a0, a1);
7952 let hi_even = self.unzip_low_i32x8(b0, b1);
7953 let hi_odd = self.unzip_high_i32x8(b0, b1);
7954 (
7955 self.combine_i32x8(lo_even, hi_even),
7956 self.combine_i32x8(lo_odd, hi_odd),
7957 )
7958 }
7959 #[inline(always)]
7960 fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
7961 let (a0, a1) = self.split_mask32x16(a);
7962 let (b0, b1) = self.split_i32x16(b);
7963 let (c0, c1) = self.split_i32x16(c);
7964 self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
7965 }
7966 #[inline(always)]
7967 fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7968 let (a0, a1) = self.split_i32x16(a);
7969 let (b0, b1) = self.split_i32x16(b);
7970 self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
7971 }
7972 #[inline(always)]
7973 fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7974 let (a0, a1) = self.split_i32x16(a);
7975 let (b0, b1) = self.split_i32x16(b);
7976 self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
7977 }
7978 #[inline(always)]
7979 fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
7980 (
7981 i32x8 {
7982 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
7983 simd: self,
7984 },
7985 i32x8 {
7986 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
7987 simd: self,
7988 },
7989 )
7990 }
7991 #[inline(always)]
7992 fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
7993 let (a0, a1) = self.split_i32x16(a);
7994 self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1))
7995 }
7996 #[inline(always)]
7997 fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
7998 let (a0, a1) = self.split_i32x16(a);
7999 self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
8000 }
8001 #[inline(always)]
8002 fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
8003 let (a0, a1) = self.split_i32x16(a);
8004 self.combine_u32x8(
8005 self.reinterpret_u32_i32x8(a0),
8006 self.reinterpret_u32_i32x8(a1),
8007 )
8008 }
8009 #[inline(always)]
8010 fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
8011 let (a0, a1) = self.split_i32x16(a);
8012 self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
8013 }
8014 #[inline(always)]
8015 fn splat_u32x16(self, val: u32) -> u32x16<Self> {
8016 let half = self.splat_u32x8(val);
8017 self.combine_u32x8(half, half)
8018 }
8019 #[inline(always)]
8020 fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
8021 u32x16 {
8022 val: unsafe { core::mem::transmute_copy(&val) },
8023 simd: self,
8024 }
8025 }
8026 #[inline(always)]
8027 fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
8028 u32x16 {
8029 val: unsafe { core::mem::transmute_copy(val) },
8030 simd: self,
8031 }
8032 }
8033 #[inline(always)]
8034 fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
8035 unsafe { core::mem::transmute::<[__m128i; 4usize], [u32; 16usize]>(a.val.0) }
8036 }
8037 #[inline(always)]
8038 fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
8039 unsafe { core::mem::transmute::<&[__m128i; 4usize], &[u32; 16usize]>(&a.val.0) }
8040 }
8041 #[inline(always)]
8042 fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
8043 unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [u32; 16usize]>(&mut a.val.0) }
8044 }
8045 #[inline(always)]
8046 fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
8047 unsafe {
8048 core::ptr::copy_nonoverlapping(
8049 (&raw const a.val.0) as *const u32,
8050 dest.as_mut_ptr(),
8051 16usize,
8052 );
8053 }
8054 }
8055 #[inline(always)]
8056 fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
8057 unsafe {
8058 u32x16 {
8059 val: core::mem::transmute(a.val),
8060 simd: self,
8061 }
8062 }
8063 }
8064 #[inline(always)]
8065 fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
8066 unsafe {
8067 u8x64 {
8068 val: core::mem::transmute(a.val),
8069 simd: self,
8070 }
8071 }
8072 }
8073 #[inline(always)]
8074 fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8075 unsafe {
8076 if SHIFT >= 16usize {
8077 return b;
8078 }
8079 let result = cross_block_alignr_128x4(
8080 self.cvt_to_bytes_u32x16(b).val.0,
8081 self.cvt_to_bytes_u32x16(a).val.0,
8082 SHIFT * 4usize,
8083 );
8084 self.cvt_from_bytes_u32x16(u8x64 {
8085 val: crate::support::Aligned512(result),
8086 simd: self,
8087 })
8088 }
8089 }
8090 #[inline(always)]
8091 fn slide_within_blocks_u32x16<const SHIFT: usize>(
8092 self,
8093 a: u32x16<Self>,
8094 b: u32x16<Self>,
8095 ) -> u32x16<Self> {
8096 let (a0, a1) = self.split_u32x16(a);
8097 let (b0, b1) = self.split_u32x16(b);
8098 self.combine_u32x8(
8099 self.slide_within_blocks_u32x8::<SHIFT>(a0, b0),
8100 self.slide_within_blocks_u32x8::<SHIFT>(a1, b1),
8101 )
8102 }
8103 #[inline(always)]
8104 fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8105 let (a0, a1) = self.split_u32x16(a);
8106 let (b0, b1) = self.split_u32x16(b);
8107 self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
8108 }
8109 #[inline(always)]
8110 fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8111 let (a0, a1) = self.split_u32x16(a);
8112 let (b0, b1) = self.split_u32x16(b);
8113 self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
8114 }
8115 #[inline(always)]
8116 fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8117 let (a0, a1) = self.split_u32x16(a);
8118 let (b0, b1) = self.split_u32x16(b);
8119 self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
8120 }
8121 #[inline(always)]
8122 fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8123 let (a0, a1) = self.split_u32x16(a);
8124 let (b0, b1) = self.split_u32x16(b);
8125 self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
8126 }
8127 #[inline(always)]
8128 fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8129 let (a0, a1) = self.split_u32x16(a);
8130 let (b0, b1) = self.split_u32x16(b);
8131 self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
8132 }
8133 #[inline(always)]
8134 fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8135 let (a0, a1) = self.split_u32x16(a);
8136 let (b0, b1) = self.split_u32x16(b);
8137 self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
8138 }
8139 #[inline(always)]
8140 fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
8141 let (a0, a1) = self.split_u32x16(a);
8142 self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
8143 }
8144 #[inline(always)]
8145 fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
8146 let (a0, a1) = self.split_u32x16(a);
8147 self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift))
8148 }
8149 #[inline(always)]
8150 fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8151 let (a0, a1) = self.split_u32x16(a);
8152 let (b0, b1) = self.split_u32x16(b);
8153 self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1))
8154 }
8155 #[inline(always)]
8156 fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
8157 let (a0, a1) = self.split_u32x16(a);
8158 self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift))
8159 }
8160 #[inline(always)]
8161 fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8162 let (a0, a1) = self.split_u32x16(a);
8163 let (b0, b1) = self.split_u32x16(b);
8164 self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
8165 }
8166 #[inline(always)]
8167 fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8168 let (a0, a1) = self.split_u32x16(a);
8169 let (b0, b1) = self.split_u32x16(b);
8170 self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
8171 }
8172 #[inline(always)]
8173 fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8174 let (a0, a1) = self.split_u32x16(a);
8175 let (b0, b1) = self.split_u32x16(b);
8176 self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
8177 }
8178 #[inline(always)]
8179 fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8180 let (a0, a1) = self.split_u32x16(a);
8181 let (b0, b1) = self.split_u32x16(b);
8182 self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
8183 }
8184 #[inline(always)]
8185 fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8186 let (a0, a1) = self.split_u32x16(a);
8187 let (b0, b1) = self.split_u32x16(b);
8188 self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
8189 }
8190 #[inline(always)]
8191 fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8192 let (a0, a1) = self.split_u32x16(a);
8193 let (b0, b1) = self.split_u32x16(b);
8194 self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
8195 }
8196 #[inline(always)]
8197 fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8198 let (a0, _) = self.split_u32x16(a);
8199 let (b0, _) = self.split_u32x16(b);
8200 self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
8201 }
8202 #[inline(always)]
8203 fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8204 let (_, a1) = self.split_u32x16(a);
8205 let (_, b1) = self.split_u32x16(b);
8206 self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
8207 }
8208 #[inline(always)]
8209 fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8210 let (a0, a1) = self.split_u32x16(a);
8211 let (b0, b1) = self.split_u32x16(b);
8212 self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
8213 }
8214 #[inline(always)]
8215 fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8216 let (a0, a1) = self.split_u32x16(a);
8217 let (b0, b1) = self.split_u32x16(b);
8218 self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
8219 }
8220 #[inline(always)]
8221 fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
8222 let (a0, a1) = self.split_u32x16(a);
8223 let (b0, b1) = self.split_u32x16(b);
8224 let lo_lo = self.zip_low_u32x8(a0, b0);
8225 let lo_hi = self.zip_high_u32x8(a0, b0);
8226 let hi_lo = self.zip_low_u32x8(a1, b1);
8227 let hi_hi = self.zip_high_u32x8(a1, b1);
8228 (
8229 self.combine_u32x8(lo_lo, lo_hi),
8230 self.combine_u32x8(hi_lo, hi_hi),
8231 )
8232 }
8233 #[inline(always)]
8234 fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
8235 let (a0, a1) = self.split_u32x16(a);
8236 let (b0, b1) = self.split_u32x16(b);
8237 let lo_even = self.unzip_low_u32x8(a0, a1);
8238 let lo_odd = self.unzip_high_u32x8(a0, a1);
8239 let hi_even = self.unzip_low_u32x8(b0, b1);
8240 let hi_odd = self.unzip_high_u32x8(b0, b1);
8241 (
8242 self.combine_u32x8(lo_even, hi_even),
8243 self.combine_u32x8(lo_odd, hi_odd),
8244 )
8245 }
8246 #[inline(always)]
8247 fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
8248 let (a0, a1) = self.split_mask32x16(a);
8249 let (b0, b1) = self.split_u32x16(b);
8250 let (c0, c1) = self.split_u32x16(c);
8251 self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
8252 }
8253 #[inline(always)]
8254 fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8255 let (a0, a1) = self.split_u32x16(a);
8256 let (b0, b1) = self.split_u32x16(b);
8257 self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
8258 }
8259 #[inline(always)]
8260 fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8261 let (a0, a1) = self.split_u32x16(a);
8262 let (b0, b1) = self.split_u32x16(b);
8263 self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
8264 }
8265 #[inline(always)]
8266 fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
8267 (
8268 u32x8 {
8269 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
8270 simd: self,
8271 },
8272 u32x8 {
8273 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
8274 simd: self,
8275 },
8276 )
8277 }
8278 #[inline(always)]
8279 fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
8280 unsafe {
8281 let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
8282 let v1 = _mm_loadu_si128(src.as_ptr().add(4usize) as *const _);
8283 let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 4usize) as *const _);
8284 let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 4usize) as *const _);
8285 let tmp0 = _mm_unpacklo_epi32(v0, v1);
8286 let tmp1 = _mm_unpackhi_epi32(v0, v1);
8287 let tmp2 = _mm_unpacklo_epi32(v2, v3);
8288 let tmp3 = _mm_unpackhi_epi32(v2, v3);
8289 let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
8290 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
8291 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
8292 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
8293 self.combine_u32x8(
8294 self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)),
8295 self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)),
8296 )
8297 }
8298 }
8299 #[inline(always)]
8300 fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
8301 let (v01, v23) = self.split_u32x16(a);
8302 let (v0, v1) = self.split_u32x8(v01);
8303 let (v2, v3) = self.split_u32x8(v23);
8304 let v0 = v0.into();
8305 let v1 = v1.into();
8306 let v2 = v2.into();
8307 let v3 = v3.into();
8308 unsafe {
8309 let tmp0 = _mm_unpacklo_epi32(v0, v1);
8310 let tmp1 = _mm_unpackhi_epi32(v0, v1);
8311 let tmp2 = _mm_unpacklo_epi32(v2, v3);
8312 let tmp3 = _mm_unpackhi_epi32(v2, v3);
8313 let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
8314 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
8315 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
8316 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
8317 _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
8318 _mm_storeu_si128(dest.as_mut_ptr().add(4usize) as *mut _, out1);
8319 _mm_storeu_si128(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
8320 _mm_storeu_si128(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
8321 }
8322 }
8323 #[inline(always)]
8324 fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
8325 let (a0, a1) = self.split_u32x16(a);
8326 self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
8327 }
8328 #[inline(always)]
8329 fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
8330 let (a0, a1) = self.split_u32x16(a);
8331 self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
8332 }
8333 #[inline(always)]
8334 fn splat_mask32x16(self, val: i32) -> mask32x16<Self> {
8335 let half = self.splat_mask32x8(val);
8336 self.combine_mask32x8(half, half)
8337 }
8338 #[inline(always)]
8339 fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
8340 mask32x16 {
8341 val: unsafe { core::mem::transmute_copy(&val) },
8342 simd: self,
8343 }
8344 }
8345 #[inline(always)]
8346 fn load_array_ref_mask32x16(self, val: &[i32; 16usize]) -> mask32x16<Self> {
8347 mask32x16 {
8348 val: unsafe { core::mem::transmute_copy(val) },
8349 simd: self,
8350 }
8351 }
8352 #[inline(always)]
8353 fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
8354 unsafe { core::mem::transmute::<[__m128i; 4usize], [i32; 16usize]>(a.val.0) }
8355 }
8356 #[inline(always)]
8357 fn as_array_ref_mask32x16(self, a: &mask32x16<Self>) -> &[i32; 16usize] {
8358 unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i32; 16usize]>(&a.val.0) }
8359 }
8360 #[inline(always)]
8361 fn as_array_mut_mask32x16(self, a: &mut mask32x16<Self>) -> &mut [i32; 16usize] {
8362 unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i32; 16usize]>(&mut a.val.0) }
8363 }
8364 #[inline(always)]
8365 fn store_array_mask32x16(self, a: mask32x16<Self>, dest: &mut [i32; 16usize]) -> () {
8366 unsafe {
8367 core::ptr::copy_nonoverlapping(
8368 (&raw const a.val.0) as *const i32,
8369 dest.as_mut_ptr(),
8370 16usize,
8371 );
8372 }
8373 }
8374 #[inline(always)]
8375 fn cvt_from_bytes_mask32x16(self, a: u8x64<Self>) -> mask32x16<Self> {
8376 unsafe {
8377 mask32x16 {
8378 val: core::mem::transmute(a.val),
8379 simd: self,
8380 }
8381 }
8382 }
8383 #[inline(always)]
8384 fn cvt_to_bytes_mask32x16(self, a: mask32x16<Self>) -> u8x64<Self> {
8385 unsafe {
8386 u8x64 {
8387 val: core::mem::transmute(a.val),
8388 simd: self,
8389 }
8390 }
8391 }
8392 #[inline(always)]
8393 fn slide_mask32x16<const SHIFT: usize>(
8394 self,
8395 a: mask32x16<Self>,
8396 b: mask32x16<Self>,
8397 ) -> mask32x16<Self> {
8398 unsafe {
8399 if SHIFT >= 16usize {
8400 return b;
8401 }
8402 let result = cross_block_alignr_128x4(
8403 self.cvt_to_bytes_mask32x16(b).val.0,
8404 self.cvt_to_bytes_mask32x16(a).val.0,
8405 SHIFT * 4usize,
8406 );
8407 self.cvt_from_bytes_mask32x16(u8x64 {
8408 val: crate::support::Aligned512(result),
8409 simd: self,
8410 })
8411 }
8412 }
8413 #[inline(always)]
8414 fn slide_within_blocks_mask32x16<const SHIFT: usize>(
8415 self,
8416 a: mask32x16<Self>,
8417 b: mask32x16<Self>,
8418 ) -> mask32x16<Self> {
8419 let (a0, a1) = self.split_mask32x16(a);
8420 let (b0, b1) = self.split_mask32x16(b);
8421 self.combine_mask32x8(
8422 self.slide_within_blocks_mask32x8::<SHIFT>(a0, b0),
8423 self.slide_within_blocks_mask32x8::<SHIFT>(a1, b1),
8424 )
8425 }
8426 #[inline(always)]
8427 fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8428 let (a0, a1) = self.split_mask32x16(a);
8429 let (b0, b1) = self.split_mask32x16(b);
8430 self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
8431 }
8432 #[inline(always)]
8433 fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8434 let (a0, a1) = self.split_mask32x16(a);
8435 let (b0, b1) = self.split_mask32x16(b);
8436 self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
8437 }
8438 #[inline(always)]
8439 fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8440 let (a0, a1) = self.split_mask32x16(a);
8441 let (b0, b1) = self.split_mask32x16(b);
8442 self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
8443 }
8444 #[inline(always)]
8445 fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
8446 let (a0, a1) = self.split_mask32x16(a);
8447 self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
8448 }
8449 #[inline(always)]
8450 fn select_mask32x16(
8451 self,
8452 a: mask32x16<Self>,
8453 b: mask32x16<Self>,
8454 c: mask32x16<Self>,
8455 ) -> mask32x16<Self> {
8456 let (a0, a1) = self.split_mask32x16(a);
8457 let (b0, b1) = self.split_mask32x16(b);
8458 let (c0, c1) = self.split_mask32x16(c);
8459 self.combine_mask32x8(
8460 self.select_mask32x8(a0, b0, c0),
8461 self.select_mask32x8(a1, b1, c1),
8462 )
8463 }
8464 #[inline(always)]
8465 fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8466 let (a0, a1) = self.split_mask32x16(a);
8467 let (b0, b1) = self.split_mask32x16(b);
8468 self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
8469 }
8470 #[inline(always)]
8471 fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
8472 let (a0, a1) = self.split_mask32x16(a);
8473 self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1)
8474 }
8475 #[inline(always)]
8476 fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
8477 let (a0, a1) = self.split_mask32x16(a);
8478 self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1)
8479 }
8480 #[inline(always)]
8481 fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
8482 let (a0, a1) = self.split_mask32x16(a);
8483 self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1)
8484 }
8485 #[inline(always)]
8486 fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
8487 let (a0, a1) = self.split_mask32x16(a);
8488 self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1)
8489 }
8490 #[inline(always)]
8491 fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
8492 (
8493 mask32x8 {
8494 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
8495 simd: self,
8496 },
8497 mask32x8 {
8498 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
8499 simd: self,
8500 },
8501 )
8502 }
8503 #[inline(always)]
8504 fn splat_f64x8(self, val: f64) -> f64x8<Self> {
8505 let half = self.splat_f64x4(val);
8506 self.combine_f64x4(half, half)
8507 }
8508 #[inline(always)]
8509 fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
8510 f64x8 {
8511 val: unsafe { core::mem::transmute_copy(&val) },
8512 simd: self,
8513 }
8514 }
8515 #[inline(always)]
8516 fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
8517 f64x8 {
8518 val: unsafe { core::mem::transmute_copy(val) },
8519 simd: self,
8520 }
8521 }
8522 #[inline(always)]
8523 fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
8524 unsafe { core::mem::transmute::<[__m128d; 4usize], [f64; 8usize]>(a.val.0) }
8525 }
8526 #[inline(always)]
8527 fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
8528 unsafe { core::mem::transmute::<&[__m128d; 4usize], &[f64; 8usize]>(&a.val.0) }
8529 }
8530 #[inline(always)]
8531 fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
8532 unsafe { core::mem::transmute::<&mut [__m128d; 4usize], &mut [f64; 8usize]>(&mut a.val.0) }
8533 }
8534 #[inline(always)]
8535 fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
8536 unsafe {
8537 core::ptr::copy_nonoverlapping(
8538 (&raw const a.val.0) as *const f64,
8539 dest.as_mut_ptr(),
8540 8usize,
8541 );
8542 }
8543 }
8544 #[inline(always)]
8545 fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
8546 unsafe {
8547 f64x8 {
8548 val: core::mem::transmute(a.val),
8549 simd: self,
8550 }
8551 }
8552 }
8553 #[inline(always)]
8554 fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
8555 unsafe {
8556 u8x64 {
8557 val: core::mem::transmute(a.val),
8558 simd: self,
8559 }
8560 }
8561 }
8562 #[inline(always)]
8563 fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8564 unsafe {
8565 if SHIFT >= 8usize {
8566 return b;
8567 }
8568 let result = cross_block_alignr_128x4(
8569 self.cvt_to_bytes_f64x8(b).val.0,
8570 self.cvt_to_bytes_f64x8(a).val.0,
8571 SHIFT * 8usize,
8572 );
8573 self.cvt_from_bytes_f64x8(u8x64 {
8574 val: crate::support::Aligned512(result),
8575 simd: self,
8576 })
8577 }
8578 }
8579 #[inline(always)]
8580 fn slide_within_blocks_f64x8<const SHIFT: usize>(
8581 self,
8582 a: f64x8<Self>,
8583 b: f64x8<Self>,
8584 ) -> f64x8<Self> {
8585 let (a0, a1) = self.split_f64x8(a);
8586 let (b0, b1) = self.split_f64x8(b);
8587 self.combine_f64x4(
8588 self.slide_within_blocks_f64x4::<SHIFT>(a0, b0),
8589 self.slide_within_blocks_f64x4::<SHIFT>(a1, b1),
8590 )
8591 }
8592 #[inline(always)]
8593 fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8594 let (a0, a1) = self.split_f64x8(a);
8595 self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
8596 }
8597 #[inline(always)]
8598 fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8599 let (a0, a1) = self.split_f64x8(a);
8600 self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
8601 }
8602 #[inline(always)]
8603 fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8604 let (a0, a1) = self.split_f64x8(a);
8605 self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
8606 }
8607 #[inline(always)]
8608 fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8609 let (a0, a1) = self.split_f64x8(a);
8610 let (b0, b1) = self.split_f64x8(b);
8611 self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
8612 }
8613 #[inline(always)]
8614 fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8615 let (a0, a1) = self.split_f64x8(a);
8616 let (b0, b1) = self.split_f64x8(b);
8617 self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
8618 }
8619 #[inline(always)]
8620 fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8621 let (a0, a1) = self.split_f64x8(a);
8622 let (b0, b1) = self.split_f64x8(b);
8623 self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
8624 }
8625 #[inline(always)]
8626 fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8627 let (a0, a1) = self.split_f64x8(a);
8628 let (b0, b1) = self.split_f64x8(b);
8629 self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
8630 }
8631 #[inline(always)]
8632 fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8633 let (a0, a1) = self.split_f64x8(a);
8634 let (b0, b1) = self.split_f64x8(b);
8635 self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
8636 }
8637 #[inline(always)]
8638 fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8639 let (a0, a1) = self.split_f64x8(a);
8640 let (b0, b1) = self.split_f64x8(b);
8641 self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
8642 }
8643 #[inline(always)]
8644 fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8645 let (a0, a1) = self.split_f64x8(a);
8646 let (b0, b1) = self.split_f64x8(b);
8647 self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
8648 }
8649 #[inline(always)]
8650 fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8651 let (a0, a1) = self.split_f64x8(a);
8652 let (b0, b1) = self.split_f64x8(b);
8653 self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
8654 }
8655 #[inline(always)]
8656 fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8657 let (a0, a1) = self.split_f64x8(a);
8658 let (b0, b1) = self.split_f64x8(b);
8659 self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
8660 }
8661 #[inline(always)]
8662 fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8663 let (a0, a1) = self.split_f64x8(a);
8664 let (b0, b1) = self.split_f64x8(b);
8665 self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
8666 }
8667 #[inline(always)]
8668 fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8669 let (a0, _) = self.split_f64x8(a);
8670 let (b0, _) = self.split_f64x8(b);
8671 self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
8672 }
8673 #[inline(always)]
8674 fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8675 let (_, a1) = self.split_f64x8(a);
8676 let (_, b1) = self.split_f64x8(b);
8677 self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
8678 }
8679 #[inline(always)]
8680 fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8681 let (a0, a1) = self.split_f64x8(a);
8682 let (b0, b1) = self.split_f64x8(b);
8683 self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
8684 }
8685 #[inline(always)]
8686 fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8687 let (a0, a1) = self.split_f64x8(a);
8688 let (b0, b1) = self.split_f64x8(b);
8689 self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
8690 }
8691 #[inline(always)]
8692 fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
8693 let (a0, a1) = self.split_f64x8(a);
8694 let (b0, b1) = self.split_f64x8(b);
8695 let lo_lo = self.zip_low_f64x4(a0, b0);
8696 let lo_hi = self.zip_high_f64x4(a0, b0);
8697 let hi_lo = self.zip_low_f64x4(a1, b1);
8698 let hi_hi = self.zip_high_f64x4(a1, b1);
8699 (
8700 self.combine_f64x4(lo_lo, lo_hi),
8701 self.combine_f64x4(hi_lo, hi_hi),
8702 )
8703 }
8704 #[inline(always)]
8705 fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
8706 let (a0, a1) = self.split_f64x8(a);
8707 let (b0, b1) = self.split_f64x8(b);
8708 let lo_even = self.unzip_low_f64x4(a0, a1);
8709 let lo_odd = self.unzip_high_f64x4(a0, a1);
8710 let hi_even = self.unzip_low_f64x4(b0, b1);
8711 let hi_odd = self.unzip_high_f64x4(b0, b1);
8712 (
8713 self.combine_f64x4(lo_even, hi_even),
8714 self.combine_f64x4(lo_odd, hi_odd),
8715 )
8716 }
8717 #[inline(always)]
8718 fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8719 let (a0, a1) = self.split_f64x8(a);
8720 let (b0, b1) = self.split_f64x8(b);
8721 self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
8722 }
8723 #[inline(always)]
8724 fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8725 let (a0, a1) = self.split_f64x8(a);
8726 let (b0, b1) = self.split_f64x8(b);
8727 self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
8728 }
8729 #[inline(always)]
8730 fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8731 let (a0, a1) = self.split_f64x8(a);
8732 let (b0, b1) = self.split_f64x8(b);
8733 self.combine_f64x4(
8734 self.max_precise_f64x4(a0, b0),
8735 self.max_precise_f64x4(a1, b1),
8736 )
8737 }
8738 #[inline(always)]
8739 fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8740 let (a0, a1) = self.split_f64x8(a);
8741 let (b0, b1) = self.split_f64x8(b);
8742 self.combine_f64x4(
8743 self.min_precise_f64x4(a0, b0),
8744 self.min_precise_f64x4(a1, b1),
8745 )
8746 }
8747 #[inline(always)]
8748 fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
8749 let (a0, a1) = self.split_f64x8(a);
8750 let (b0, b1) = self.split_f64x8(b);
8751 let (c0, c1) = self.split_f64x8(c);
8752 self.combine_f64x4(
8753 self.mul_add_f64x4(a0, b0, c0),
8754 self.mul_add_f64x4(a1, b1, c1),
8755 )
8756 }
8757 #[inline(always)]
8758 fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
8759 let (a0, a1) = self.split_f64x8(a);
8760 let (b0, b1) = self.split_f64x8(b);
8761 let (c0, c1) = self.split_f64x8(c);
8762 self.combine_f64x4(
8763 self.mul_sub_f64x4(a0, b0, c0),
8764 self.mul_sub_f64x4(a1, b1, c1),
8765 )
8766 }
8767 #[inline(always)]
8768 fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8769 let (a0, a1) = self.split_f64x8(a);
8770 self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
8771 }
8772 #[inline(always)]
8773 fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8774 let (a0, a1) = self.split_f64x8(a);
8775 self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1))
8776 }
8777 #[inline(always)]
8778 fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8779 let (a0, a1) = self.split_f64x8(a);
8780 self.combine_f64x4(
8781 self.round_ties_even_f64x4(a0),
8782 self.round_ties_even_f64x4(a1),
8783 )
8784 }
8785 #[inline(always)]
8786 fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8787 let (a0, a1) = self.split_f64x8(a);
8788 self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
8789 }
8790 #[inline(always)]
8791 fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8792 let (a0, a1) = self.split_f64x8(a);
8793 self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
8794 }
8795 #[inline(always)]
8796 fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
8797 let (a0, a1) = self.split_mask64x8(a);
8798 let (b0, b1) = self.split_f64x8(b);
8799 let (c0, c1) = self.split_f64x8(c);
8800 self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
8801 }
8802 #[inline(always)]
8803 fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
8804 (
8805 f64x4 {
8806 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
8807 simd: self,
8808 },
8809 f64x4 {
8810 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
8811 simd: self,
8812 },
8813 )
8814 }
8815 #[inline(always)]
8816 fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
8817 let (a0, a1) = self.split_f64x8(a);
8818 self.combine_f32x8(
8819 self.reinterpret_f32_f64x4(a0),
8820 self.reinterpret_f32_f64x4(a1),
8821 )
8822 }
8823 #[inline(always)]
8824 fn splat_mask64x8(self, val: i64) -> mask64x8<Self> {
8825 let half = self.splat_mask64x4(val);
8826 self.combine_mask64x4(half, half)
8827 }
8828 #[inline(always)]
8829 fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
8830 mask64x8 {
8831 val: unsafe { core::mem::transmute_copy(&val) },
8832 simd: self,
8833 }
8834 }
8835 #[inline(always)]
8836 fn load_array_ref_mask64x8(self, val: &[i64; 8usize]) -> mask64x8<Self> {
8837 mask64x8 {
8838 val: unsafe { core::mem::transmute_copy(val) },
8839 simd: self,
8840 }
8841 }
8842 #[inline(always)]
8843 fn as_array_mask64x8(self, a: mask64x8<Self>) -> [i64; 8usize] {
8844 unsafe { core::mem::transmute::<[__m128i; 4usize], [i64; 8usize]>(a.val.0) }
8845 }
8846 #[inline(always)]
8847 fn as_array_ref_mask64x8(self, a: &mask64x8<Self>) -> &[i64; 8usize] {
8848 unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i64; 8usize]>(&a.val.0) }
8849 }
8850 #[inline(always)]
8851 fn as_array_mut_mask64x8(self, a: &mut mask64x8<Self>) -> &mut [i64; 8usize] {
8852 unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i64; 8usize]>(&mut a.val.0) }
8853 }
8854 #[inline(always)]
8855 fn store_array_mask64x8(self, a: mask64x8<Self>, dest: &mut [i64; 8usize]) -> () {
8856 unsafe {
8857 core::ptr::copy_nonoverlapping(
8858 (&raw const a.val.0) as *const i64,
8859 dest.as_mut_ptr(),
8860 8usize,
8861 );
8862 }
8863 }
8864 #[inline(always)]
8865 fn cvt_from_bytes_mask64x8(self, a: u8x64<Self>) -> mask64x8<Self> {
8866 unsafe {
8867 mask64x8 {
8868 val: core::mem::transmute(a.val),
8869 simd: self,
8870 }
8871 }
8872 }
8873 #[inline(always)]
8874 fn cvt_to_bytes_mask64x8(self, a: mask64x8<Self>) -> u8x64<Self> {
8875 unsafe {
8876 u8x64 {
8877 val: core::mem::transmute(a.val),
8878 simd: self,
8879 }
8880 }
8881 }
8882 #[inline(always)]
8883 fn slide_mask64x8<const SHIFT: usize>(
8884 self,
8885 a: mask64x8<Self>,
8886 b: mask64x8<Self>,
8887 ) -> mask64x8<Self> {
8888 unsafe {
8889 if SHIFT >= 8usize {
8890 return b;
8891 }
8892 let result = cross_block_alignr_128x4(
8893 self.cvt_to_bytes_mask64x8(b).val.0,
8894 self.cvt_to_bytes_mask64x8(a).val.0,
8895 SHIFT * 8usize,
8896 );
8897 self.cvt_from_bytes_mask64x8(u8x64 {
8898 val: crate::support::Aligned512(result),
8899 simd: self,
8900 })
8901 }
8902 }
8903 #[inline(always)]
8904 fn slide_within_blocks_mask64x8<const SHIFT: usize>(
8905 self,
8906 a: mask64x8<Self>,
8907 b: mask64x8<Self>,
8908 ) -> mask64x8<Self> {
8909 let (a0, a1) = self.split_mask64x8(a);
8910 let (b0, b1) = self.split_mask64x8(b);
8911 self.combine_mask64x4(
8912 self.slide_within_blocks_mask64x4::<SHIFT>(a0, b0),
8913 self.slide_within_blocks_mask64x4::<SHIFT>(a1, b1),
8914 )
8915 }
8916 #[inline(always)]
8917 fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8918 let (a0, a1) = self.split_mask64x8(a);
8919 let (b0, b1) = self.split_mask64x8(b);
8920 self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1))
8921 }
8922 #[inline(always)]
8923 fn or_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8924 let (a0, a1) = self.split_mask64x8(a);
8925 let (b0, b1) = self.split_mask64x8(b);
8926 self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, b1))
8927 }
8928 #[inline(always)]
8929 fn xor_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8930 let (a0, a1) = self.split_mask64x8(a);
8931 let (b0, b1) = self.split_mask64x8(b);
8932 self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, b1))
8933 }
8934 #[inline(always)]
8935 fn not_mask64x8(self, a: mask64x8<Self>) -> mask64x8<Self> {
8936 let (a0, a1) = self.split_mask64x8(a);
8937 self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1))
8938 }
8939 #[inline(always)]
8940 fn select_mask64x8(
8941 self,
8942 a: mask64x8<Self>,
8943 b: mask64x8<Self>,
8944 c: mask64x8<Self>,
8945 ) -> mask64x8<Self> {
8946 let (a0, a1) = self.split_mask64x8(a);
8947 let (b0, b1) = self.split_mask64x8(b);
8948 let (c0, c1) = self.split_mask64x8(c);
8949 self.combine_mask64x4(
8950 self.select_mask64x4(a0, b0, c0),
8951 self.select_mask64x4(a1, b1, c1),
8952 )
8953 }
8954 #[inline(always)]
8955 fn simd_eq_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8956 let (a0, a1) = self.split_mask64x8(a);
8957 let (b0, b1) = self.split_mask64x8(b);
8958 self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1))
8959 }
8960 #[inline(always)]
8961 fn any_true_mask64x8(self, a: mask64x8<Self>) -> bool {
8962 let (a0, a1) = self.split_mask64x8(a);
8963 self.any_true_mask64x4(a0) || self.any_true_mask64x4(a1)
8964 }
8965 #[inline(always)]
8966 fn all_true_mask64x8(self, a: mask64x8<Self>) -> bool {
8967 let (a0, a1) = self.split_mask64x8(a);
8968 self.all_true_mask64x4(a0) && self.all_true_mask64x4(a1)
8969 }
8970 #[inline(always)]
8971 fn any_false_mask64x8(self, a: mask64x8<Self>) -> bool {
8972 let (a0, a1) = self.split_mask64x8(a);
8973 self.any_false_mask64x4(a0) || self.any_false_mask64x4(a1)
8974 }
8975 #[inline(always)]
8976 fn all_false_mask64x8(self, a: mask64x8<Self>) -> bool {
8977 let (a0, a1) = self.split_mask64x8(a);
8978 self.all_false_mask64x4(a0) && self.all_false_mask64x4(a1)
8979 }
8980 #[inline(always)]
8981 fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) {
8982 (
8983 mask64x4 {
8984 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
8985 simd: self,
8986 },
8987 mask64x4 {
8988 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
8989 simd: self,
8990 },
8991 )
8992 }
8993}
8994impl<S: Simd> SimdFrom<__m128, S> for f32x4<S> {
8995 #[inline(always)]
8996 fn simd_from(simd: S, arch: __m128) -> Self {
8997 Self {
8998 val: unsafe { core::mem::transmute_copy(&arch) },
8999 simd,
9000 }
9001 }
9002}
9003impl<S: Simd> From<f32x4<S>> for __m128 {
9004 #[inline(always)]
9005 fn from(value: f32x4<S>) -> Self {
9006 unsafe { core::mem::transmute_copy(&value.val) }
9007 }
9008}
9009impl<S: Simd> SimdFrom<__m128i, S> for i8x16<S> {
9010 #[inline(always)]
9011 fn simd_from(simd: S, arch: __m128i) -> Self {
9012 Self {
9013 val: unsafe { core::mem::transmute_copy(&arch) },
9014 simd,
9015 }
9016 }
9017}
9018impl<S: Simd> From<i8x16<S>> for __m128i {
9019 #[inline(always)]
9020 fn from(value: i8x16<S>) -> Self {
9021 unsafe { core::mem::transmute_copy(&value.val) }
9022 }
9023}
9024impl<S: Simd> SimdFrom<__m128i, S> for u8x16<S> {
9025 #[inline(always)]
9026 fn simd_from(simd: S, arch: __m128i) -> Self {
9027 Self {
9028 val: unsafe { core::mem::transmute_copy(&arch) },
9029 simd,
9030 }
9031 }
9032}
9033impl<S: Simd> From<u8x16<S>> for __m128i {
9034 #[inline(always)]
9035 fn from(value: u8x16<S>) -> Self {
9036 unsafe { core::mem::transmute_copy(&value.val) }
9037 }
9038}
9039impl<S: Simd> SimdFrom<__m128i, S> for mask8x16<S> {
9040 #[inline(always)]
9041 fn simd_from(simd: S, arch: __m128i) -> Self {
9042 Self {
9043 val: unsafe { core::mem::transmute_copy(&arch) },
9044 simd,
9045 }
9046 }
9047}
9048impl<S: Simd> From<mask8x16<S>> for __m128i {
9049 #[inline(always)]
9050 fn from(value: mask8x16<S>) -> Self {
9051 unsafe { core::mem::transmute_copy(&value.val) }
9052 }
9053}
9054impl<S: Simd> SimdFrom<__m128i, S> for i16x8<S> {
9055 #[inline(always)]
9056 fn simd_from(simd: S, arch: __m128i) -> Self {
9057 Self {
9058 val: unsafe { core::mem::transmute_copy(&arch) },
9059 simd,
9060 }
9061 }
9062}
9063impl<S: Simd> From<i16x8<S>> for __m128i {
9064 #[inline(always)]
9065 fn from(value: i16x8<S>) -> Self {
9066 unsafe { core::mem::transmute_copy(&value.val) }
9067 }
9068}
9069impl<S: Simd> SimdFrom<__m128i, S> for u16x8<S> {
9070 #[inline(always)]
9071 fn simd_from(simd: S, arch: __m128i) -> Self {
9072 Self {
9073 val: unsafe { core::mem::transmute_copy(&arch) },
9074 simd,
9075 }
9076 }
9077}
9078impl<S: Simd> From<u16x8<S>> for __m128i {
9079 #[inline(always)]
9080 fn from(value: u16x8<S>) -> Self {
9081 unsafe { core::mem::transmute_copy(&value.val) }
9082 }
9083}
9084impl<S: Simd> SimdFrom<__m128i, S> for mask16x8<S> {
9085 #[inline(always)]
9086 fn simd_from(simd: S, arch: __m128i) -> Self {
9087 Self {
9088 val: unsafe { core::mem::transmute_copy(&arch) },
9089 simd,
9090 }
9091 }
9092}
9093impl<S: Simd> From<mask16x8<S>> for __m128i {
9094 #[inline(always)]
9095 fn from(value: mask16x8<S>) -> Self {
9096 unsafe { core::mem::transmute_copy(&value.val) }
9097 }
9098}
9099impl<S: Simd> SimdFrom<__m128i, S> for i32x4<S> {
9100 #[inline(always)]
9101 fn simd_from(simd: S, arch: __m128i) -> Self {
9102 Self {
9103 val: unsafe { core::mem::transmute_copy(&arch) },
9104 simd,
9105 }
9106 }
9107}
9108impl<S: Simd> From<i32x4<S>> for __m128i {
9109 #[inline(always)]
9110 fn from(value: i32x4<S>) -> Self {
9111 unsafe { core::mem::transmute_copy(&value.val) }
9112 }
9113}
9114impl<S: Simd> SimdFrom<__m128i, S> for u32x4<S> {
9115 #[inline(always)]
9116 fn simd_from(simd: S, arch: __m128i) -> Self {
9117 Self {
9118 val: unsafe { core::mem::transmute_copy(&arch) },
9119 simd,
9120 }
9121 }
9122}
9123impl<S: Simd> From<u32x4<S>> for __m128i {
9124 #[inline(always)]
9125 fn from(value: u32x4<S>) -> Self {
9126 unsafe { core::mem::transmute_copy(&value.val) }
9127 }
9128}
9129impl<S: Simd> SimdFrom<__m128i, S> for mask32x4<S> {
9130 #[inline(always)]
9131 fn simd_from(simd: S, arch: __m128i) -> Self {
9132 Self {
9133 val: unsafe { core::mem::transmute_copy(&arch) },
9134 simd,
9135 }
9136 }
9137}
9138impl<S: Simd> From<mask32x4<S>> for __m128i {
9139 #[inline(always)]
9140 fn from(value: mask32x4<S>) -> Self {
9141 unsafe { core::mem::transmute_copy(&value.val) }
9142 }
9143}
9144impl<S: Simd> SimdFrom<__m128d, S> for f64x2<S> {
9145 #[inline(always)]
9146 fn simd_from(simd: S, arch: __m128d) -> Self {
9147 Self {
9148 val: unsafe { core::mem::transmute_copy(&arch) },
9149 simd,
9150 }
9151 }
9152}
9153impl<S: Simd> From<f64x2<S>> for __m128d {
9154 #[inline(always)]
9155 fn from(value: f64x2<S>) -> Self {
9156 unsafe { core::mem::transmute_copy(&value.val) }
9157 }
9158}
9159impl<S: Simd> SimdFrom<__m128i, S> for mask64x2<S> {
9160 #[inline(always)]
9161 fn simd_from(simd: S, arch: __m128i) -> Self {
9162 Self {
9163 val: unsafe { core::mem::transmute_copy(&arch) },
9164 simd,
9165 }
9166 }
9167}
9168impl<S: Simd> From<mask64x2<S>> for __m128i {
9169 #[inline(always)]
9170 fn from(value: mask64x2<S>) -> Self {
9171 unsafe { core::mem::transmute_copy(&value.val) }
9172 }
9173}
/// `alignr` with the byte shift supplied as a runtime argument instead of a const generic.
///
/// Callers are still expected to pass a compile-time-constant `shift`, so the dispatch below
/// should be optimized away. It exists because Rust doesn't currently let you do math on
/// const generics.
///
/// # Safety
///
/// Executes `_mm_alignr_epi8`, so the CPU must support the feature that intrinsic requires
/// (SSSE3).
///
/// # Panics
///
/// Hits `unreachable!()` when `shift` is 16 or greater.
#[inline(always)]
unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i {
    // One match arm per listed literal; the same literal is both the `usize` pattern
    // and the `i32` const argument of the intrinsic.
    macro_rules! dispatch_shift {
        ($($bytes:literal),+ $(,)?) => {
            match shift {
                $($bytes => _mm_alignr_epi8::<$bytes>(a, b),)+
                _ => unreachable!(),
            }
        };
    }
    unsafe { dispatch_shift!(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) }
}
9201#[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."]
9202#[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."]
9203#[inline(always)]
9204unsafe fn cross_block_alignr_128x2(
9205 a: [__m128i; 2usize],
9206 b: [__m128i; 2usize],
9207 shift_bytes: usize,
9208) -> [__m128i; 2usize] {
9209 [
9210 {
9211 let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes);
9212 unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
9213 },
9214 {
9215 let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes);
9216 unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
9217 },
9218 ]
9219}
9220#[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."]
9221#[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."]
9222#[inline(always)]
9223unsafe fn cross_block_alignr_128x4(
9224 a: [__m128i; 4usize],
9225 b: [__m128i; 4usize],
9226 shift_bytes: usize,
9227) -> [__m128i; 4usize] {
9228 [
9229 {
9230 let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes);
9231 unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
9232 },
9233 {
9234 let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes);
9235 unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
9236 },
9237 {
9238 let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 2usize, shift_bytes);
9239 unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
9240 },
9241 {
9242 let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 3usize, shift_bytes);
9243 unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
9244 },
9245 ]
9246}