1use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
7use crate::{
8 f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
9 i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
10 mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
11 u32x4, u32x8, u32x16,
12};
13#[cfg(target_arch = "x86")]
14use core::arch::x86::*;
15#[cfg(target_arch = "x86_64")]
16use core::arch::x86_64::*;
#[doc = "The SIMD token for the x86-64-v3 level."]
#[derive(Clone, Copy, Debug)]
pub struct Avx2 {
    /// The wrapped `core_arch` AVX2 token.
    pub avx2: crate::core_arch::x86::Avx2,
}
22impl Avx2 {
23 #[doc = r" Create a SIMD token."]
24 #[doc = r""]
25 #[doc = r" # Safety"]
26 #[doc = r""]
27 #[doc = r" The `avx2`, `bmi1`, `bmi2`, `cmpxchg16b`, `f16c`, `fma`,"]
28 #[doc = r" `lzcnt`, `movbe`, `popcnt`, and `xsave` CPU features must"]
29 #[doc = r" be available."]
30 #[inline]
31 pub const unsafe fn new_unchecked() -> Self {
32 Self {
33 avx2: unsafe { crate::core_arch::x86::Avx2::new_unchecked() },
34 }
35 }
36}
// Marker impl: `Seal` has no items to provide for `Avx2`.
impl Seal for Avx2 {}
// Storage types backing each vector and mask type at this level.
impl ArchTypes for Avx2 {
    // 128-bit vectors: a single SSE register wrapped in `Aligned128`.
    type f32x4 = crate::support::Aligned128<__m128>;
    type i8x16 = crate::support::Aligned128<__m128i>;
    type u8x16 = crate::support::Aligned128<__m128i>;
    type mask8x16 = crate::support::Aligned128<__m128i>;
    type i16x8 = crate::support::Aligned128<__m128i>;
    type u16x8 = crate::support::Aligned128<__m128i>;
    type mask16x8 = crate::support::Aligned128<__m128i>;
    type i32x4 = crate::support::Aligned128<__m128i>;
    type u32x4 = crate::support::Aligned128<__m128i>;
    type mask32x4 = crate::support::Aligned128<__m128i>;
    type f64x2 = crate::support::Aligned128<__m128d>;
    type mask64x2 = crate::support::Aligned128<__m128i>;
    // 256-bit vectors: a single AVX register wrapped in `Aligned256`.
    type f32x8 = crate::support::Aligned256<__m256>;
    type i8x32 = crate::support::Aligned256<__m256i>;
    type u8x32 = crate::support::Aligned256<__m256i>;
    type mask8x32 = crate::support::Aligned256<__m256i>;
    type i16x16 = crate::support::Aligned256<__m256i>;
    type u16x16 = crate::support::Aligned256<__m256i>;
    type mask16x16 = crate::support::Aligned256<__m256i>;
    type i32x8 = crate::support::Aligned256<__m256i>;
    type u32x8 = crate::support::Aligned256<__m256i>;
    type mask32x8 = crate::support::Aligned256<__m256i>;
    type f64x4 = crate::support::Aligned256<__m256d>;
    type mask64x4 = crate::support::Aligned256<__m256i>;
    // 512-bit vectors: stored as an array of two 256-bit registers.
    type f32x16 = crate::support::Aligned512<[__m256; 2usize]>;
    type i8x64 = crate::support::Aligned512<[__m256i; 2usize]>;
    type u8x64 = crate::support::Aligned512<[__m256i; 2usize]>;
    type mask8x64 = crate::support::Aligned512<[__m256i; 2usize]>;
    type i16x32 = crate::support::Aligned512<[__m256i; 2usize]>;
    type u16x32 = crate::support::Aligned512<[__m256i; 2usize]>;
    type mask16x32 = crate::support::Aligned512<[__m256i; 2usize]>;
    type i32x16 = crate::support::Aligned512<[__m256i; 2usize]>;
    type u32x16 = crate::support::Aligned512<[__m256i; 2usize]>;
    type mask32x16 = crate::support::Aligned512<[__m256i; 2usize]>;
    type f64x8 = crate::support::Aligned512<[__m256d; 2usize]>;
    type mask64x8 = crate::support::Aligned512<[__m256i; 2usize]>;
}
76impl Simd for Avx2 {
// The `*s` aliases all select the 256-bit vector of each element type.
type f32s = f32x8<Self>;
type f64s = f64x4<Self>;
type u8s = u8x32<Self>;
type i8s = i8x32<Self>;
type u16s = u16x16<Self>;
type i16s = i16x16<Self>;
type u32s = u32x8<Self>;
type i32s = i32x8<Self>;
type mask8s = mask8x32<Self>;
type mask16s = mask16x16<Self>;
type mask32s = mask32x8<Self>;
type mask64s = mask64x4<Self>;
89 #[inline(always)]
90 fn level(self) -> Level {
91 Level::Avx2(self)
92 }
/// Calls `f` through a helper function that has this level's target features enabled.
#[inline]
fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R {
    // The feature list matches the one documented on `Avx2::new_unchecked`.
    #[target_feature(enable = "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave")]
    unsafe fn vectorize_avx2<F: FnOnce() -> R, R>(f: F) -> R {
        f()
    }
    // SAFETY: relies on the `Avx2` token attesting that these CPU features are present.
    unsafe { vectorize_avx2(f) }
}
101 #[inline(always)]
102 fn splat_f32x4(self, val: f32) -> f32x4<Self> {
103 unsafe { _mm_set1_ps(val).simd_into(self) }
104 }
105 #[inline(always)]
106 fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
107 f32x4 {
108 val: unsafe { core::mem::transmute_copy(&val) },
109 simd: self,
110 }
111 }
112 #[inline(always)]
113 fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
114 f32x4 {
115 val: unsafe { core::mem::transmute_copy(val) },
116 simd: self,
117 }
118 }
119 #[inline(always)]
120 fn as_array_f32x4(self, a: f32x4<Self>) -> [f32; 4usize] {
121 unsafe { core::mem::transmute::<__m128, [f32; 4usize]>(a.val.0) }
122 }
123 #[inline(always)]
124 fn as_array_ref_f32x4(self, a: &f32x4<Self>) -> &[f32; 4usize] {
125 unsafe { core::mem::transmute::<&__m128, &[f32; 4usize]>(&a.val.0) }
126 }
127 #[inline(always)]
128 fn as_array_mut_f32x4(self, a: &mut f32x4<Self>) -> &mut [f32; 4usize] {
129 unsafe { core::mem::transmute::<&mut __m128, &mut [f32; 4usize]>(&mut a.val.0) }
130 }
131 #[inline(always)]
132 fn store_array_f32x4(self, a: f32x4<Self>, dest: &mut [f32; 4usize]) -> () {
133 unsafe {
134 core::ptr::copy_nonoverlapping(
135 (&raw const a.val.0) as *const f32,
136 dest.as_mut_ptr(),
137 4usize,
138 );
139 }
140 }
141 #[inline(always)]
142 fn cvt_from_bytes_f32x4(self, a: u8x16<Self>) -> f32x4<Self> {
143 unsafe {
144 f32x4 {
145 val: core::mem::transmute(a.val),
146 simd: self,
147 }
148 }
149 }
150 #[inline(always)]
151 fn cvt_to_bytes_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
152 unsafe {
153 u8x16 {
154 val: core::mem::transmute(a.val),
155 simd: self,
156 }
157 }
158 }
159 #[inline(always)]
160 fn slide_f32x4<const SHIFT: usize>(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
161 unsafe {
162 if SHIFT >= 4usize {
163 return b;
164 }
165 let result = dyn_alignr_128(
166 self.cvt_to_bytes_f32x4(b).val.0,
167 self.cvt_to_bytes_f32x4(a).val.0,
168 SHIFT * 4usize,
169 );
170 self.cvt_from_bytes_f32x4(u8x16 {
171 val: crate::support::Aligned128(result),
172 simd: self,
173 })
174 }
175 }
176 #[inline(always)]
177 fn slide_within_blocks_f32x4<const SHIFT: usize>(
178 self,
179 a: f32x4<Self>,
180 b: f32x4<Self>,
181 ) -> f32x4<Self> {
182 self.slide_f32x4::<SHIFT>(a, b)
183 }
184 #[inline(always)]
185 fn abs_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
186 unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) }
187 }
188 #[inline(always)]
189 fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
190 unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) }
191 }
192 #[inline(always)]
193 fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
194 unsafe { _mm_sqrt_ps(a.into()).simd_into(self) }
195 }
196 #[inline(always)]
197 fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
198 unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) }
199 }
200 #[inline(always)]
201 fn sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
202 unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) }
203 }
204 #[inline(always)]
205 fn mul_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
206 unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) }
207 }
208 #[inline(always)]
209 fn div_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
210 unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) }
211 }
212 #[inline(always)]
213 fn copysign_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
214 unsafe {
215 let mask = _mm_set1_ps(-0.0);
216 _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self)
217 }
218 }
219 #[inline(always)]
220 fn simd_eq_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
221 unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) }
222 }
223 #[inline(always)]
224 fn simd_lt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
225 unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) }
226 }
227 #[inline(always)]
228 fn simd_le_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
229 unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) }
230 }
231 #[inline(always)]
232 fn simd_ge_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
233 unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) }
234 }
235 #[inline(always)]
236 fn simd_gt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
237 unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) }
238 }
239 #[inline(always)]
240 fn zip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
241 unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) }
242 }
243 #[inline(always)]
244 fn zip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
245 unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) }
246 }
247 #[inline(always)]
248 fn unzip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
249 unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) }
250 }
251 #[inline(always)]
252 fn unzip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
253 unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) }
254 }
255 #[inline(always)]
256 fn interleave_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> (f32x4<Self>, f32x4<Self>) {
257 (self.zip_low_f32x4(a, b), self.zip_high_f32x4(a, b))
258 }
259 #[inline(always)]
260 fn deinterleave_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> (f32x4<Self>, f32x4<Self>) {
261 (self.unzip_low_f32x4(a, b), self.unzip_high_f32x4(a, b))
262 }
263 #[inline(always)]
264 fn max_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
265 unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
266 }
267 #[inline(always)]
268 fn min_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
269 unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
270 }
271 #[inline(always)]
272 fn max_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
273 unsafe {
274 let intermediate = _mm_max_ps(a.into(), b.into());
275 let b_is_nan = _mm_cmpunord_ps(b.into(), b.into());
276 _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self)
277 }
278 }
279 #[inline(always)]
280 fn min_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
281 unsafe {
282 let intermediate = _mm_min_ps(a.into(), b.into());
283 let b_is_nan = _mm_cmpunord_ps(b.into(), b.into());
284 _mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self)
285 }
286 }
287 #[inline(always)]
288 fn mul_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
289 unsafe { _mm_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) }
290 }
291 #[inline(always)]
292 fn mul_sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
293 unsafe { _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) }
294 }
295 #[inline(always)]
296 fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
297 unsafe {
298 _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
299 }
300 }
301 #[inline(always)]
302 fn ceil_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
303 unsafe {
304 _mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
305 }
306 }
307 #[inline(always)]
308 fn round_ties_even_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
309 unsafe {
310 _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
311 .simd_into(self)
312 }
313 }
314 #[inline(always)]
315 fn fract_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
316 a - self.trunc_f32x4(a)
317 }
318 #[inline(always)]
319 fn trunc_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
320 unsafe {
321 _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
322 }
323 }
324 #[inline(always)]
325 fn select_f32x4(self, a: mask32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
326 unsafe { _mm_blendv_ps(c.into(), b.into(), _mm_castsi128_ps(a.into())).simd_into(self) }
327 }
328 #[inline(always)]
329 fn combine_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x8<Self> {
330 unsafe { _mm256_setr_m128(a.into(), b.into()).simd_into(self) }
331 }
332 #[inline(always)]
333 fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> {
334 unsafe { _mm_castps_pd(a.into()).simd_into(self) }
335 }
336 #[inline(always)]
337 fn reinterpret_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
338 unsafe { _mm_castps_si128(a.into()).simd_into(self) }
339 }
340 #[inline(always)]
341 fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
342 unsafe { _mm_castps_si128(a.into()).simd_into(self) }
343 }
344 #[inline(always)]
345 fn reinterpret_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
346 unsafe { _mm_castps_si128(a.into()).simd_into(self) }
347 }
348 #[inline(always)]
349 fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
350 unsafe {
351 let mut converted = _mm_cvttps_epi32(a.into());
352 let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0));
353 let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
354 if !all_in_range {
355 let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0));
356 let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess));
357 converted = _mm_add_epi32(converted, excess_converted);
358 }
359 converted.simd_into(self)
360 }
361 }
362 #[inline(always)]
363 fn cvt_u32_precise_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
364 unsafe {
365 let a = _mm_max_ps(a.into(), _mm_setzero_ps());
366 let mut converted = _mm_cvttps_epi32(a);
367 let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0));
368 let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
369 if !all_in_range {
370 let exceeds_unsigned_range =
371 _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a));
372 let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0));
373 let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess));
374 converted = _mm_add_epi32(converted, excess_converted);
375 converted = _mm_blendv_epi8(
376 converted,
377 _mm_set1_epi32(u32::MAX.cast_signed()),
378 exceeds_unsigned_range,
379 );
380 }
381 converted.simd_into(self)
382 }
383 }
384 #[inline(always)]
385 fn cvt_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
386 unsafe { _mm_cvttps_epi32(a.into()).simd_into(self) }
387 }
388 #[inline(always)]
389 fn cvt_i32_precise_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
390 unsafe {
391 let a = a.into();
392 let mut converted = _mm_cvttps_epi32(a);
393 let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0));
394 let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
395 if !all_in_range {
396 converted = _mm_blendv_epi8(
397 _mm_set1_epi32(i32::MAX),
398 converted,
399 _mm_castps_si128(in_range),
400 );
401 let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a));
402 converted = _mm_and_si128(converted, is_not_nan);
403 }
404 converted.simd_into(self)
405 }
406 }
407 #[inline(always)]
408 fn splat_i8x16(self, val: i8) -> i8x16<Self> {
409 unsafe { _mm_set1_epi8(val).simd_into(self) }
410 }
411 #[inline(always)]
412 fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
413 i8x16 {
414 val: unsafe { core::mem::transmute_copy(&val) },
415 simd: self,
416 }
417 }
418 #[inline(always)]
419 fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
420 i8x16 {
421 val: unsafe { core::mem::transmute_copy(val) },
422 simd: self,
423 }
424 }
425 #[inline(always)]
426 fn as_array_i8x16(self, a: i8x16<Self>) -> [i8; 16usize] {
427 unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) }
428 }
429 #[inline(always)]
430 fn as_array_ref_i8x16(self, a: &i8x16<Self>) -> &[i8; 16usize] {
431 unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) }
432 }
433 #[inline(always)]
434 fn as_array_mut_i8x16(self, a: &mut i8x16<Self>) -> &mut [i8; 16usize] {
435 unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) }
436 }
437 #[inline(always)]
438 fn store_array_i8x16(self, a: i8x16<Self>, dest: &mut [i8; 16usize]) -> () {
439 unsafe {
440 core::ptr::copy_nonoverlapping(
441 (&raw const a.val.0) as *const i8,
442 dest.as_mut_ptr(),
443 16usize,
444 );
445 }
446 }
447 #[inline(always)]
448 fn cvt_from_bytes_i8x16(self, a: u8x16<Self>) -> i8x16<Self> {
449 unsafe {
450 i8x16 {
451 val: core::mem::transmute(a.val),
452 simd: self,
453 }
454 }
455 }
456 #[inline(always)]
457 fn cvt_to_bytes_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
458 unsafe {
459 u8x16 {
460 val: core::mem::transmute(a.val),
461 simd: self,
462 }
463 }
464 }
465 #[inline(always)]
466 fn slide_i8x16<const SHIFT: usize>(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
467 unsafe {
468 if SHIFT >= 16usize {
469 return b;
470 }
471 let result = dyn_alignr_128(
472 self.cvt_to_bytes_i8x16(b).val.0,
473 self.cvt_to_bytes_i8x16(a).val.0,
474 SHIFT,
475 );
476 self.cvt_from_bytes_i8x16(u8x16 {
477 val: crate::support::Aligned128(result),
478 simd: self,
479 })
480 }
481 }
482 #[inline(always)]
483 fn slide_within_blocks_i8x16<const SHIFT: usize>(
484 self,
485 a: i8x16<Self>,
486 b: i8x16<Self>,
487 ) -> i8x16<Self> {
488 self.slide_i8x16::<SHIFT>(a, b)
489 }
490 #[inline(always)]
491 fn add_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
492 unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
493 }
494 #[inline(always)]
495 fn sub_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
496 unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
497 }
498 #[inline(always)]
499 fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
500 unsafe {
501 let dst_even = _mm_mullo_epi16(a.into(), b.into());
502 let dst_odd =
503 _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
504 _mm_or_si128(
505 _mm_slli_epi16(dst_odd, 8),
506 _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
507 )
508 .simd_into(self)
509 }
510 }
511 #[inline(always)]
512 fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
513 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
514 }
515 #[inline(always)]
516 fn or_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
517 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
518 }
519 #[inline(always)]
520 fn xor_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
521 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
522 }
523 #[inline(always)]
524 fn not_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
525 a ^ !0
526 }
527 #[inline(always)]
528 fn shl_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
529 unsafe {
530 let val = a.into();
531 let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
532 let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
533 let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
534 let lo_shifted = _mm_sll_epi16(lo_16, shift_count);
535 let hi_shifted = _mm_sll_epi16(hi_16, shift_count);
536 _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
537 }
538 }
539 #[inline(always)]
540 fn shlv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
541 core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
542 }
543 #[inline(always)]
544 fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
545 unsafe {
546 let val = a.into();
547 let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
548 let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
549 let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
550 let lo_shifted = _mm_sra_epi16(lo_16, shift_count);
551 let hi_shifted = _mm_sra_epi16(hi_16, shift_count);
552 _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
553 }
554 }
555 #[inline(always)]
556 fn shrv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
557 core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
558 }
559 #[inline(always)]
560 fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
561 unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
562 }
563 #[inline(always)]
564 fn simd_lt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
565 unsafe { _mm_cmpgt_epi8(b.into(), a.into()).simd_into(self) }
566 }
567 #[inline(always)]
568 fn simd_le_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
569 unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) }
570 }
571 #[inline(always)]
572 fn simd_ge_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
573 unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) }
574 }
575 #[inline(always)]
576 fn simd_gt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
577 unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) }
578 }
579 #[inline(always)]
580 fn zip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
581 unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
582 }
583 #[inline(always)]
584 fn zip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
585 unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
586 }
587 #[inline(always)]
588 fn unzip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
589 unsafe {
590 let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
591 let t1 = _mm_shuffle_epi8(a.into(), mask);
592 let t2 = _mm_shuffle_epi8(b.into(), mask);
593 _mm_unpacklo_epi64(t1, t2).simd_into(self)
594 }
595 }
596 #[inline(always)]
597 fn unzip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
598 unsafe {
599 let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
600 let t1 = _mm_shuffle_epi8(a.into(), mask);
601 let t2 = _mm_shuffle_epi8(b.into(), mask);
602 _mm_unpackhi_epi64(t1, t2).simd_into(self)
603 }
604 }
605 #[inline(always)]
606 fn interleave_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> (i8x16<Self>, i8x16<Self>) {
607 (self.zip_low_i8x16(a, b), self.zip_high_i8x16(a, b))
608 }
609 #[inline(always)]
610 fn deinterleave_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> (i8x16<Self>, i8x16<Self>) {
611 (self.unzip_low_i8x16(a, b), self.unzip_high_i8x16(a, b))
612 }
613 #[inline(always)]
614 fn select_i8x16(self, a: mask8x16<Self>, b: i8x16<Self>, c: i8x16<Self>) -> i8x16<Self> {
615 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
616 }
617 #[inline(always)]
618 fn min_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
619 unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) }
620 }
621 #[inline(always)]
622 fn max_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
623 unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) }
624 }
625 #[inline(always)]
626 fn combine_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x32<Self> {
627 unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
628 }
629 #[inline(always)]
630 fn neg_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
631 unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) }
632 }
633 #[inline(always)]
634 fn reinterpret_u8_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
635 __m128i::from(a).simd_into(self)
636 }
637 #[inline(always)]
638 fn reinterpret_u32_i8x16(self, a: i8x16<Self>) -> u32x4<Self> {
639 __m128i::from(a).simd_into(self)
640 }
641 #[inline(always)]
642 fn splat_u8x16(self, val: u8) -> u8x16<Self> {
643 unsafe { _mm_set1_epi8(val.cast_signed()).simd_into(self) }
644 }
645 #[inline(always)]
646 fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
647 u8x16 {
648 val: unsafe { core::mem::transmute_copy(&val) },
649 simd: self,
650 }
651 }
652 #[inline(always)]
653 fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
654 u8x16 {
655 val: unsafe { core::mem::transmute_copy(val) },
656 simd: self,
657 }
658 }
659 #[inline(always)]
660 fn as_array_u8x16(self, a: u8x16<Self>) -> [u8; 16usize] {
661 unsafe { core::mem::transmute::<__m128i, [u8; 16usize]>(a.val.0) }
662 }
663 #[inline(always)]
664 fn as_array_ref_u8x16(self, a: &u8x16<Self>) -> &[u8; 16usize] {
665 unsafe { core::mem::transmute::<&__m128i, &[u8; 16usize]>(&a.val.0) }
666 }
667 #[inline(always)]
668 fn as_array_mut_u8x16(self, a: &mut u8x16<Self>) -> &mut [u8; 16usize] {
669 unsafe { core::mem::transmute::<&mut __m128i, &mut [u8; 16usize]>(&mut a.val.0) }
670 }
671 #[inline(always)]
672 fn store_array_u8x16(self, a: u8x16<Self>, dest: &mut [u8; 16usize]) -> () {
673 unsafe {
674 core::ptr::copy_nonoverlapping(
675 (&raw const a.val.0) as *const u8,
676 dest.as_mut_ptr(),
677 16usize,
678 );
679 }
680 }
681 #[inline(always)]
682 fn cvt_from_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
683 unsafe {
684 u8x16 {
685 val: core::mem::transmute(a.val),
686 simd: self,
687 }
688 }
689 }
690 #[inline(always)]
691 fn cvt_to_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
692 unsafe {
693 u8x16 {
694 val: core::mem::transmute(a.val),
695 simd: self,
696 }
697 }
698 }
699 #[inline(always)]
700 fn slide_u8x16<const SHIFT: usize>(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
701 unsafe {
702 if SHIFT >= 16usize {
703 return b;
704 }
705 let result = dyn_alignr_128(
706 self.cvt_to_bytes_u8x16(b).val.0,
707 self.cvt_to_bytes_u8x16(a).val.0,
708 SHIFT,
709 );
710 self.cvt_from_bytes_u8x16(u8x16 {
711 val: crate::support::Aligned128(result),
712 simd: self,
713 })
714 }
715 }
716 #[inline(always)]
717 fn slide_within_blocks_u8x16<const SHIFT: usize>(
718 self,
719 a: u8x16<Self>,
720 b: u8x16<Self>,
721 ) -> u8x16<Self> {
722 self.slide_u8x16::<SHIFT>(a, b)
723 }
724 #[inline(always)]
725 fn add_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
726 unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
727 }
728 #[inline(always)]
729 fn sub_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
730 unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
731 }
732 #[inline(always)]
733 fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
734 unsafe {
735 let dst_even = _mm_mullo_epi16(a.into(), b.into());
736 let dst_odd =
737 _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
738 _mm_or_si128(
739 _mm_slli_epi16(dst_odd, 8),
740 _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
741 )
742 .simd_into(self)
743 }
744 }
745 #[inline(always)]
746 fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
747 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
748 }
749 #[inline(always)]
750 fn or_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
751 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
752 }
753 #[inline(always)]
754 fn xor_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
755 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
756 }
757 #[inline(always)]
758 fn not_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
759 a ^ !0
760 }
761 #[inline(always)]
762 fn shl_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
763 unsafe {
764 let val = a.into();
765 let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
766 let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
767 let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
768 let lo_shifted = _mm_sll_epi16(lo_16, shift_count);
769 let hi_shifted = _mm_sll_epi16(hi_16, shift_count);
770 _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
771 }
772 }
773 #[inline(always)]
774 fn shlv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
775 core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
776 }
777 #[inline(always)]
778 fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
779 unsafe {
780 let val = a.into();
781 let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
782 let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
783 let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
784 let lo_shifted = _mm_srl_epi16(lo_16, shift_count);
785 let hi_shifted = _mm_srl_epi16(hi_16, shift_count);
786 _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
787 }
788 }
789 #[inline(always)]
790 fn shrv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
791 core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
792 }
793 #[inline(always)]
794 fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
795 unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
796 }
797 #[inline(always)]
798 fn simd_lt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
799 unsafe {
800 let sign_bit = _mm_set1_epi8(0x80u8.cast_signed());
801 let a_signed = _mm_xor_si128(a.into(), sign_bit);
802 let b_signed = _mm_xor_si128(b.into(), sign_bit);
803 _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self)
804 }
805 }
806 #[inline(always)]
807 fn simd_le_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
808 unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) }
809 }
810 #[inline(always)]
811 fn simd_ge_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
812 unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) }
813 }
814 #[inline(always)]
815 fn simd_gt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
816 unsafe {
817 let sign_bit = _mm_set1_epi8(0x80u8.cast_signed());
818 let a_signed = _mm_xor_si128(a.into(), sign_bit);
819 let b_signed = _mm_xor_si128(b.into(), sign_bit);
820 _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self)
821 }
822 }
823 #[inline(always)]
824 fn zip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
825 unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
826 }
827 #[inline(always)]
828 fn zip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
829 unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
830 }
831 #[inline(always)]
832 fn unzip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
833 unsafe {
834 let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
835 let t1 = _mm_shuffle_epi8(a.into(), mask);
836 let t2 = _mm_shuffle_epi8(b.into(), mask);
837 _mm_unpacklo_epi64(t1, t2).simd_into(self)
838 }
839 }
840 #[inline(always)]
841 fn unzip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
842 unsafe {
843 let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
844 let t1 = _mm_shuffle_epi8(a.into(), mask);
845 let t2 = _mm_shuffle_epi8(b.into(), mask);
846 _mm_unpackhi_epi64(t1, t2).simd_into(self)
847 }
848 }
849 #[inline(always)]
850 fn interleave_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> (u8x16<Self>, u8x16<Self>) {
851 (self.zip_low_u8x16(a, b), self.zip_high_u8x16(a, b))
852 }
853 #[inline(always)]
854 fn deinterleave_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> (u8x16<Self>, u8x16<Self>) {
855 (self.unzip_low_u8x16(a, b), self.unzip_high_u8x16(a, b))
856 }
857 #[inline(always)]
858 fn select_u8x16(self, a: mask8x16<Self>, b: u8x16<Self>, c: u8x16<Self>) -> u8x16<Self> {
859 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
860 }
861 #[inline(always)]
862 fn min_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
863 unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) }
864 }
865 #[inline(always)]
866 fn max_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
867 unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) }
868 }
869 #[inline(always)]
870 fn combine_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x32<Self> {
871 unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
872 }
873 #[inline(always)]
874 fn widen_u8x16(self, a: u8x16<Self>) -> u16x16<Self> {
875 unsafe { _mm256_cvtepu8_epi16(a.into()).simd_into(self) }
876 }
877 #[inline(always)]
878 fn reinterpret_u32_u8x16(self, a: u8x16<Self>) -> u32x4<Self> {
879 __m128i::from(a).simd_into(self)
880 }
881 #[inline(always)]
882 fn splat_mask8x16(self, val: i8) -> mask8x16<Self> {
883 unsafe { _mm_set1_epi8(val).simd_into(self) }
884 }
885 #[inline(always)]
886 fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
887 mask8x16 {
888 val: unsafe { core::mem::transmute_copy(&val) },
889 simd: self,
890 }
891 }
892 #[inline(always)]
893 fn load_array_ref_mask8x16(self, val: &[i8; 16usize]) -> mask8x16<Self> {
894 mask8x16 {
895 val: unsafe { core::mem::transmute_copy(val) },
896 simd: self,
897 }
898 }
899 #[inline(always)]
900 fn as_array_mask8x16(self, a: mask8x16<Self>) -> [i8; 16usize] {
901 unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) }
902 }
903 #[inline(always)]
904 fn as_array_ref_mask8x16(self, a: &mask8x16<Self>) -> &[i8; 16usize] {
905 unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) }
906 }
907 #[inline(always)]
908 fn as_array_mut_mask8x16(self, a: &mut mask8x16<Self>) -> &mut [i8; 16usize] {
909 unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) }
910 }
911 #[inline(always)]
912 fn store_array_mask8x16(self, a: mask8x16<Self>, dest: &mut [i8; 16usize]) -> () {
913 unsafe {
914 core::ptr::copy_nonoverlapping(
915 (&raw const a.val.0) as *const i8,
916 dest.as_mut_ptr(),
917 16usize,
918 );
919 }
920 }
921 #[inline(always)]
922 fn cvt_from_bytes_mask8x16(self, a: u8x16<Self>) -> mask8x16<Self> {
923 unsafe {
924 mask8x16 {
925 val: core::mem::transmute(a.val),
926 simd: self,
927 }
928 }
929 }
930 #[inline(always)]
931 fn cvt_to_bytes_mask8x16(self, a: mask8x16<Self>) -> u8x16<Self> {
932 unsafe {
933 u8x16 {
934 val: core::mem::transmute(a.val),
935 simd: self,
936 }
937 }
938 }
939 #[inline(always)]
940 fn slide_mask8x16<const SHIFT: usize>(
941 self,
942 a: mask8x16<Self>,
943 b: mask8x16<Self>,
944 ) -> mask8x16<Self> {
945 unsafe {
946 if SHIFT >= 16usize {
947 return b;
948 }
949 let result = dyn_alignr_128(
950 self.cvt_to_bytes_mask8x16(b).val.0,
951 self.cvt_to_bytes_mask8x16(a).val.0,
952 SHIFT,
953 );
954 self.cvt_from_bytes_mask8x16(u8x16 {
955 val: crate::support::Aligned128(result),
956 simd: self,
957 })
958 }
959 }
960 #[inline(always)]
961 fn slide_within_blocks_mask8x16<const SHIFT: usize>(
962 self,
963 a: mask8x16<Self>,
964 b: mask8x16<Self>,
965 ) -> mask8x16<Self> {
966 self.slide_mask8x16::<SHIFT>(a, b)
967 }
968 #[inline(always)]
969 fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
970 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
971 }
972 #[inline(always)]
973 fn or_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
974 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
975 }
976 #[inline(always)]
977 fn xor_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
978 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
979 }
980 #[inline(always)]
981 fn not_mask8x16(self, a: mask8x16<Self>) -> mask8x16<Self> {
982 a ^ !0
983 }
984 #[inline(always)]
985 fn select_mask8x16(
986 self,
987 a: mask8x16<Self>,
988 b: mask8x16<Self>,
989 c: mask8x16<Self>,
990 ) -> mask8x16<Self> {
991 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
992 }
993 #[inline(always)]
994 fn simd_eq_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
995 unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
996 }
997 #[inline(always)]
998 fn any_true_mask8x16(self, a: mask8x16<Self>) -> bool {
999 unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 }
1000 }
1001 #[inline(always)]
1002 fn all_true_mask8x16(self, a: mask8x16<Self>) -> bool {
1003 unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff }
1004 }
1005 #[inline(always)]
1006 fn any_false_mask8x16(self, a: mask8x16<Self>) -> bool {
1007 unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff }
1008 }
1009 #[inline(always)]
1010 fn all_false_mask8x16(self, a: mask8x16<Self>) -> bool {
1011 unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 }
1012 }
1013 #[inline(always)]
1014 fn combine_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x32<Self> {
1015 unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
1016 }
1017 #[inline(always)]
1018 fn splat_i16x8(self, val: i16) -> i16x8<Self> {
1019 unsafe { _mm_set1_epi16(val).simd_into(self) }
1020 }
1021 #[inline(always)]
1022 fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
1023 i16x8 {
1024 val: unsafe { core::mem::transmute_copy(&val) },
1025 simd: self,
1026 }
1027 }
1028 #[inline(always)]
1029 fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
1030 i16x8 {
1031 val: unsafe { core::mem::transmute_copy(val) },
1032 simd: self,
1033 }
1034 }
1035 #[inline(always)]
1036 fn as_array_i16x8(self, a: i16x8<Self>) -> [i16; 8usize] {
1037 unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) }
1038 }
1039 #[inline(always)]
1040 fn as_array_ref_i16x8(self, a: &i16x8<Self>) -> &[i16; 8usize] {
1041 unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) }
1042 }
1043 #[inline(always)]
1044 fn as_array_mut_i16x8(self, a: &mut i16x8<Self>) -> &mut [i16; 8usize] {
1045 unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) }
1046 }
1047 #[inline(always)]
1048 fn store_array_i16x8(self, a: i16x8<Self>, dest: &mut [i16; 8usize]) -> () {
1049 unsafe {
1050 core::ptr::copy_nonoverlapping(
1051 (&raw const a.val.0) as *const i16,
1052 dest.as_mut_ptr(),
1053 8usize,
1054 );
1055 }
1056 }
1057 #[inline(always)]
1058 fn cvt_from_bytes_i16x8(self, a: u8x16<Self>) -> i16x8<Self> {
1059 unsafe {
1060 i16x8 {
1061 val: core::mem::transmute(a.val),
1062 simd: self,
1063 }
1064 }
1065 }
1066 #[inline(always)]
1067 fn cvt_to_bytes_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
1068 unsafe {
1069 u8x16 {
1070 val: core::mem::transmute(a.val),
1071 simd: self,
1072 }
1073 }
1074 }
1075 #[inline(always)]
1076 fn slide_i16x8<const SHIFT: usize>(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1077 unsafe {
1078 if SHIFT >= 8usize {
1079 return b;
1080 }
1081 let result = dyn_alignr_128(
1082 self.cvt_to_bytes_i16x8(b).val.0,
1083 self.cvt_to_bytes_i16x8(a).val.0,
1084 SHIFT * 2usize,
1085 );
1086 self.cvt_from_bytes_i16x8(u8x16 {
1087 val: crate::support::Aligned128(result),
1088 simd: self,
1089 })
1090 }
1091 }
1092 #[inline(always)]
1093 fn slide_within_blocks_i16x8<const SHIFT: usize>(
1094 self,
1095 a: i16x8<Self>,
1096 b: i16x8<Self>,
1097 ) -> i16x8<Self> {
1098 self.slide_i16x8::<SHIFT>(a, b)
1099 }
1100 #[inline(always)]
1101 fn add_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1102 unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
1103 }
1104 #[inline(always)]
1105 fn sub_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1106 unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
1107 }
1108 #[inline(always)]
1109 fn mul_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1110 unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
1111 }
1112 #[inline(always)]
1113 fn and_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1114 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1115 }
1116 #[inline(always)]
1117 fn or_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1118 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1119 }
1120 #[inline(always)]
1121 fn xor_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1122 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1123 }
1124 #[inline(always)]
1125 fn not_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
1126 a ^ !0
1127 }
1128 #[inline(always)]
1129 fn shl_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
1130 unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1131 }
1132 #[inline(always)]
1133 fn shlv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1134 core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
1135 }
1136 #[inline(always)]
1137 fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
1138 unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1139 }
1140 #[inline(always)]
1141 fn shrv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1142 core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
1143 }
1144 #[inline(always)]
1145 fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1146 unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
1147 }
1148 #[inline(always)]
1149 fn simd_lt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1150 unsafe { _mm_cmpgt_epi16(b.into(), a.into()).simd_into(self) }
1151 }
1152 #[inline(always)]
1153 fn simd_le_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1154 unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) }
1155 }
1156 #[inline(always)]
1157 fn simd_ge_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1158 unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) }
1159 }
1160 #[inline(always)]
1161 fn simd_gt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
1162 unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) }
1163 }
1164 #[inline(always)]
1165 fn zip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1166 unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
1167 }
1168 #[inline(always)]
1169 fn zip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1170 unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
1171 }
1172 #[inline(always)]
1173 fn unzip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1174 unsafe {
1175 let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1176 let t1 = _mm_shuffle_epi8(a.into(), mask);
1177 let t2 = _mm_shuffle_epi8(b.into(), mask);
1178 _mm_unpacklo_epi64(t1, t2).simd_into(self)
1179 }
1180 }
1181 #[inline(always)]
1182 fn unzip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1183 unsafe {
1184 let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1185 let t1 = _mm_shuffle_epi8(a.into(), mask);
1186 let t2 = _mm_shuffle_epi8(b.into(), mask);
1187 _mm_unpackhi_epi64(t1, t2).simd_into(self)
1188 }
1189 }
1190 #[inline(always)]
1191 fn interleave_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> (i16x8<Self>, i16x8<Self>) {
1192 (self.zip_low_i16x8(a, b), self.zip_high_i16x8(a, b))
1193 }
1194 #[inline(always)]
1195 fn deinterleave_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> (i16x8<Self>, i16x8<Self>) {
1196 (self.unzip_low_i16x8(a, b), self.unzip_high_i16x8(a, b))
1197 }
1198 #[inline(always)]
1199 fn select_i16x8(self, a: mask16x8<Self>, b: i16x8<Self>, c: i16x8<Self>) -> i16x8<Self> {
1200 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1201 }
1202 #[inline(always)]
1203 fn min_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1204 unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) }
1205 }
1206 #[inline(always)]
1207 fn max_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
1208 unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) }
1209 }
1210 #[inline(always)]
1211 fn combine_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x16<Self> {
1212 unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
1213 }
1214 #[inline(always)]
1215 fn neg_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
1216 unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) }
1217 }
1218 #[inline(always)]
1219 fn reinterpret_u8_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
1220 __m128i::from(a).simd_into(self)
1221 }
1222 #[inline(always)]
1223 fn reinterpret_u32_i16x8(self, a: i16x8<Self>) -> u32x4<Self> {
1224 __m128i::from(a).simd_into(self)
1225 }
1226 #[inline(always)]
1227 fn splat_u16x8(self, val: u16) -> u16x8<Self> {
1228 unsafe { _mm_set1_epi16(val.cast_signed()).simd_into(self) }
1229 }
1230 #[inline(always)]
1231 fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
1232 u16x8 {
1233 val: unsafe { core::mem::transmute_copy(&val) },
1234 simd: self,
1235 }
1236 }
1237 #[inline(always)]
1238 fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
1239 u16x8 {
1240 val: unsafe { core::mem::transmute_copy(val) },
1241 simd: self,
1242 }
1243 }
1244 #[inline(always)]
1245 fn as_array_u16x8(self, a: u16x8<Self>) -> [u16; 8usize] {
1246 unsafe { core::mem::transmute::<__m128i, [u16; 8usize]>(a.val.0) }
1247 }
1248 #[inline(always)]
1249 fn as_array_ref_u16x8(self, a: &u16x8<Self>) -> &[u16; 8usize] {
1250 unsafe { core::mem::transmute::<&__m128i, &[u16; 8usize]>(&a.val.0) }
1251 }
1252 #[inline(always)]
1253 fn as_array_mut_u16x8(self, a: &mut u16x8<Self>) -> &mut [u16; 8usize] {
1254 unsafe { core::mem::transmute::<&mut __m128i, &mut [u16; 8usize]>(&mut a.val.0) }
1255 }
1256 #[inline(always)]
1257 fn store_array_u16x8(self, a: u16x8<Self>, dest: &mut [u16; 8usize]) -> () {
1258 unsafe {
1259 core::ptr::copy_nonoverlapping(
1260 (&raw const a.val.0) as *const u16,
1261 dest.as_mut_ptr(),
1262 8usize,
1263 );
1264 }
1265 }
1266 #[inline(always)]
1267 fn cvt_from_bytes_u16x8(self, a: u8x16<Self>) -> u16x8<Self> {
1268 unsafe {
1269 u16x8 {
1270 val: core::mem::transmute(a.val),
1271 simd: self,
1272 }
1273 }
1274 }
1275 #[inline(always)]
1276 fn cvt_to_bytes_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
1277 unsafe {
1278 u8x16 {
1279 val: core::mem::transmute(a.val),
1280 simd: self,
1281 }
1282 }
1283 }
1284 #[inline(always)]
1285 fn slide_u16x8<const SHIFT: usize>(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1286 unsafe {
1287 if SHIFT >= 8usize {
1288 return b;
1289 }
1290 let result = dyn_alignr_128(
1291 self.cvt_to_bytes_u16x8(b).val.0,
1292 self.cvt_to_bytes_u16x8(a).val.0,
1293 SHIFT * 2usize,
1294 );
1295 self.cvt_from_bytes_u16x8(u8x16 {
1296 val: crate::support::Aligned128(result),
1297 simd: self,
1298 })
1299 }
1300 }
1301 #[inline(always)]
1302 fn slide_within_blocks_u16x8<const SHIFT: usize>(
1303 self,
1304 a: u16x8<Self>,
1305 b: u16x8<Self>,
1306 ) -> u16x8<Self> {
1307 self.slide_u16x8::<SHIFT>(a, b)
1308 }
1309 #[inline(always)]
1310 fn add_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1311 unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
1312 }
1313 #[inline(always)]
1314 fn sub_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1315 unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
1316 }
1317 #[inline(always)]
1318 fn mul_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1319 unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
1320 }
1321 #[inline(always)]
1322 fn and_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1323 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1324 }
1325 #[inline(always)]
1326 fn or_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1327 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1328 }
1329 #[inline(always)]
1330 fn xor_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1331 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1332 }
1333 #[inline(always)]
1334 fn not_u16x8(self, a: u16x8<Self>) -> u16x8<Self> {
1335 a ^ !0
1336 }
1337 #[inline(always)]
1338 fn shl_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
1339 unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1340 }
1341 #[inline(always)]
1342 fn shlv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1343 core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
1344 }
1345 #[inline(always)]
1346 fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
1347 unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1348 }
1349 #[inline(always)]
1350 fn shrv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1351 core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
1352 }
1353 #[inline(always)]
1354 fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1355 unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
1356 }
1357 #[inline(always)]
1358 fn simd_lt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1359 unsafe {
1360 let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed());
1361 let a_signed = _mm_xor_si128(a.into(), sign_bit);
1362 let b_signed = _mm_xor_si128(b.into(), sign_bit);
1363 _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self)
1364 }
1365 }
1366 #[inline(always)]
1367 fn simd_le_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1368 unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) }
1369 }
1370 #[inline(always)]
1371 fn simd_ge_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1372 unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) }
1373 }
1374 #[inline(always)]
1375 fn simd_gt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
1376 unsafe {
1377 let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed());
1378 let a_signed = _mm_xor_si128(a.into(), sign_bit);
1379 let b_signed = _mm_xor_si128(b.into(), sign_bit);
1380 _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self)
1381 }
1382 }
1383 #[inline(always)]
1384 fn zip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1385 unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
1386 }
1387 #[inline(always)]
1388 fn zip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1389 unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
1390 }
1391 #[inline(always)]
1392 fn unzip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1393 unsafe {
1394 let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1395 let t1 = _mm_shuffle_epi8(a.into(), mask);
1396 let t2 = _mm_shuffle_epi8(b.into(), mask);
1397 _mm_unpacklo_epi64(t1, t2).simd_into(self)
1398 }
1399 }
1400 #[inline(always)]
1401 fn unzip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1402 unsafe {
1403 let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1404 let t1 = _mm_shuffle_epi8(a.into(), mask);
1405 let t2 = _mm_shuffle_epi8(b.into(), mask);
1406 _mm_unpackhi_epi64(t1, t2).simd_into(self)
1407 }
1408 }
1409 #[inline(always)]
1410 fn interleave_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> (u16x8<Self>, u16x8<Self>) {
1411 (self.zip_low_u16x8(a, b), self.zip_high_u16x8(a, b))
1412 }
1413 #[inline(always)]
1414 fn deinterleave_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> (u16x8<Self>, u16x8<Self>) {
1415 (self.unzip_low_u16x8(a, b), self.unzip_high_u16x8(a, b))
1416 }
1417 #[inline(always)]
1418 fn select_u16x8(self, a: mask16x8<Self>, b: u16x8<Self>, c: u16x8<Self>) -> u16x8<Self> {
1419 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1420 }
1421 #[inline(always)]
1422 fn min_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1423 unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) }
1424 }
1425 #[inline(always)]
1426 fn max_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
1427 unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) }
1428 }
1429 #[inline(always)]
1430 fn combine_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x16<Self> {
1431 unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
1432 }
1433 #[inline(always)]
1434 fn reinterpret_u8_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
1435 __m128i::from(a).simd_into(self)
1436 }
1437 #[inline(always)]
1438 fn reinterpret_u32_u16x8(self, a: u16x8<Self>) -> u32x4<Self> {
1439 __m128i::from(a).simd_into(self)
1440 }
1441 #[inline(always)]
1442 fn splat_mask16x8(self, val: i16) -> mask16x8<Self> {
1443 unsafe { _mm_set1_epi16(val).simd_into(self) }
1444 }
1445 #[inline(always)]
1446 fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
1447 mask16x8 {
1448 val: unsafe { core::mem::transmute_copy(&val) },
1449 simd: self,
1450 }
1451 }
1452 #[inline(always)]
1453 fn load_array_ref_mask16x8(self, val: &[i16; 8usize]) -> mask16x8<Self> {
1454 mask16x8 {
1455 val: unsafe { core::mem::transmute_copy(val) },
1456 simd: self,
1457 }
1458 }
1459 #[inline(always)]
1460 fn as_array_mask16x8(self, a: mask16x8<Self>) -> [i16; 8usize] {
1461 unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) }
1462 }
1463 #[inline(always)]
1464 fn as_array_ref_mask16x8(self, a: &mask16x8<Self>) -> &[i16; 8usize] {
1465 unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) }
1466 }
1467 #[inline(always)]
1468 fn as_array_mut_mask16x8(self, a: &mut mask16x8<Self>) -> &mut [i16; 8usize] {
1469 unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) }
1470 }
1471 #[inline(always)]
1472 fn store_array_mask16x8(self, a: mask16x8<Self>, dest: &mut [i16; 8usize]) -> () {
1473 unsafe {
1474 core::ptr::copy_nonoverlapping(
1475 (&raw const a.val.0) as *const i16,
1476 dest.as_mut_ptr(),
1477 8usize,
1478 );
1479 }
1480 }
1481 #[inline(always)]
1482 fn cvt_from_bytes_mask16x8(self, a: u8x16<Self>) -> mask16x8<Self> {
1483 unsafe {
1484 mask16x8 {
1485 val: core::mem::transmute(a.val),
1486 simd: self,
1487 }
1488 }
1489 }
1490 #[inline(always)]
1491 fn cvt_to_bytes_mask16x8(self, a: mask16x8<Self>) -> u8x16<Self> {
1492 unsafe {
1493 u8x16 {
1494 val: core::mem::transmute(a.val),
1495 simd: self,
1496 }
1497 }
1498 }
1499 #[inline(always)]
1500 fn slide_mask16x8<const SHIFT: usize>(
1501 self,
1502 a: mask16x8<Self>,
1503 b: mask16x8<Self>,
1504 ) -> mask16x8<Self> {
1505 unsafe {
1506 if SHIFT >= 8usize {
1507 return b;
1508 }
1509 let result = dyn_alignr_128(
1510 self.cvt_to_bytes_mask16x8(b).val.0,
1511 self.cvt_to_bytes_mask16x8(a).val.0,
1512 SHIFT * 2usize,
1513 );
1514 self.cvt_from_bytes_mask16x8(u8x16 {
1515 val: crate::support::Aligned128(result),
1516 simd: self,
1517 })
1518 }
1519 }
1520 #[inline(always)]
1521 fn slide_within_blocks_mask16x8<const SHIFT: usize>(
1522 self,
1523 a: mask16x8<Self>,
1524 b: mask16x8<Self>,
1525 ) -> mask16x8<Self> {
1526 self.slide_mask16x8::<SHIFT>(a, b)
1527 }
1528 #[inline(always)]
1529 fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1530 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1531 }
1532 #[inline(always)]
1533 fn or_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1534 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1535 }
1536 #[inline(always)]
1537 fn xor_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1538 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1539 }
1540 #[inline(always)]
1541 fn not_mask16x8(self, a: mask16x8<Self>) -> mask16x8<Self> {
1542 a ^ !0
1543 }
1544 #[inline(always)]
1545 fn select_mask16x8(
1546 self,
1547 a: mask16x8<Self>,
1548 b: mask16x8<Self>,
1549 c: mask16x8<Self>,
1550 ) -> mask16x8<Self> {
1551 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1552 }
1553 #[inline(always)]
1554 fn simd_eq_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
1555 unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
1556 }
1557 #[inline(always)]
1558 fn any_true_mask16x8(self, a: mask16x8<Self>) -> bool {
1559 unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 }
1560 }
1561 #[inline(always)]
1562 fn all_true_mask16x8(self, a: mask16x8<Self>) -> bool {
1563 unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff }
1564 }
1565 #[inline(always)]
1566 fn any_false_mask16x8(self, a: mask16x8<Self>) -> bool {
1567 unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff }
1568 }
1569 #[inline(always)]
1570 fn all_false_mask16x8(self, a: mask16x8<Self>) -> bool {
1571 unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 }
1572 }
1573 #[inline(always)]
1574 fn combine_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x16<Self> {
1575 unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
1576 }
1577 #[inline(always)]
1578 fn splat_i32x4(self, val: i32) -> i32x4<Self> {
1579 unsafe { _mm_set1_epi32(val).simd_into(self) }
1580 }
1581 #[inline(always)]
1582 fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
1583 i32x4 {
1584 val: unsafe { core::mem::transmute_copy(&val) },
1585 simd: self,
1586 }
1587 }
1588 #[inline(always)]
1589 fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
1590 i32x4 {
1591 val: unsafe { core::mem::transmute_copy(val) },
1592 simd: self,
1593 }
1594 }
1595 #[inline(always)]
1596 fn as_array_i32x4(self, a: i32x4<Self>) -> [i32; 4usize] {
1597 unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) }
1598 }
1599 #[inline(always)]
1600 fn as_array_ref_i32x4(self, a: &i32x4<Self>) -> &[i32; 4usize] {
1601 unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) }
1602 }
1603 #[inline(always)]
1604 fn as_array_mut_i32x4(self, a: &mut i32x4<Self>) -> &mut [i32; 4usize] {
1605 unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) }
1606 }
1607 #[inline(always)]
1608 fn store_array_i32x4(self, a: i32x4<Self>, dest: &mut [i32; 4usize]) -> () {
1609 unsafe {
1610 core::ptr::copy_nonoverlapping(
1611 (&raw const a.val.0) as *const i32,
1612 dest.as_mut_ptr(),
1613 4usize,
1614 );
1615 }
1616 }
1617 #[inline(always)]
1618 fn cvt_from_bytes_i32x4(self, a: u8x16<Self>) -> i32x4<Self> {
1619 unsafe {
1620 i32x4 {
1621 val: core::mem::transmute(a.val),
1622 simd: self,
1623 }
1624 }
1625 }
1626 #[inline(always)]
1627 fn cvt_to_bytes_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
1628 unsafe {
1629 u8x16 {
1630 val: core::mem::transmute(a.val),
1631 simd: self,
1632 }
1633 }
1634 }
1635 #[inline(always)]
1636 fn slide_i32x4<const SHIFT: usize>(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1637 unsafe {
1638 if SHIFT >= 4usize {
1639 return b;
1640 }
1641 let result = dyn_alignr_128(
1642 self.cvt_to_bytes_i32x4(b).val.0,
1643 self.cvt_to_bytes_i32x4(a).val.0,
1644 SHIFT * 4usize,
1645 );
1646 self.cvt_from_bytes_i32x4(u8x16 {
1647 val: crate::support::Aligned128(result),
1648 simd: self,
1649 })
1650 }
1651 }
1652 #[inline(always)]
1653 fn slide_within_blocks_i32x4<const SHIFT: usize>(
1654 self,
1655 a: i32x4<Self>,
1656 b: i32x4<Self>,
1657 ) -> i32x4<Self> {
1658 self.slide_i32x4::<SHIFT>(a, b)
1659 }
1660 #[inline(always)]
1661 fn add_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1662 unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
1663 }
1664 #[inline(always)]
1665 fn sub_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1666 unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
1667 }
1668 #[inline(always)]
1669 fn mul_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1670 unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
1671 }
1672 #[inline(always)]
1673 fn and_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1674 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1675 }
1676 #[inline(always)]
1677 fn or_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1678 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1679 }
1680 #[inline(always)]
1681 fn xor_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1682 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1683 }
1684 #[inline(always)]
1685 fn not_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
1686 a ^ !0
1687 }
1688 #[inline(always)]
1689 fn shl_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
1690 unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1691 }
1692 #[inline(always)]
1693 fn shlv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1694 unsafe { _mm_sllv_epi32(a.into(), b.into()).simd_into(self) }
1695 }
1696 #[inline(always)]
1697 fn shr_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
1698 unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1699 }
1700 #[inline(always)]
1701 fn shrv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1702 unsafe { _mm_srav_epi32(a.into(), b.into()).simd_into(self) }
1703 }
1704 #[inline(always)]
1705 fn simd_eq_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1706 unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
1707 }
1708 #[inline(always)]
1709 fn simd_lt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1710 unsafe { _mm_cmpgt_epi32(b.into(), a.into()).simd_into(self) }
1711 }
1712 #[inline(always)]
1713 fn simd_le_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1714 unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) }
1715 }
1716 #[inline(always)]
1717 fn simd_ge_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1718 unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) }
1719 }
1720 #[inline(always)]
1721 fn simd_gt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
1722 unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) }
1723 }
1724 #[inline(always)]
1725 fn zip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1726 unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
1727 }
1728 #[inline(always)]
1729 fn zip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1730 unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
1731 }
1732 #[inline(always)]
1733 fn unzip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1734 unsafe {
1735 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1736 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1737 _mm_unpacklo_epi64(t1, t2).simd_into(self)
1738 }
1739 }
1740 #[inline(always)]
1741 fn unzip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1742 unsafe {
1743 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1744 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1745 _mm_unpackhi_epi64(t1, t2).simd_into(self)
1746 }
1747 }
1748 #[inline(always)]
1749 fn interleave_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> (i32x4<Self>, i32x4<Self>) {
1750 (self.zip_low_i32x4(a, b), self.zip_high_i32x4(a, b))
1751 }
1752 #[inline(always)]
1753 fn deinterleave_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> (i32x4<Self>, i32x4<Self>) {
1754 (self.unzip_low_i32x4(a, b), self.unzip_high_i32x4(a, b))
1755 }
1756 #[inline(always)]
1757 fn select_i32x4(self, a: mask32x4<Self>, b: i32x4<Self>, c: i32x4<Self>) -> i32x4<Self> {
1758 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1759 }
1760 #[inline(always)]
1761 fn min_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1762 unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) }
1763 }
1764 #[inline(always)]
1765 fn max_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
1766 unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) }
1767 }
1768 #[inline(always)]
1769 fn combine_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x8<Self> {
1770 unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
1771 }
1772 #[inline(always)]
1773 fn neg_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
1774 unsafe { _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(self) }
1775 }
1776 #[inline(always)]
1777 fn reinterpret_u8_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
1778 __m128i::from(a).simd_into(self)
1779 }
1780 #[inline(always)]
1781 fn reinterpret_u32_i32x4(self, a: i32x4<Self>) -> u32x4<Self> {
1782 __m128i::from(a).simd_into(self)
1783 }
1784 #[inline(always)]
1785 fn cvt_f32_i32x4(self, a: i32x4<Self>) -> f32x4<Self> {
1786 unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) }
1787 }
1788 #[inline(always)]
1789 fn splat_u32x4(self, val: u32) -> u32x4<Self> {
1790 unsafe { _mm_set1_epi32(val.cast_signed()).simd_into(self) }
1791 }
1792 #[inline(always)]
1793 fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
1794 u32x4 {
1795 val: unsafe { core::mem::transmute_copy(&val) },
1796 simd: self,
1797 }
1798 }
1799 #[inline(always)]
1800 fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
1801 u32x4 {
1802 val: unsafe { core::mem::transmute_copy(val) },
1803 simd: self,
1804 }
1805 }
1806 #[inline(always)]
1807 fn as_array_u32x4(self, a: u32x4<Self>) -> [u32; 4usize] {
1808 unsafe { core::mem::transmute::<__m128i, [u32; 4usize]>(a.val.0) }
1809 }
1810 #[inline(always)]
1811 fn as_array_ref_u32x4(self, a: &u32x4<Self>) -> &[u32; 4usize] {
1812 unsafe { core::mem::transmute::<&__m128i, &[u32; 4usize]>(&a.val.0) }
1813 }
1814 #[inline(always)]
1815 fn as_array_mut_u32x4(self, a: &mut u32x4<Self>) -> &mut [u32; 4usize] {
1816 unsafe { core::mem::transmute::<&mut __m128i, &mut [u32; 4usize]>(&mut a.val.0) }
1817 }
1818 #[inline(always)]
1819 fn store_array_u32x4(self, a: u32x4<Self>, dest: &mut [u32; 4usize]) -> () {
1820 unsafe {
1821 core::ptr::copy_nonoverlapping(
1822 (&raw const a.val.0) as *const u32,
1823 dest.as_mut_ptr(),
1824 4usize,
1825 );
1826 }
1827 }
1828 #[inline(always)]
1829 fn cvt_from_bytes_u32x4(self, a: u8x16<Self>) -> u32x4<Self> {
1830 unsafe {
1831 u32x4 {
1832 val: core::mem::transmute(a.val),
1833 simd: self,
1834 }
1835 }
1836 }
1837 #[inline(always)]
1838 fn cvt_to_bytes_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
1839 unsafe {
1840 u8x16 {
1841 val: core::mem::transmute(a.val),
1842 simd: self,
1843 }
1844 }
1845 }
1846 #[inline(always)]
1847 fn slide_u32x4<const SHIFT: usize>(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1848 unsafe {
1849 if SHIFT >= 4usize {
1850 return b;
1851 }
1852 let result = dyn_alignr_128(
1853 self.cvt_to_bytes_u32x4(b).val.0,
1854 self.cvt_to_bytes_u32x4(a).val.0,
1855 SHIFT * 4usize,
1856 );
1857 self.cvt_from_bytes_u32x4(u8x16 {
1858 val: crate::support::Aligned128(result),
1859 simd: self,
1860 })
1861 }
1862 }
1863 #[inline(always)]
1864 fn slide_within_blocks_u32x4<const SHIFT: usize>(
1865 self,
1866 a: u32x4<Self>,
1867 b: u32x4<Self>,
1868 ) -> u32x4<Self> {
1869 self.slide_u32x4::<SHIFT>(a, b)
1870 }
1871 #[inline(always)]
1872 fn add_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1873 unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
1874 }
1875 #[inline(always)]
1876 fn sub_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1877 unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
1878 }
1879 #[inline(always)]
1880 fn mul_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1881 unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
1882 }
1883 #[inline(always)]
1884 fn and_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1885 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
1886 }
1887 #[inline(always)]
1888 fn or_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1889 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
1890 }
1891 #[inline(always)]
1892 fn xor_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1893 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
1894 }
1895 #[inline(always)]
1896 fn not_u32x4(self, a: u32x4<Self>) -> u32x4<Self> {
1897 a ^ !0
1898 }
1899 #[inline(always)]
1900 fn shl_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
1901 unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1902 }
1903 #[inline(always)]
1904 fn shlv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1905 unsafe { _mm_sllv_epi32(a.into(), b.into()).simd_into(self) }
1906 }
1907 #[inline(always)]
1908 fn shr_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
1909 unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
1910 }
1911 #[inline(always)]
1912 fn shrv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1913 unsafe { _mm_srlv_epi32(a.into(), b.into()).simd_into(self) }
1914 }
1915 #[inline(always)]
1916 fn simd_eq_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1917 unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
1918 }
1919 #[inline(always)]
1920 fn simd_lt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1921 unsafe {
1922 let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed());
1923 let a_signed = _mm_xor_si128(a.into(), sign_bit);
1924 let b_signed = _mm_xor_si128(b.into(), sign_bit);
1925 _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self)
1926 }
1927 }
1928 #[inline(always)]
1929 fn simd_le_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1930 unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1931 }
1932 #[inline(always)]
1933 fn simd_ge_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1934 unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) }
1935 }
1936 #[inline(always)]
1937 fn simd_gt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
1938 unsafe {
1939 let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed());
1940 let a_signed = _mm_xor_si128(a.into(), sign_bit);
1941 let b_signed = _mm_xor_si128(b.into(), sign_bit);
1942 _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self)
1943 }
1944 }
1945 #[inline(always)]
1946 fn zip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1947 unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
1948 }
1949 #[inline(always)]
1950 fn zip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1951 unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
1952 }
1953 #[inline(always)]
1954 fn unzip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1955 unsafe {
1956 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1957 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1958 _mm_unpacklo_epi64(t1, t2).simd_into(self)
1959 }
1960 }
1961 #[inline(always)]
1962 fn unzip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1963 unsafe {
1964 let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
1965 let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
1966 _mm_unpackhi_epi64(t1, t2).simd_into(self)
1967 }
1968 }
1969 #[inline(always)]
1970 fn interleave_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> (u32x4<Self>, u32x4<Self>) {
1971 (self.zip_low_u32x4(a, b), self.zip_high_u32x4(a, b))
1972 }
1973 #[inline(always)]
1974 fn deinterleave_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> (u32x4<Self>, u32x4<Self>) {
1975 (self.unzip_low_u32x4(a, b), self.unzip_high_u32x4(a, b))
1976 }
1977 #[inline(always)]
1978 fn select_u32x4(self, a: mask32x4<Self>, b: u32x4<Self>, c: u32x4<Self>) -> u32x4<Self> {
1979 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
1980 }
1981 #[inline(always)]
1982 fn min_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1983 unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) }
1984 }
1985 #[inline(always)]
1986 fn max_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
1987 unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) }
1988 }
1989 #[inline(always)]
1990 fn combine_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x8<Self> {
1991 unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
1992 }
1993 #[inline(always)]
1994 fn reinterpret_u8_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
1995 __m128i::from(a).simd_into(self)
1996 }
1997 #[inline(always)]
1998 fn cvt_f32_u32x4(self, a: u32x4<Self>) -> f32x4<Self> {
1999 unsafe {
2000 let a = a.into();
2001 let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000));
2002 let hi = _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000));
2003 let fhi = _mm_sub_ps(
2004 _mm_castsi128_ps(hi),
2005 _mm_set1_ps(f32::from_bits(0x53000080)),
2006 );
2007 let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi);
2008 result.simd_into(self)
2009 }
2010 }
2011 #[inline(always)]
2012 fn splat_mask32x4(self, val: i32) -> mask32x4<Self> {
2013 unsafe { _mm_set1_epi32(val).simd_into(self) }
2014 }
2015 #[inline(always)]
2016 fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
2017 mask32x4 {
2018 val: unsafe { core::mem::transmute_copy(&val) },
2019 simd: self,
2020 }
2021 }
2022 #[inline(always)]
2023 fn load_array_ref_mask32x4(self, val: &[i32; 4usize]) -> mask32x4<Self> {
2024 mask32x4 {
2025 val: unsafe { core::mem::transmute_copy(val) },
2026 simd: self,
2027 }
2028 }
2029 #[inline(always)]
2030 fn as_array_mask32x4(self, a: mask32x4<Self>) -> [i32; 4usize] {
2031 unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) }
2032 }
2033 #[inline(always)]
2034 fn as_array_ref_mask32x4(self, a: &mask32x4<Self>) -> &[i32; 4usize] {
2035 unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) }
2036 }
2037 #[inline(always)]
2038 fn as_array_mut_mask32x4(self, a: &mut mask32x4<Self>) -> &mut [i32; 4usize] {
2039 unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) }
2040 }
2041 #[inline(always)]
2042 fn store_array_mask32x4(self, a: mask32x4<Self>, dest: &mut [i32; 4usize]) -> () {
2043 unsafe {
2044 core::ptr::copy_nonoverlapping(
2045 (&raw const a.val.0) as *const i32,
2046 dest.as_mut_ptr(),
2047 4usize,
2048 );
2049 }
2050 }
2051 #[inline(always)]
2052 fn cvt_from_bytes_mask32x4(self, a: u8x16<Self>) -> mask32x4<Self> {
2053 unsafe {
2054 mask32x4 {
2055 val: core::mem::transmute(a.val),
2056 simd: self,
2057 }
2058 }
2059 }
2060 #[inline(always)]
2061 fn cvt_to_bytes_mask32x4(self, a: mask32x4<Self>) -> u8x16<Self> {
2062 unsafe {
2063 u8x16 {
2064 val: core::mem::transmute(a.val),
2065 simd: self,
2066 }
2067 }
2068 }
2069 #[inline(always)]
2070 fn slide_mask32x4<const SHIFT: usize>(
2071 self,
2072 a: mask32x4<Self>,
2073 b: mask32x4<Self>,
2074 ) -> mask32x4<Self> {
2075 unsafe {
2076 if SHIFT >= 4usize {
2077 return b;
2078 }
2079 let result = dyn_alignr_128(
2080 self.cvt_to_bytes_mask32x4(b).val.0,
2081 self.cvt_to_bytes_mask32x4(a).val.0,
2082 SHIFT * 4usize,
2083 );
2084 self.cvt_from_bytes_mask32x4(u8x16 {
2085 val: crate::support::Aligned128(result),
2086 simd: self,
2087 })
2088 }
2089 }
2090 #[inline(always)]
2091 fn slide_within_blocks_mask32x4<const SHIFT: usize>(
2092 self,
2093 a: mask32x4<Self>,
2094 b: mask32x4<Self>,
2095 ) -> mask32x4<Self> {
2096 self.slide_mask32x4::<SHIFT>(a, b)
2097 }
2098 #[inline(always)]
2099 fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2100 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
2101 }
2102 #[inline(always)]
2103 fn or_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2104 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
2105 }
2106 #[inline(always)]
2107 fn xor_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2108 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
2109 }
2110 #[inline(always)]
2111 fn not_mask32x4(self, a: mask32x4<Self>) -> mask32x4<Self> {
2112 a ^ !0
2113 }
2114 #[inline(always)]
2115 fn select_mask32x4(
2116 self,
2117 a: mask32x4<Self>,
2118 b: mask32x4<Self>,
2119 c: mask32x4<Self>,
2120 ) -> mask32x4<Self> {
2121 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
2122 }
2123 #[inline(always)]
2124 fn simd_eq_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
2125 unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
2126 }
2127 #[inline(always)]
2128 fn any_true_mask32x4(self, a: mask32x4<Self>) -> bool {
2129 unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0 }
2130 }
2131 #[inline(always)]
2132 fn all_true_mask32x4(self, a: mask32x4<Self>) -> bool {
2133 unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0b1111 }
2134 }
2135 #[inline(always)]
2136 fn any_false_mask32x4(self, a: mask32x4<Self>) -> bool {
2137 unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0b1111 }
2138 }
2139 #[inline(always)]
2140 fn all_false_mask32x4(self, a: mask32x4<Self>) -> bool {
2141 unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0 }
2142 }
2143 #[inline(always)]
2144 fn combine_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x8<Self> {
2145 unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
2146 }
2147 #[inline(always)]
2148 fn splat_f64x2(self, val: f64) -> f64x2<Self> {
2149 unsafe { _mm_set1_pd(val).simd_into(self) }
2150 }
2151 #[inline(always)]
2152 fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
2153 f64x2 {
2154 val: unsafe { core::mem::transmute_copy(&val) },
2155 simd: self,
2156 }
2157 }
2158 #[inline(always)]
2159 fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
2160 f64x2 {
2161 val: unsafe { core::mem::transmute_copy(val) },
2162 simd: self,
2163 }
2164 }
2165 #[inline(always)]
2166 fn as_array_f64x2(self, a: f64x2<Self>) -> [f64; 2usize] {
2167 unsafe { core::mem::transmute::<__m128d, [f64; 2usize]>(a.val.0) }
2168 }
2169 #[inline(always)]
2170 fn as_array_ref_f64x2(self, a: &f64x2<Self>) -> &[f64; 2usize] {
2171 unsafe { core::mem::transmute::<&__m128d, &[f64; 2usize]>(&a.val.0) }
2172 }
2173 #[inline(always)]
2174 fn as_array_mut_f64x2(self, a: &mut f64x2<Self>) -> &mut [f64; 2usize] {
2175 unsafe { core::mem::transmute::<&mut __m128d, &mut [f64; 2usize]>(&mut a.val.0) }
2176 }
2177 #[inline(always)]
2178 fn store_array_f64x2(self, a: f64x2<Self>, dest: &mut [f64; 2usize]) -> () {
2179 unsafe {
2180 core::ptr::copy_nonoverlapping(
2181 (&raw const a.val.0) as *const f64,
2182 dest.as_mut_ptr(),
2183 2usize,
2184 );
2185 }
2186 }
2187 #[inline(always)]
2188 fn cvt_from_bytes_f64x2(self, a: u8x16<Self>) -> f64x2<Self> {
2189 unsafe {
2190 f64x2 {
2191 val: core::mem::transmute(a.val),
2192 simd: self,
2193 }
2194 }
2195 }
2196 #[inline(always)]
2197 fn cvt_to_bytes_f64x2(self, a: f64x2<Self>) -> u8x16<Self> {
2198 unsafe {
2199 u8x16 {
2200 val: core::mem::transmute(a.val),
2201 simd: self,
2202 }
2203 }
2204 }
2205 #[inline(always)]
2206 fn slide_f64x2<const SHIFT: usize>(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2207 unsafe {
2208 if SHIFT >= 2usize {
2209 return b;
2210 }
2211 let result = dyn_alignr_128(
2212 self.cvt_to_bytes_f64x2(b).val.0,
2213 self.cvt_to_bytes_f64x2(a).val.0,
2214 SHIFT * 8usize,
2215 );
2216 self.cvt_from_bytes_f64x2(u8x16 {
2217 val: crate::support::Aligned128(result),
2218 simd: self,
2219 })
2220 }
2221 }
2222 #[inline(always)]
2223 fn slide_within_blocks_f64x2<const SHIFT: usize>(
2224 self,
2225 a: f64x2<Self>,
2226 b: f64x2<Self>,
2227 ) -> f64x2<Self> {
2228 self.slide_f64x2::<SHIFT>(a, b)
2229 }
2230 #[inline(always)]
2231 fn abs_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2232 unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) }
2233 }
2234 #[inline(always)]
2235 fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2236 unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) }
2237 }
2238 #[inline(always)]
2239 fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2240 unsafe { _mm_sqrt_pd(a.into()).simd_into(self) }
2241 }
2242 #[inline(always)]
2243 fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2244 unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) }
2245 }
2246 #[inline(always)]
2247 fn sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2248 unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) }
2249 }
2250 #[inline(always)]
2251 fn mul_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2252 unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) }
2253 }
2254 #[inline(always)]
2255 fn div_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2256 unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) }
2257 }
2258 #[inline(always)]
2259 fn copysign_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2260 unsafe {
2261 let mask = _mm_set1_pd(-0.0);
2262 _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self)
2263 }
2264 }
2265 #[inline(always)]
2266 fn simd_eq_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2267 unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) }
2268 }
2269 #[inline(always)]
2270 fn simd_lt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2271 unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) }
2272 }
2273 #[inline(always)]
2274 fn simd_le_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2275 unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) }
2276 }
2277 #[inline(always)]
2278 fn simd_ge_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2279 unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) }
2280 }
2281 #[inline(always)]
2282 fn simd_gt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
2283 unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) }
2284 }
2285 #[inline(always)]
2286 fn zip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2287 unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) }
2288 }
2289 #[inline(always)]
2290 fn zip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2291 unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) }
2292 }
2293 #[inline(always)]
2294 fn unzip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2295 unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) }
2296 }
2297 #[inline(always)]
2298 fn unzip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2299 unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) }
2300 }
2301 #[inline(always)]
2302 fn interleave_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> (f64x2<Self>, f64x2<Self>) {
2303 (self.zip_low_f64x2(a, b), self.zip_high_f64x2(a, b))
2304 }
2305 #[inline(always)]
2306 fn deinterleave_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> (f64x2<Self>, f64x2<Self>) {
2307 (self.unzip_low_f64x2(a, b), self.unzip_high_f64x2(a, b))
2308 }
2309 #[inline(always)]
2310 fn max_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2311 unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) }
2312 }
2313 #[inline(always)]
2314 fn min_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2315 unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) }
2316 }
2317 #[inline(always)]
2318 fn max_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2319 unsafe {
2320 let intermediate = _mm_max_pd(a.into(), b.into());
2321 let b_is_nan = _mm_cmpunord_pd(b.into(), b.into());
2322 _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self)
2323 }
2324 }
2325 #[inline(always)]
2326 fn min_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
2327 unsafe {
2328 let intermediate = _mm_min_pd(a.into(), b.into());
2329 let b_is_nan = _mm_cmpunord_pd(b.into(), b.into());
2330 _mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self)
2331 }
2332 }
2333 #[inline(always)]
2334 fn mul_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
2335 unsafe { _mm_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) }
2336 }
2337 #[inline(always)]
2338 fn mul_sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
2339 unsafe { _mm_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) }
2340 }
2341 #[inline(always)]
2342 fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2343 unsafe {
2344 _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
2345 }
2346 }
2347 #[inline(always)]
2348 fn ceil_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2349 unsafe {
2350 _mm_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
2351 }
2352 }
2353 #[inline(always)]
2354 fn round_ties_even_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2355 unsafe {
2356 _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
2357 .simd_into(self)
2358 }
2359 }
2360 #[inline(always)]
2361 fn fract_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2362 a - self.trunc_f64x2(a)
2363 }
2364 #[inline(always)]
2365 fn trunc_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
2366 unsafe {
2367 _mm_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
2368 }
2369 }
2370 #[inline(always)]
2371 fn select_f64x2(self, a: mask64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
2372 unsafe { _mm_blendv_pd(c.into(), b.into(), _mm_castsi128_pd(a.into())).simd_into(self) }
2373 }
2374 #[inline(always)]
2375 fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self> {
2376 unsafe { _mm256_setr_m128d(a.into(), b.into()).simd_into(self) }
2377 }
2378 #[inline(always)]
2379 fn reinterpret_f32_f64x2(self, a: f64x2<Self>) -> f32x4<Self> {
2380 unsafe { _mm_castpd_ps(a.into()).simd_into(self) }
2381 }
2382 #[inline(always)]
2383 fn splat_mask64x2(self, val: i64) -> mask64x2<Self> {
2384 unsafe { _mm_set1_epi64x(val).simd_into(self) }
2385 }
2386 #[inline(always)]
2387 fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
2388 mask64x2 {
2389 val: unsafe { core::mem::transmute_copy(&val) },
2390 simd: self,
2391 }
2392 }
2393 #[inline(always)]
2394 fn load_array_ref_mask64x2(self, val: &[i64; 2usize]) -> mask64x2<Self> {
2395 mask64x2 {
2396 val: unsafe { core::mem::transmute_copy(val) },
2397 simd: self,
2398 }
2399 }
2400 #[inline(always)]
2401 fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
2402 unsafe { core::mem::transmute::<__m128i, [i64; 2usize]>(a.val.0) }
2403 }
2404 #[inline(always)]
2405 fn as_array_ref_mask64x2(self, a: &mask64x2<Self>) -> &[i64; 2usize] {
2406 unsafe { core::mem::transmute::<&__m128i, &[i64; 2usize]>(&a.val.0) }
2407 }
2408 #[inline(always)]
2409 fn as_array_mut_mask64x2(self, a: &mut mask64x2<Self>) -> &mut [i64; 2usize] {
2410 unsafe { core::mem::transmute::<&mut __m128i, &mut [i64; 2usize]>(&mut a.val.0) }
2411 }
2412 #[inline(always)]
2413 fn store_array_mask64x2(self, a: mask64x2<Self>, dest: &mut [i64; 2usize]) -> () {
2414 unsafe {
2415 core::ptr::copy_nonoverlapping(
2416 (&raw const a.val.0) as *const i64,
2417 dest.as_mut_ptr(),
2418 2usize,
2419 );
2420 }
2421 }
2422 #[inline(always)]
2423 fn cvt_from_bytes_mask64x2(self, a: u8x16<Self>) -> mask64x2<Self> {
2424 unsafe {
2425 mask64x2 {
2426 val: core::mem::transmute(a.val),
2427 simd: self,
2428 }
2429 }
2430 }
2431 #[inline(always)]
2432 fn cvt_to_bytes_mask64x2(self, a: mask64x2<Self>) -> u8x16<Self> {
2433 unsafe {
2434 u8x16 {
2435 val: core::mem::transmute(a.val),
2436 simd: self,
2437 }
2438 }
2439 }
2440 #[inline(always)]
2441 fn slide_mask64x2<const SHIFT: usize>(
2442 self,
2443 a: mask64x2<Self>,
2444 b: mask64x2<Self>,
2445 ) -> mask64x2<Self> {
2446 unsafe {
2447 if SHIFT >= 2usize {
2448 return b;
2449 }
2450 let result = dyn_alignr_128(
2451 self.cvt_to_bytes_mask64x2(b).val.0,
2452 self.cvt_to_bytes_mask64x2(a).val.0,
2453 SHIFT * 8usize,
2454 );
2455 self.cvt_from_bytes_mask64x2(u8x16 {
2456 val: crate::support::Aligned128(result),
2457 simd: self,
2458 })
2459 }
2460 }
2461 #[inline(always)]
2462 fn slide_within_blocks_mask64x2<const SHIFT: usize>(
2463 self,
2464 a: mask64x2<Self>,
2465 b: mask64x2<Self>,
2466 ) -> mask64x2<Self> {
2467 self.slide_mask64x2::<SHIFT>(a, b)
2468 }
2469 #[inline(always)]
2470 fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2471 unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
2472 }
2473 #[inline(always)]
2474 fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2475 unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
2476 }
2477 #[inline(always)]
2478 fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2479 unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
2480 }
2481 #[inline(always)]
2482 fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
2483 a ^ !0
2484 }
2485 #[inline(always)]
2486 fn select_mask64x2(
2487 self,
2488 a: mask64x2<Self>,
2489 b: mask64x2<Self>,
2490 c: mask64x2<Self>,
2491 ) -> mask64x2<Self> {
2492 unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
2493 }
2494 #[inline(always)]
2495 fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
2496 unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) }
2497 }
2498 #[inline(always)]
2499 fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
2500 unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 }
2501 }
2502 #[inline(always)]
2503 fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
2504 unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 }
2505 }
2506 #[inline(always)]
2507 fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
2508 unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 }
2509 }
2510 #[inline(always)]
2511 fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
2512 unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 }
2513 }
2514 #[inline(always)]
2515 fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
2516 unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) }
2517 }
2518 #[inline(always)]
2519 fn splat_f32x8(self, val: f32) -> f32x8<Self> {
2520 unsafe { _mm256_set1_ps(val).simd_into(self) }
2521 }
2522 #[inline(always)]
2523 fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
2524 f32x8 {
2525 val: unsafe { core::mem::transmute_copy(&val) },
2526 simd: self,
2527 }
2528 }
2529 #[inline(always)]
2530 fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
2531 f32x8 {
2532 val: unsafe { core::mem::transmute_copy(val) },
2533 simd: self,
2534 }
2535 }
2536 #[inline(always)]
2537 fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
2538 unsafe { core::mem::transmute::<__m256, [f32; 8usize]>(a.val.0) }
2539 }
2540 #[inline(always)]
2541 fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
2542 unsafe { core::mem::transmute::<&__m256, &[f32; 8usize]>(&a.val.0) }
2543 }
2544 #[inline(always)]
2545 fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
2546 unsafe { core::mem::transmute::<&mut __m256, &mut [f32; 8usize]>(&mut a.val.0) }
2547 }
2548 #[inline(always)]
2549 fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
2550 unsafe {
2551 core::ptr::copy_nonoverlapping(
2552 (&raw const a.val.0) as *const f32,
2553 dest.as_mut_ptr(),
2554 8usize,
2555 );
2556 }
2557 }
2558 #[inline(always)]
2559 fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
2560 unsafe {
2561 f32x8 {
2562 val: core::mem::transmute(a.val),
2563 simd: self,
2564 }
2565 }
2566 }
2567 #[inline(always)]
2568 fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
2569 unsafe {
2570 u8x32 {
2571 val: core::mem::transmute(a.val),
2572 simd: self,
2573 }
2574 }
2575 }
2576 #[inline(always)]
2577 fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2578 unsafe {
2579 if SHIFT >= 8usize {
2580 return b;
2581 }
2582 let result = cross_block_alignr_256x1(
2583 self.cvt_to_bytes_f32x8(b).val.0,
2584 self.cvt_to_bytes_f32x8(a).val.0,
2585 SHIFT * 4usize,
2586 );
2587 self.cvt_from_bytes_f32x8(u8x32 {
2588 val: crate::support::Aligned256(result),
2589 simd: self,
2590 })
2591 }
2592 }
2593 #[inline(always)]
2594 fn slide_within_blocks_f32x8<const SHIFT: usize>(
2595 self,
2596 a: f32x8<Self>,
2597 b: f32x8<Self>,
2598 ) -> f32x8<Self> {
2599 unsafe {
2600 if SHIFT >= 4usize {
2601 return b;
2602 }
2603 let result = dyn_alignr_256(
2604 self.cvt_to_bytes_f32x8(b).val.0,
2605 self.cvt_to_bytes_f32x8(a).val.0,
2606 SHIFT * 4usize,
2607 );
2608 self.cvt_from_bytes_f32x8(u8x32 {
2609 val: crate::support::Aligned256(result),
2610 simd: self,
2611 })
2612 }
2613 }
2614 #[inline(always)]
2615 fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2616 unsafe { _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(self) }
2617 }
2618 #[inline(always)]
2619 fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2620 unsafe { _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(self) }
2621 }
2622 #[inline(always)]
2623 fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2624 unsafe { _mm256_sqrt_ps(a.into()).simd_into(self) }
2625 }
2626 #[inline(always)]
2627 fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2628 unsafe { _mm256_add_ps(a.into(), b.into()).simd_into(self) }
2629 }
2630 #[inline(always)]
2631 fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2632 unsafe { _mm256_sub_ps(a.into(), b.into()).simd_into(self) }
2633 }
2634 #[inline(always)]
2635 fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2636 unsafe { _mm256_mul_ps(a.into(), b.into()).simd_into(self) }
2637 }
2638 #[inline(always)]
2639 fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2640 unsafe { _mm256_div_ps(a.into(), b.into()).simd_into(self) }
2641 }
2642 #[inline(always)]
2643 fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2644 unsafe {
2645 let mask = _mm256_set1_ps(-0.0);
2646 _mm256_or_ps(
2647 _mm256_and_ps(mask, b.into()),
2648 _mm256_andnot_ps(mask, a.into()),
2649 )
2650 .simd_into(self)
2651 }
2652 }
2653 #[inline(always)]
2654 fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2655 unsafe { _mm256_castps_si256(_mm256_cmp_ps::<0i32>(a.into(), b.into())).simd_into(self) }
2656 }
2657 #[inline(always)]
2658 fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2659 unsafe { _mm256_castps_si256(_mm256_cmp_ps::<17i32>(a.into(), b.into())).simd_into(self) }
2660 }
2661 #[inline(always)]
2662 fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2663 unsafe { _mm256_castps_si256(_mm256_cmp_ps::<18i32>(a.into(), b.into())).simd_into(self) }
2664 }
2665 #[inline(always)]
2666 fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2667 unsafe { _mm256_castps_si256(_mm256_cmp_ps::<29i32>(a.into(), b.into())).simd_into(self) }
2668 }
2669 #[inline(always)]
2670 fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
2671 unsafe { _mm256_castps_si256(_mm256_cmp_ps::<30i32>(a.into(), b.into())).simd_into(self) }
2672 }
2673 #[inline(always)]
2674 fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2675 unsafe {
2676 let lo = _mm256_unpacklo_ps(a.into(), b.into());
2677 let hi = _mm256_unpackhi_ps(a.into(), b.into());
2678 _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(self)
2679 }
2680 }
2681 #[inline(always)]
2682 fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2683 unsafe {
2684 let lo = _mm256_unpacklo_ps(a.into(), b.into());
2685 let hi = _mm256_unpackhi_ps(a.into(), b.into());
2686 _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(self)
2687 }
2688 }
2689 #[inline(always)]
2690 fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2691 unsafe {
2692 let t1 = _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
2693 let t2 = _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
2694 _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(self)
2695 }
2696 }
2697 #[inline(always)]
2698 fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2699 unsafe {
2700 let t1 = _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
2701 let t2 = _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
2702 _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(self)
2703 }
2704 }
2705 #[inline(always)]
2706 fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
2707 unsafe {
2708 let lo = _mm256_unpacklo_ps(a.into(), b.into());
2709 let hi = _mm256_unpackhi_ps(a.into(), b.into());
2710 (
2711 _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(self),
2712 _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(self),
2713 )
2714 }
2715 }
2716 #[inline(always)]
2717 fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
2718 unsafe {
2719 let t1 = _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
2720 let t2 = _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
2721 (
2722 _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(self),
2723 _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(self),
2724 )
2725 }
2726 }
2727 #[inline(always)]
2728 fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2729 unsafe { _mm256_max_ps(a.into(), b.into()).simd_into(self) }
2730 }
2731 #[inline(always)]
2732 fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2733 unsafe { _mm256_min_ps(a.into(), b.into()).simd_into(self) }
2734 }
2735 #[inline(always)]
2736 fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2737 unsafe {
2738 let intermediate = _mm256_max_ps(a.into(), b.into());
2739 let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into());
2740 _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self)
2741 }
2742 }
2743 #[inline(always)]
2744 fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
2745 unsafe {
2746 let intermediate = _mm256_min_ps(a.into(), b.into());
2747 let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into());
2748 _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self)
2749 }
2750 }
2751 #[inline(always)]
2752 fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
2753 unsafe { _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) }
2754 }
2755 #[inline(always)]
2756 fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
2757 unsafe { _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) }
2758 }
2759 #[inline(always)]
2760 fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2761 unsafe {
2762 _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
2763 .simd_into(self)
2764 }
2765 }
2766 #[inline(always)]
2767 fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2768 unsafe {
2769 _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
2770 .simd_into(self)
2771 }
2772 }
2773 #[inline(always)]
2774 fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2775 unsafe {
2776 _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
2777 .simd_into(self)
2778 }
2779 }
2780 #[inline(always)]
2781 fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2782 a - self.trunc_f32x8(a)
2783 }
2784 #[inline(always)]
2785 fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
2786 unsafe {
2787 _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
2788 }
2789 }
2790 #[inline(always)]
2791 fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
2792 unsafe {
2793 _mm256_blendv_ps(c.into(), b.into(), _mm256_castsi256_ps(a.into())).simd_into(self)
2794 }
2795 }
2796 #[inline(always)]
2797 fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
2798 f32x16 {
2799 val: crate::support::Aligned512([a.val.0, b.val.0]),
2800 simd: self,
2801 }
2802 }
2803 #[inline(always)]
2804 fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
2805 unsafe {
2806 (
2807 _mm256_extractf128_ps::<0>(a.into()).simd_into(self),
2808 _mm256_extractf128_ps::<1>(a.into()).simd_into(self),
2809 )
2810 }
2811 }
2812 #[inline(always)]
2813 fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
2814 unsafe { _mm256_castps_pd(a.into()).simd_into(self) }
2815 }
2816 #[inline(always)]
2817 fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
2818 unsafe { _mm256_castps_si256(a.into()).simd_into(self) }
2819 }
2820 #[inline(always)]
2821 fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
2822 unsafe { _mm256_castps_si256(a.into()).simd_into(self) }
2823 }
2824 #[inline(always)]
2825 fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
2826 unsafe { _mm256_castps_si256(a.into()).simd_into(self) }
2827 }
2828 #[inline(always)]
2829 fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
2830 unsafe {
2831 let mut converted = _mm256_cvttps_epi32(a.into());
2832 let in_range = _mm256_cmp_ps::<17i32>(a.into(), _mm256_set1_ps(2147483648.0));
2833 let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
2834 if !all_in_range {
2835 let excess = _mm256_sub_ps(a.into(), _mm256_set1_ps(2147483648.0));
2836 let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess));
2837 converted = _mm256_add_epi32(converted, excess_converted);
2838 }
2839 converted.simd_into(self)
2840 }
2841 }
2842 #[inline(always)]
2843 fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
2844 unsafe {
2845 let a = _mm256_max_ps(a.into(), _mm256_setzero_ps());
2846 let mut converted = _mm256_cvttps_epi32(a);
2847 let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0));
2848 let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
2849 if !all_in_range {
2850 let exceeds_unsigned_range =
2851 _mm256_castps_si256(_mm256_cmp_ps::<17i32>(_mm256_set1_ps(4294967040.0), a));
2852 let excess = _mm256_sub_ps(a, _mm256_set1_ps(2147483648.0));
2853 let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess));
2854 converted = _mm256_add_epi32(converted, excess_converted);
2855 converted = _mm256_blendv_epi8(
2856 converted,
2857 _mm256_set1_epi32(u32::MAX.cast_signed()),
2858 exceeds_unsigned_range,
2859 );
2860 }
2861 converted.simd_into(self)
2862 }
2863 }
2864 #[inline(always)]
2865 fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
2866 unsafe { _mm256_cvttps_epi32(a.into()).simd_into(self) }
2867 }
2868 #[inline(always)]
2869 fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
2870 unsafe {
2871 let a = a.into();
2872 let mut converted = _mm256_cvttps_epi32(a);
2873 let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0));
2874 let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
2875 if !all_in_range {
2876 converted = _mm256_blendv_epi8(
2877 _mm256_set1_epi32(i32::MAX),
2878 converted,
2879 _mm256_castps_si256(in_range),
2880 );
2881 let is_not_nan = _mm256_castps_si256(_mm256_cmp_ps::<7i32>(a, a));
2882 converted = _mm256_and_si256(converted, is_not_nan);
2883 }
2884 converted.simd_into(self)
2885 }
2886 }
2887 #[inline(always)]
2888 fn splat_i8x32(self, val: i8) -> i8x32<Self> {
2889 unsafe { _mm256_set1_epi8(val).simd_into(self) }
2890 }
2891 #[inline(always)]
2892 fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
2893 i8x32 {
2894 val: unsafe { core::mem::transmute_copy(&val) },
2895 simd: self,
2896 }
2897 }
2898 #[inline(always)]
2899 fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
2900 i8x32 {
2901 val: unsafe { core::mem::transmute_copy(val) },
2902 simd: self,
2903 }
2904 }
2905 #[inline(always)]
2906 fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
2907 unsafe { core::mem::transmute::<__m256i, [i8; 32usize]>(a.val.0) }
2908 }
2909 #[inline(always)]
2910 fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
2911 unsafe { core::mem::transmute::<&__m256i, &[i8; 32usize]>(&a.val.0) }
2912 }
2913 #[inline(always)]
2914 fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
2915 unsafe { core::mem::transmute::<&mut __m256i, &mut [i8; 32usize]>(&mut a.val.0) }
2916 }
2917 #[inline(always)]
2918 fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
2919 unsafe {
2920 core::ptr::copy_nonoverlapping(
2921 (&raw const a.val.0) as *const i8,
2922 dest.as_mut_ptr(),
2923 32usize,
2924 );
2925 }
2926 }
2927 #[inline(always)]
2928 fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
2929 unsafe {
2930 i8x32 {
2931 val: core::mem::transmute(a.val),
2932 simd: self,
2933 }
2934 }
2935 }
2936 #[inline(always)]
2937 fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
2938 unsafe {
2939 u8x32 {
2940 val: core::mem::transmute(a.val),
2941 simd: self,
2942 }
2943 }
2944 }
2945 #[inline(always)]
2946 fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
2947 unsafe {
2948 if SHIFT >= 32usize {
2949 return b;
2950 }
2951 let result = cross_block_alignr_256x1(
2952 self.cvt_to_bytes_i8x32(b).val.0,
2953 self.cvt_to_bytes_i8x32(a).val.0,
2954 SHIFT,
2955 );
2956 self.cvt_from_bytes_i8x32(u8x32 {
2957 val: crate::support::Aligned256(result),
2958 simd: self,
2959 })
2960 }
2961 }
2962 #[inline(always)]
2963 fn slide_within_blocks_i8x32<const SHIFT: usize>(
2964 self,
2965 a: i8x32<Self>,
2966 b: i8x32<Self>,
2967 ) -> i8x32<Self> {
2968 unsafe {
2969 if SHIFT >= 16usize {
2970 return b;
2971 }
2972 let result = dyn_alignr_256(
2973 self.cvt_to_bytes_i8x32(b).val.0,
2974 self.cvt_to_bytes_i8x32(a).val.0,
2975 SHIFT,
2976 );
2977 self.cvt_from_bytes_i8x32(u8x32 {
2978 val: crate::support::Aligned256(result),
2979 simd: self,
2980 })
2981 }
2982 }
2983 #[inline(always)]
2984 fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
2985 unsafe { _mm256_add_epi8(a.into(), b.into()).simd_into(self) }
2986 }
2987 #[inline(always)]
2988 fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
2989 unsafe { _mm256_sub_epi8(a.into(), b.into()).simd_into(self) }
2990 }
2991 #[inline(always)]
2992 fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
2993 unsafe {
2994 let dst_even = _mm256_mullo_epi16(a.into(), b.into());
2995 let dst_odd = _mm256_mullo_epi16(
2996 _mm256_srli_epi16::<8>(a.into()),
2997 _mm256_srli_epi16::<8>(b.into()),
2998 );
2999 _mm256_or_si256(
3000 _mm256_slli_epi16(dst_odd, 8),
3001 _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)),
3002 )
3003 .simd_into(self)
3004 }
3005 }
3006 #[inline(always)]
3007 fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3008 unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
3009 }
3010 #[inline(always)]
3011 fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3012 unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
3013 }
3014 #[inline(always)]
3015 fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3016 unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
3017 }
3018 #[inline(always)]
3019 fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
3020 a ^ !0
3021 }
3022 #[inline(always)]
3023 fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
3024 unsafe {
3025 let val = a.into();
3026 let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
3027 let lo_16 = _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
3028 let hi_16 = _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
3029 let lo_shifted = _mm256_sll_epi16(lo_16, shift_count);
3030 let hi_shifted = _mm256_sll_epi16(hi_16, shift_count);
3031 _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
3032 }
3033 }
3034 #[inline(always)]
3035 fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3036 core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
3037 }
3038 #[inline(always)]
3039 fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
3040 unsafe {
3041 let val = a.into();
3042 let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
3043 let lo_16 = _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
3044 let hi_16 = _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
3045 let lo_shifted = _mm256_sra_epi16(lo_16, shift_count);
3046 let hi_shifted = _mm256_sra_epi16(hi_16, shift_count);
3047 _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
3048 }
3049 }
3050 #[inline(always)]
3051 fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3052 core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
3053 }
3054 #[inline(always)]
3055 fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3056 unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
3057 }
3058 #[inline(always)]
3059 fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3060 unsafe { _mm256_cmpgt_epi8(b.into(), a.into()).simd_into(self) }
3061 }
3062 #[inline(always)]
3063 fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3064 unsafe { _mm256_cmpeq_epi8(_mm256_min_epi8(a.into(), b.into()), a.into()).simd_into(self) }
3065 }
3066 #[inline(always)]
3067 fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3068 unsafe { _mm256_cmpeq_epi8(_mm256_max_epi8(a.into(), b.into()), a.into()).simd_into(self) }
3069 }
3070 #[inline(always)]
3071 fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
3072 unsafe { _mm256_cmpgt_epi8(a.into(), b.into()).simd_into(self) }
3073 }
3074 #[inline(always)]
3075 fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3076 unsafe {
3077 let lo = _mm256_unpacklo_epi8(a.into(), b.into());
3078 let hi = _mm256_unpackhi_epi8(a.into(), b.into());
3079 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self)
3080 }
3081 }
3082 #[inline(always)]
3083 fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3084 unsafe {
3085 let lo = _mm256_unpacklo_epi8(a.into(), b.into());
3086 let hi = _mm256_unpackhi_epi8(a.into(), b.into());
3087 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self)
3088 }
3089 }
3090 #[inline(always)]
3091 fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3092 unsafe {
3093 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3094 a.into(),
3095 _mm256_setr_epi8(
3096 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3097 14, 1, 3, 5, 7, 9, 11, 13, 15,
3098 ),
3099 ));
3100 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3101 b.into(),
3102 _mm256_setr_epi8(
3103 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3104 14, 1, 3, 5, 7, 9, 11, 13, 15,
3105 ),
3106 ));
3107 _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self)
3108 }
3109 }
3110 #[inline(always)]
3111 fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3112 unsafe {
3113 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3114 a.into(),
3115 _mm256_setr_epi8(
3116 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3117 14, 1, 3, 5, 7, 9, 11, 13, 15,
3118 ),
3119 ));
3120 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3121 b.into(),
3122 _mm256_setr_epi8(
3123 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3124 14, 1, 3, 5, 7, 9, 11, 13, 15,
3125 ),
3126 ));
3127 _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self)
3128 }
3129 }
3130 #[inline(always)]
3131 fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
3132 unsafe {
3133 let lo = _mm256_unpacklo_epi8(a.into(), b.into());
3134 let hi = _mm256_unpackhi_epi8(a.into(), b.into());
3135 (
3136 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self),
3137 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self),
3138 )
3139 }
3140 }
3141 #[inline(always)]
3142 fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
3143 unsafe {
3144 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3145 a.into(),
3146 _mm256_setr_epi8(
3147 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3148 14, 1, 3, 5, 7, 9, 11, 13, 15,
3149 ),
3150 ));
3151 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3152 b.into(),
3153 _mm256_setr_epi8(
3154 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3155 14, 1, 3, 5, 7, 9, 11, 13, 15,
3156 ),
3157 ));
3158 (
3159 _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self),
3160 _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self),
3161 )
3162 }
3163 }
3164 #[inline(always)]
3165 fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
3166 unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
3167 }
3168 #[inline(always)]
3169 fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3170 unsafe { _mm256_min_epi8(a.into(), b.into()).simd_into(self) }
3171 }
3172 #[inline(always)]
3173 fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
3174 unsafe { _mm256_max_epi8(a.into(), b.into()).simd_into(self) }
3175 }
3176 #[inline(always)]
3177 fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
3178 i8x64 {
3179 val: crate::support::Aligned512([a.val.0, b.val.0]),
3180 simd: self,
3181 }
3182 }
3183 #[inline(always)]
3184 fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
3185 unsafe {
3186 (
3187 _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
3188 _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
3189 )
3190 }
3191 }
3192 #[inline(always)]
3193 fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
3194 unsafe { _mm256_sub_epi8(_mm256_setzero_si256(), a.into()).simd_into(self) }
3195 }
3196 #[inline(always)]
3197 fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
3198 __m256i::from(a).simd_into(self)
3199 }
3200 #[inline(always)]
3201 fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
3202 __m256i::from(a).simd_into(self)
3203 }
3204 #[inline(always)]
3205 fn splat_u8x32(self, val: u8) -> u8x32<Self> {
3206 unsafe { _mm256_set1_epi8(val.cast_signed()).simd_into(self) }
3207 }
3208 #[inline(always)]
3209 fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
3210 u8x32 {
3211 val: unsafe { core::mem::transmute_copy(&val) },
3212 simd: self,
3213 }
3214 }
3215 #[inline(always)]
3216 fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
3217 u8x32 {
3218 val: unsafe { core::mem::transmute_copy(val) },
3219 simd: self,
3220 }
3221 }
3222 #[inline(always)]
3223 fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
3224 unsafe { core::mem::transmute::<__m256i, [u8; 32usize]>(a.val.0) }
3225 }
3226 #[inline(always)]
3227 fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
3228 unsafe { core::mem::transmute::<&__m256i, &[u8; 32usize]>(&a.val.0) }
3229 }
3230 #[inline(always)]
3231 fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
3232 unsafe { core::mem::transmute::<&mut __m256i, &mut [u8; 32usize]>(&mut a.val.0) }
3233 }
3234 #[inline(always)]
3235 fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
3236 unsafe {
3237 core::ptr::copy_nonoverlapping(
3238 (&raw const a.val.0) as *const u8,
3239 dest.as_mut_ptr(),
3240 32usize,
3241 );
3242 }
3243 }
3244 #[inline(always)]
3245 fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
3246 unsafe {
3247 u8x32 {
3248 val: core::mem::transmute(a.val),
3249 simd: self,
3250 }
3251 }
3252 }
3253 #[inline(always)]
3254 fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
3255 unsafe {
3256 u8x32 {
3257 val: core::mem::transmute(a.val),
3258 simd: self,
3259 }
3260 }
3261 }
3262 #[inline(always)]
3263 fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3264 unsafe {
3265 if SHIFT >= 32usize {
3266 return b;
3267 }
3268 let result = cross_block_alignr_256x1(
3269 self.cvt_to_bytes_u8x32(b).val.0,
3270 self.cvt_to_bytes_u8x32(a).val.0,
3271 SHIFT,
3272 );
3273 self.cvt_from_bytes_u8x32(u8x32 {
3274 val: crate::support::Aligned256(result),
3275 simd: self,
3276 })
3277 }
3278 }
3279 #[inline(always)]
3280 fn slide_within_blocks_u8x32<const SHIFT: usize>(
3281 self,
3282 a: u8x32<Self>,
3283 b: u8x32<Self>,
3284 ) -> u8x32<Self> {
3285 unsafe {
3286 if SHIFT >= 16usize {
3287 return b;
3288 }
3289 let result = dyn_alignr_256(
3290 self.cvt_to_bytes_u8x32(b).val.0,
3291 self.cvt_to_bytes_u8x32(a).val.0,
3292 SHIFT,
3293 );
3294 self.cvt_from_bytes_u8x32(u8x32 {
3295 val: crate::support::Aligned256(result),
3296 simd: self,
3297 })
3298 }
3299 }
3300 #[inline(always)]
3301 fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3302 unsafe { _mm256_add_epi8(a.into(), b.into()).simd_into(self) }
3303 }
3304 #[inline(always)]
3305 fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3306 unsafe { _mm256_sub_epi8(a.into(), b.into()).simd_into(self) }
3307 }
3308 #[inline(always)]
3309 fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3310 unsafe {
3311 let dst_even = _mm256_mullo_epi16(a.into(), b.into());
3312 let dst_odd = _mm256_mullo_epi16(
3313 _mm256_srli_epi16::<8>(a.into()),
3314 _mm256_srli_epi16::<8>(b.into()),
3315 );
3316 _mm256_or_si256(
3317 _mm256_slli_epi16(dst_odd, 8),
3318 _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)),
3319 )
3320 .simd_into(self)
3321 }
3322 }
3323 #[inline(always)]
3324 fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3325 unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
3326 }
3327 #[inline(always)]
3328 fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3329 unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
3330 }
3331 #[inline(always)]
3332 fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3333 unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
3334 }
3335 #[inline(always)]
3336 fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
3337 a ^ !0
3338 }
3339 #[inline(always)]
3340 fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
3341 unsafe {
3342 let val = a.into();
3343 let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
3344 let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256());
3345 let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256());
3346 let lo_shifted = _mm256_sll_epi16(lo_16, shift_count);
3347 let hi_shifted = _mm256_sll_epi16(hi_16, shift_count);
3348 _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
3349 }
3350 }
3351 #[inline(always)]
3352 fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3353 core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
3354 }
3355 #[inline(always)]
3356 fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
3357 unsafe {
3358 let val = a.into();
3359 let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
3360 let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256());
3361 let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256());
3362 let lo_shifted = _mm256_srl_epi16(lo_16, shift_count);
3363 let hi_shifted = _mm256_srl_epi16(hi_16, shift_count);
3364 _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
3365 }
3366 }
3367 #[inline(always)]
3368 fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3369 core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
3370 }
3371 #[inline(always)]
3372 fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3373 unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
3374 }
3375 #[inline(always)]
3376 fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3377 unsafe {
3378 let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed());
3379 let a_signed = _mm256_xor_si256(a.into(), sign_bit);
3380 let b_signed = _mm256_xor_si256(b.into(), sign_bit);
3381 _mm256_cmpgt_epi8(b_signed, a_signed).simd_into(self)
3382 }
3383 }
3384 #[inline(always)]
3385 fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3386 unsafe { _mm256_cmpeq_epi8(_mm256_min_epu8(a.into(), b.into()), a.into()).simd_into(self) }
3387 }
3388 #[inline(always)]
3389 fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3390 unsafe { _mm256_cmpeq_epi8(_mm256_max_epu8(a.into(), b.into()), a.into()).simd_into(self) }
3391 }
3392 #[inline(always)]
3393 fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
3394 unsafe {
3395 let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed());
3396 let a_signed = _mm256_xor_si256(a.into(), sign_bit);
3397 let b_signed = _mm256_xor_si256(b.into(), sign_bit);
3398 _mm256_cmpgt_epi8(a_signed, b_signed).simd_into(self)
3399 }
3400 }
3401 #[inline(always)]
3402 fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3403 unsafe {
3404 let lo = _mm256_unpacklo_epi8(a.into(), b.into());
3405 let hi = _mm256_unpackhi_epi8(a.into(), b.into());
3406 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self)
3407 }
3408 }
3409 #[inline(always)]
3410 fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3411 unsafe {
3412 let lo = _mm256_unpacklo_epi8(a.into(), b.into());
3413 let hi = _mm256_unpackhi_epi8(a.into(), b.into());
3414 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self)
3415 }
3416 }
3417 #[inline(always)]
3418 fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3419 unsafe {
3420 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3421 a.into(),
3422 _mm256_setr_epi8(
3423 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3424 14, 1, 3, 5, 7, 9, 11, 13, 15,
3425 ),
3426 ));
3427 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3428 b.into(),
3429 _mm256_setr_epi8(
3430 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3431 14, 1, 3, 5, 7, 9, 11, 13, 15,
3432 ),
3433 ));
3434 _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self)
3435 }
3436 }
3437 #[inline(always)]
3438 fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3439 unsafe {
3440 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3441 a.into(),
3442 _mm256_setr_epi8(
3443 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3444 14, 1, 3, 5, 7, 9, 11, 13, 15,
3445 ),
3446 ));
3447 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3448 b.into(),
3449 _mm256_setr_epi8(
3450 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3451 14, 1, 3, 5, 7, 9, 11, 13, 15,
3452 ),
3453 ));
3454 _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self)
3455 }
3456 }
3457 #[inline(always)]
3458 fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
3459 unsafe {
3460 let lo = _mm256_unpacklo_epi8(a.into(), b.into());
3461 let hi = _mm256_unpackhi_epi8(a.into(), b.into());
3462 (
3463 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self),
3464 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self),
3465 )
3466 }
3467 }
3468 #[inline(always)]
3469 fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
3470 unsafe {
3471 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3472 a.into(),
3473 _mm256_setr_epi8(
3474 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3475 14, 1, 3, 5, 7, 9, 11, 13, 15,
3476 ),
3477 ));
3478 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3479 b.into(),
3480 _mm256_setr_epi8(
3481 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12,
3482 14, 1, 3, 5, 7, 9, 11, 13, 15,
3483 ),
3484 ));
3485 (
3486 _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self),
3487 _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self),
3488 )
3489 }
3490 }
3491 #[inline(always)]
3492 fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
3493 unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
3494 }
3495 #[inline(always)]
3496 fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3497 unsafe { _mm256_min_epu8(a.into(), b.into()).simd_into(self) }
3498 }
3499 #[inline(always)]
3500 fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
3501 unsafe { _mm256_max_epu8(a.into(), b.into()).simd_into(self) }
3502 }
3503 #[inline(always)]
3504 fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
3505 u8x64 {
3506 val: crate::support::Aligned512([a.val.0, b.val.0]),
3507 simd: self,
3508 }
3509 }
3510 #[inline(always)]
3511 fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
3512 unsafe {
3513 (
3514 _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
3515 _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
3516 )
3517 }
3518 }
3519 #[inline(always)]
3520 fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
3521 unsafe {
3522 let (a0, a1) = self.split_u8x32(a);
3523 let high = _mm256_cvtepu8_epi16(a0.into()).simd_into(self);
3524 let low = _mm256_cvtepu8_epi16(a1.into()).simd_into(self);
3525 self.combine_u16x16(high, low)
3526 }
3527 }
3528 #[inline(always)]
3529 fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
3530 __m256i::from(a).simd_into(self)
3531 }
3532 #[inline(always)]
3533 fn splat_mask8x32(self, val: i8) -> mask8x32<Self> {
3534 unsafe { _mm256_set1_epi8(val).simd_into(self) }
3535 }
3536 #[inline(always)]
3537 fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
3538 mask8x32 {
3539 val: unsafe { core::mem::transmute_copy(&val) },
3540 simd: self,
3541 }
3542 }
3543 #[inline(always)]
3544 fn load_array_ref_mask8x32(self, val: &[i8; 32usize]) -> mask8x32<Self> {
3545 mask8x32 {
3546 val: unsafe { core::mem::transmute_copy(val) },
3547 simd: self,
3548 }
3549 }
3550 #[inline(always)]
3551 fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
3552 unsafe { core::mem::transmute::<__m256i, [i8; 32usize]>(a.val.0) }
3553 }
3554 #[inline(always)]
3555 fn as_array_ref_mask8x32(self, a: &mask8x32<Self>) -> &[i8; 32usize] {
3556 unsafe { core::mem::transmute::<&__m256i, &[i8; 32usize]>(&a.val.0) }
3557 }
3558 #[inline(always)]
3559 fn as_array_mut_mask8x32(self, a: &mut mask8x32<Self>) -> &mut [i8; 32usize] {
3560 unsafe { core::mem::transmute::<&mut __m256i, &mut [i8; 32usize]>(&mut a.val.0) }
3561 }
3562 #[inline(always)]
3563 fn store_array_mask8x32(self, a: mask8x32<Self>, dest: &mut [i8; 32usize]) -> () {
3564 unsafe {
3565 core::ptr::copy_nonoverlapping(
3566 (&raw const a.val.0) as *const i8,
3567 dest.as_mut_ptr(),
3568 32usize,
3569 );
3570 }
3571 }
3572 #[inline(always)]
3573 fn cvt_from_bytes_mask8x32(self, a: u8x32<Self>) -> mask8x32<Self> {
3574 unsafe {
3575 mask8x32 {
3576 val: core::mem::transmute(a.val),
3577 simd: self,
3578 }
3579 }
3580 }
3581 #[inline(always)]
3582 fn cvt_to_bytes_mask8x32(self, a: mask8x32<Self>) -> u8x32<Self> {
3583 unsafe {
3584 u8x32 {
3585 val: core::mem::transmute(a.val),
3586 simd: self,
3587 }
3588 }
3589 }
3590 #[inline(always)]
3591 fn slide_mask8x32<const SHIFT: usize>(
3592 self,
3593 a: mask8x32<Self>,
3594 b: mask8x32<Self>,
3595 ) -> mask8x32<Self> {
3596 unsafe {
3597 if SHIFT >= 32usize {
3598 return b;
3599 }
3600 let result = cross_block_alignr_256x1(
3601 self.cvt_to_bytes_mask8x32(b).val.0,
3602 self.cvt_to_bytes_mask8x32(a).val.0,
3603 SHIFT,
3604 );
3605 self.cvt_from_bytes_mask8x32(u8x32 {
3606 val: crate::support::Aligned256(result),
3607 simd: self,
3608 })
3609 }
3610 }
3611 #[inline(always)]
3612 fn slide_within_blocks_mask8x32<const SHIFT: usize>(
3613 self,
3614 a: mask8x32<Self>,
3615 b: mask8x32<Self>,
3616 ) -> mask8x32<Self> {
3617 unsafe {
3618 if SHIFT >= 16usize {
3619 return b;
3620 }
3621 let result = dyn_alignr_256(
3622 self.cvt_to_bytes_mask8x32(b).val.0,
3623 self.cvt_to_bytes_mask8x32(a).val.0,
3624 SHIFT,
3625 );
3626 self.cvt_from_bytes_mask8x32(u8x32 {
3627 val: crate::support::Aligned256(result),
3628 simd: self,
3629 })
3630 }
3631 }
3632 #[inline(always)]
3633 fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3634 unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
3635 }
3636 #[inline(always)]
3637 fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3638 unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
3639 }
3640 #[inline(always)]
3641 fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3642 unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
3643 }
    /// Bitwise complement of `a`, written as XOR with `!0` (an all-ones operand).
    ///
    /// Relies on the `^` operator implementation for `mask8x32`, which is defined elsewhere.
    #[inline(always)]
    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
        a ^ !0
    }
3648 #[inline(always)]
3649 fn select_mask8x32(
3650 self,
3651 a: mask8x32<Self>,
3652 b: mask8x32<Self>,
3653 c: mask8x32<Self>,
3654 ) -> mask8x32<Self> {
3655 unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
3656 }
3657 #[inline(always)]
3658 fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
3659 unsafe { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
3660 }
3661 #[inline(always)]
3662 fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
3663 unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0 }
3664 }
3665 #[inline(always)]
3666 fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
3667 unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff }
3668 }
3669 #[inline(always)]
3670 fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
3671 unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff }
3672 }
3673 #[inline(always)]
3674 fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
3675 unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0 }
3676 }
3677 #[inline(always)]
3678 fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
3679 mask8x64 {
3680 val: crate::support::Aligned512([a.val.0, b.val.0]),
3681 simd: self,
3682 }
3683 }
3684 #[inline(always)]
3685 fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
3686 unsafe {
3687 (
3688 _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
3689 _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
3690 )
3691 }
3692 }
3693 #[inline(always)]
3694 fn splat_i16x16(self, val: i16) -> i16x16<Self> {
3695 unsafe { _mm256_set1_epi16(val).simd_into(self) }
3696 }
3697 #[inline(always)]
3698 fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
3699 i16x16 {
3700 val: unsafe { core::mem::transmute_copy(&val) },
3701 simd: self,
3702 }
3703 }
3704 #[inline(always)]
3705 fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
3706 i16x16 {
3707 val: unsafe { core::mem::transmute_copy(val) },
3708 simd: self,
3709 }
3710 }
3711 #[inline(always)]
3712 fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
3713 unsafe { core::mem::transmute::<__m256i, [i16; 16usize]>(a.val.0) }
3714 }
3715 #[inline(always)]
3716 fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
3717 unsafe { core::mem::transmute::<&__m256i, &[i16; 16usize]>(&a.val.0) }
3718 }
3719 #[inline(always)]
3720 fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
3721 unsafe { core::mem::transmute::<&mut __m256i, &mut [i16; 16usize]>(&mut a.val.0) }
3722 }
3723 #[inline(always)]
3724 fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
3725 unsafe {
3726 core::ptr::copy_nonoverlapping(
3727 (&raw const a.val.0) as *const i16,
3728 dest.as_mut_ptr(),
3729 16usize,
3730 );
3731 }
3732 }
3733 #[inline(always)]
3734 fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
3735 unsafe {
3736 i16x16 {
3737 val: core::mem::transmute(a.val),
3738 simd: self,
3739 }
3740 }
3741 }
3742 #[inline(always)]
3743 fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
3744 unsafe {
3745 u8x32 {
3746 val: core::mem::transmute(a.val),
3747 simd: self,
3748 }
3749 }
3750 }
3751 #[inline(always)]
3752 fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3753 unsafe {
3754 if SHIFT >= 16usize {
3755 return b;
3756 }
3757 let result = cross_block_alignr_256x1(
3758 self.cvt_to_bytes_i16x16(b).val.0,
3759 self.cvt_to_bytes_i16x16(a).val.0,
3760 SHIFT * 2usize,
3761 );
3762 self.cvt_from_bytes_i16x16(u8x32 {
3763 val: crate::support::Aligned256(result),
3764 simd: self,
3765 })
3766 }
3767 }
3768 #[inline(always)]
3769 fn slide_within_blocks_i16x16<const SHIFT: usize>(
3770 self,
3771 a: i16x16<Self>,
3772 b: i16x16<Self>,
3773 ) -> i16x16<Self> {
3774 unsafe {
3775 if SHIFT >= 8usize {
3776 return b;
3777 }
3778 let result = dyn_alignr_256(
3779 self.cvt_to_bytes_i16x16(b).val.0,
3780 self.cvt_to_bytes_i16x16(a).val.0,
3781 SHIFT * 2usize,
3782 );
3783 self.cvt_from_bytes_i16x16(u8x32 {
3784 val: crate::support::Aligned256(result),
3785 simd: self,
3786 })
3787 }
3788 }
3789 #[inline(always)]
3790 fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3791 unsafe { _mm256_add_epi16(a.into(), b.into()).simd_into(self) }
3792 }
3793 #[inline(always)]
3794 fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3795 unsafe { _mm256_sub_epi16(a.into(), b.into()).simd_into(self) }
3796 }
3797 #[inline(always)]
3798 fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3799 unsafe { _mm256_mullo_epi16(a.into(), b.into()).simd_into(self) }
3800 }
3801 #[inline(always)]
3802 fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3803 unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
3804 }
3805 #[inline(always)]
3806 fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3807 unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
3808 }
3809 #[inline(always)]
3810 fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3811 unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
3812 }
    /// Bitwise complement of `a`, written as XOR with `!0` (an all-ones operand).
    ///
    /// Relies on the `^` operator implementation for `i16x16`, which is defined elsewhere.
    #[inline(always)]
    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
        a ^ !0
    }
3817 #[inline(always)]
3818 fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
3819 unsafe {
3820 _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
3821 }
3822 }
3823 #[inline(always)]
3824 fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3825 core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
3826 }
3827 #[inline(always)]
3828 fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
3829 unsafe {
3830 _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
3831 }
3832 }
3833 #[inline(always)]
3834 fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3835 core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
3836 }
3837 #[inline(always)]
3838 fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3839 unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
3840 }
3841 #[inline(always)]
3842 fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3843 unsafe { _mm256_cmpgt_epi16(b.into(), a.into()).simd_into(self) }
3844 }
3845 #[inline(always)]
3846 fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3847 unsafe {
3848 _mm256_cmpeq_epi16(_mm256_min_epi16(a.into(), b.into()), a.into()).simd_into(self)
3849 }
3850 }
3851 #[inline(always)]
3852 fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3853 unsafe {
3854 _mm256_cmpeq_epi16(_mm256_max_epi16(a.into(), b.into()), a.into()).simd_into(self)
3855 }
3856 }
3857 #[inline(always)]
3858 fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
3859 unsafe { _mm256_cmpgt_epi16(a.into(), b.into()).simd_into(self) }
3860 }
3861 #[inline(always)]
3862 fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3863 unsafe {
3864 let lo = _mm256_unpacklo_epi16(a.into(), b.into());
3865 let hi = _mm256_unpackhi_epi16(a.into(), b.into());
3866 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self)
3867 }
3868 }
3869 #[inline(always)]
3870 fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3871 unsafe {
3872 let lo = _mm256_unpacklo_epi16(a.into(), b.into());
3873 let hi = _mm256_unpackhi_epi16(a.into(), b.into());
3874 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self)
3875 }
3876 }
3877 #[inline(always)]
3878 fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3879 unsafe {
3880 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3881 a.into(),
3882 _mm256_setr_epi8(
3883 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
3884 2, 3, 6, 7, 10, 11, 14, 15,
3885 ),
3886 ));
3887 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3888 b.into(),
3889 _mm256_setr_epi8(
3890 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
3891 2, 3, 6, 7, 10, 11, 14, 15,
3892 ),
3893 ));
3894 _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self)
3895 }
3896 }
3897 #[inline(always)]
3898 fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3899 unsafe {
3900 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3901 a.into(),
3902 _mm256_setr_epi8(
3903 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
3904 2, 3, 6, 7, 10, 11, 14, 15,
3905 ),
3906 ));
3907 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3908 b.into(),
3909 _mm256_setr_epi8(
3910 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
3911 2, 3, 6, 7, 10, 11, 14, 15,
3912 ),
3913 ));
3914 _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self)
3915 }
3916 }
3917 #[inline(always)]
3918 fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
3919 unsafe {
3920 let lo = _mm256_unpacklo_epi16(a.into(), b.into());
3921 let hi = _mm256_unpackhi_epi16(a.into(), b.into());
3922 (
3923 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self),
3924 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self),
3925 )
3926 }
3927 }
3928 #[inline(always)]
3929 fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
3930 unsafe {
3931 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3932 a.into(),
3933 _mm256_setr_epi8(
3934 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
3935 2, 3, 6, 7, 10, 11, 14, 15,
3936 ),
3937 ));
3938 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
3939 b.into(),
3940 _mm256_setr_epi8(
3941 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
3942 2, 3, 6, 7, 10, 11, 14, 15,
3943 ),
3944 ));
3945 (
3946 _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self),
3947 _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self),
3948 )
3949 }
3950 }
3951 #[inline(always)]
3952 fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
3953 unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
3954 }
3955 #[inline(always)]
3956 fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3957 unsafe { _mm256_min_epi16(a.into(), b.into()).simd_into(self) }
3958 }
3959 #[inline(always)]
3960 fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
3961 unsafe { _mm256_max_epi16(a.into(), b.into()).simd_into(self) }
3962 }
3963 #[inline(always)]
3964 fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
3965 i16x32 {
3966 val: crate::support::Aligned512([a.val.0, b.val.0]),
3967 simd: self,
3968 }
3969 }
3970 #[inline(always)]
3971 fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
3972 unsafe {
3973 (
3974 _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
3975 _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
3976 )
3977 }
3978 }
3979 #[inline(always)]
3980 fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
3981 unsafe { _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(self) }
3982 }
3983 #[inline(always)]
3984 fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
3985 __m256i::from(a).simd_into(self)
3986 }
3987 #[inline(always)]
3988 fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
3989 __m256i::from(a).simd_into(self)
3990 }
3991 #[inline(always)]
3992 fn splat_u16x16(self, val: u16) -> u16x16<Self> {
3993 unsafe { _mm256_set1_epi16(val.cast_signed()).simd_into(self) }
3994 }
3995 #[inline(always)]
3996 fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
3997 u16x16 {
3998 val: unsafe { core::mem::transmute_copy(&val) },
3999 simd: self,
4000 }
4001 }
4002 #[inline(always)]
4003 fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
4004 u16x16 {
4005 val: unsafe { core::mem::transmute_copy(val) },
4006 simd: self,
4007 }
4008 }
4009 #[inline(always)]
4010 fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
4011 unsafe { core::mem::transmute::<__m256i, [u16; 16usize]>(a.val.0) }
4012 }
4013 #[inline(always)]
4014 fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
4015 unsafe { core::mem::transmute::<&__m256i, &[u16; 16usize]>(&a.val.0) }
4016 }
4017 #[inline(always)]
4018 fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
4019 unsafe { core::mem::transmute::<&mut __m256i, &mut [u16; 16usize]>(&mut a.val.0) }
4020 }
4021 #[inline(always)]
4022 fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
4023 unsafe {
4024 core::ptr::copy_nonoverlapping(
4025 (&raw const a.val.0) as *const u16,
4026 dest.as_mut_ptr(),
4027 16usize,
4028 );
4029 }
4030 }
4031 #[inline(always)]
4032 fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
4033 unsafe {
4034 u16x16 {
4035 val: core::mem::transmute(a.val),
4036 simd: self,
4037 }
4038 }
4039 }
4040 #[inline(always)]
4041 fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
4042 unsafe {
4043 u8x32 {
4044 val: core::mem::transmute(a.val),
4045 simd: self,
4046 }
4047 }
4048 }
4049 #[inline(always)]
4050 fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4051 unsafe {
4052 if SHIFT >= 16usize {
4053 return b;
4054 }
4055 let result = cross_block_alignr_256x1(
4056 self.cvt_to_bytes_u16x16(b).val.0,
4057 self.cvt_to_bytes_u16x16(a).val.0,
4058 SHIFT * 2usize,
4059 );
4060 self.cvt_from_bytes_u16x16(u8x32 {
4061 val: crate::support::Aligned256(result),
4062 simd: self,
4063 })
4064 }
4065 }
4066 #[inline(always)]
4067 fn slide_within_blocks_u16x16<const SHIFT: usize>(
4068 self,
4069 a: u16x16<Self>,
4070 b: u16x16<Self>,
4071 ) -> u16x16<Self> {
4072 unsafe {
4073 if SHIFT >= 8usize {
4074 return b;
4075 }
4076 let result = dyn_alignr_256(
4077 self.cvt_to_bytes_u16x16(b).val.0,
4078 self.cvt_to_bytes_u16x16(a).val.0,
4079 SHIFT * 2usize,
4080 );
4081 self.cvt_from_bytes_u16x16(u8x32 {
4082 val: crate::support::Aligned256(result),
4083 simd: self,
4084 })
4085 }
4086 }
4087 #[inline(always)]
4088 fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4089 unsafe { _mm256_add_epi16(a.into(), b.into()).simd_into(self) }
4090 }
4091 #[inline(always)]
4092 fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4093 unsafe { _mm256_sub_epi16(a.into(), b.into()).simd_into(self) }
4094 }
4095 #[inline(always)]
4096 fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4097 unsafe { _mm256_mullo_epi16(a.into(), b.into()).simd_into(self) }
4098 }
4099 #[inline(always)]
4100 fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4101 unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
4102 }
4103 #[inline(always)]
4104 fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4105 unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
4106 }
4107 #[inline(always)]
4108 fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4109 unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
4110 }
    /// Bitwise complement of `a`, written as XOR with `!0` (an all-ones operand).
    ///
    /// Relies on the `^` operator implementation for `u16x16`, which is defined elsewhere.
    #[inline(always)]
    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
        a ^ !0
    }
4115 #[inline(always)]
4116 fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
4117 unsafe {
4118 _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
4119 }
4120 }
4121 #[inline(always)]
4122 fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4123 core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
4124 }
4125 #[inline(always)]
4126 fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
4127 unsafe {
4128 _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
4129 }
4130 }
4131 #[inline(always)]
4132 fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4133 core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
4134 }
4135 #[inline(always)]
4136 fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4137 unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
4138 }
4139 #[inline(always)]
4140 fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4141 unsafe {
4142 let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed());
4143 let a_signed = _mm256_xor_si256(a.into(), sign_bit);
4144 let b_signed = _mm256_xor_si256(b.into(), sign_bit);
4145 _mm256_cmpgt_epi16(b_signed, a_signed).simd_into(self)
4146 }
4147 }
4148 #[inline(always)]
4149 fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4150 unsafe {
4151 _mm256_cmpeq_epi16(_mm256_min_epu16(a.into(), b.into()), a.into()).simd_into(self)
4152 }
4153 }
4154 #[inline(always)]
4155 fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4156 unsafe {
4157 _mm256_cmpeq_epi16(_mm256_max_epu16(a.into(), b.into()), a.into()).simd_into(self)
4158 }
4159 }
4160 #[inline(always)]
4161 fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
4162 unsafe {
4163 let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed());
4164 let a_signed = _mm256_xor_si256(a.into(), sign_bit);
4165 let b_signed = _mm256_xor_si256(b.into(), sign_bit);
4166 _mm256_cmpgt_epi16(a_signed, b_signed).simd_into(self)
4167 }
4168 }
4169 #[inline(always)]
4170 fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4171 unsafe {
4172 let lo = _mm256_unpacklo_epi16(a.into(), b.into());
4173 let hi = _mm256_unpackhi_epi16(a.into(), b.into());
4174 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self)
4175 }
4176 }
4177 #[inline(always)]
4178 fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4179 unsafe {
4180 let lo = _mm256_unpacklo_epi16(a.into(), b.into());
4181 let hi = _mm256_unpackhi_epi16(a.into(), b.into());
4182 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self)
4183 }
4184 }
4185 #[inline(always)]
4186 fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4187 unsafe {
4188 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
4189 a.into(),
4190 _mm256_setr_epi8(
4191 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
4192 2, 3, 6, 7, 10, 11, 14, 15,
4193 ),
4194 ));
4195 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
4196 b.into(),
4197 _mm256_setr_epi8(
4198 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
4199 2, 3, 6, 7, 10, 11, 14, 15,
4200 ),
4201 ));
4202 _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self)
4203 }
4204 }
4205 #[inline(always)]
4206 fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4207 unsafe {
4208 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
4209 a.into(),
4210 _mm256_setr_epi8(
4211 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
4212 2, 3, 6, 7, 10, 11, 14, 15,
4213 ),
4214 ));
4215 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
4216 b.into(),
4217 _mm256_setr_epi8(
4218 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
4219 2, 3, 6, 7, 10, 11, 14, 15,
4220 ),
4221 ));
4222 _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self)
4223 }
4224 }
4225 #[inline(always)]
4226 fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
4227 unsafe {
4228 let lo = _mm256_unpacklo_epi16(a.into(), b.into());
4229 let hi = _mm256_unpackhi_epi16(a.into(), b.into());
4230 (
4231 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self),
4232 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self),
4233 )
4234 }
4235 }
4236 #[inline(always)]
4237 fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
4238 unsafe {
4239 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
4240 a.into(),
4241 _mm256_setr_epi8(
4242 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
4243 2, 3, 6, 7, 10, 11, 14, 15,
4244 ),
4245 ));
4246 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
4247 b.into(),
4248 _mm256_setr_epi8(
4249 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13,
4250 2, 3, 6, 7, 10, 11, 14, 15,
4251 ),
4252 ));
4253 (
4254 _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self),
4255 _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self),
4256 )
4257 }
4258 }
4259 #[inline(always)]
4260 fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
4261 unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
4262 }
4263 #[inline(always)]
4264 fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4265 unsafe { _mm256_min_epu16(a.into(), b.into()).simd_into(self) }
4266 }
4267 #[inline(always)]
4268 fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
4269 unsafe { _mm256_max_epu16(a.into(), b.into()).simd_into(self) }
4270 }
4271 #[inline(always)]
4272 fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
4273 u16x32 {
4274 val: crate::support::Aligned512([a.val.0, b.val.0]),
4275 simd: self,
4276 }
4277 }
4278 #[inline(always)]
4279 fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
4280 unsafe {
4281 (
4282 _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
4283 _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
4284 )
4285 }
4286 }
4287 #[inline(always)]
4288 fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
4289 unsafe {
4290 let mask = _mm256_setr_epi8(
4291 0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, 12,
4292 14, -1, -1, -1, -1, -1, -1, -1, -1,
4293 );
4294 let shuffled = _mm256_shuffle_epi8(a.into(), mask);
4295 let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(shuffled);
4296 _mm256_castsi256_si128(packed).simd_into(self)
4297 }
4298 }
4299 #[inline(always)]
4300 fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
4301 __m256i::from(a).simd_into(self)
4302 }
4303 #[inline(always)]
4304 fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
4305 __m256i::from(a).simd_into(self)
4306 }
4307 #[inline(always)]
4308 fn splat_mask16x16(self, val: i16) -> mask16x16<Self> {
4309 unsafe { _mm256_set1_epi16(val).simd_into(self) }
4310 }
4311 #[inline(always)]
4312 fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
4313 mask16x16 {
4314 val: unsafe { core::mem::transmute_copy(&val) },
4315 simd: self,
4316 }
4317 }
4318 #[inline(always)]
4319 fn load_array_ref_mask16x16(self, val: &[i16; 16usize]) -> mask16x16<Self> {
4320 mask16x16 {
4321 val: unsafe { core::mem::transmute_copy(val) },
4322 simd: self,
4323 }
4324 }
4325 #[inline(always)]
4326 fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
4327 unsafe { core::mem::transmute::<__m256i, [i16; 16usize]>(a.val.0) }
4328 }
4329 #[inline(always)]
4330 fn as_array_ref_mask16x16(self, a: &mask16x16<Self>) -> &[i16; 16usize] {
4331 unsafe { core::mem::transmute::<&__m256i, &[i16; 16usize]>(&a.val.0) }
4332 }
4333 #[inline(always)]
4334 fn as_array_mut_mask16x16(self, a: &mut mask16x16<Self>) -> &mut [i16; 16usize] {
4335 unsafe { core::mem::transmute::<&mut __m256i, &mut [i16; 16usize]>(&mut a.val.0) }
4336 }
4337 #[inline(always)]
4338 fn store_array_mask16x16(self, a: mask16x16<Self>, dest: &mut [i16; 16usize]) -> () {
4339 unsafe {
4340 core::ptr::copy_nonoverlapping(
4341 (&raw const a.val.0) as *const i16,
4342 dest.as_mut_ptr(),
4343 16usize,
4344 );
4345 }
4346 }
4347 #[inline(always)]
4348 fn cvt_from_bytes_mask16x16(self, a: u8x32<Self>) -> mask16x16<Self> {
4349 unsafe {
4350 mask16x16 {
4351 val: core::mem::transmute(a.val),
4352 simd: self,
4353 }
4354 }
4355 }
4356 #[inline(always)]
4357 fn cvt_to_bytes_mask16x16(self, a: mask16x16<Self>) -> u8x32<Self> {
4358 unsafe {
4359 u8x32 {
4360 val: core::mem::transmute(a.val),
4361 simd: self,
4362 }
4363 }
4364 }
4365 #[inline(always)]
4366 fn slide_mask16x16<const SHIFT: usize>(
4367 self,
4368 a: mask16x16<Self>,
4369 b: mask16x16<Self>,
4370 ) -> mask16x16<Self> {
4371 unsafe {
4372 if SHIFT >= 16usize {
4373 return b;
4374 }
4375 let result = cross_block_alignr_256x1(
4376 self.cvt_to_bytes_mask16x16(b).val.0,
4377 self.cvt_to_bytes_mask16x16(a).val.0,
4378 SHIFT * 2usize,
4379 );
4380 self.cvt_from_bytes_mask16x16(u8x32 {
4381 val: crate::support::Aligned256(result),
4382 simd: self,
4383 })
4384 }
4385 }
4386 #[inline(always)]
4387 fn slide_within_blocks_mask16x16<const SHIFT: usize>(
4388 self,
4389 a: mask16x16<Self>,
4390 b: mask16x16<Self>,
4391 ) -> mask16x16<Self> {
4392 unsafe {
4393 if SHIFT >= 8usize {
4394 return b;
4395 }
4396 let result = dyn_alignr_256(
4397 self.cvt_to_bytes_mask16x16(b).val.0,
4398 self.cvt_to_bytes_mask16x16(a).val.0,
4399 SHIFT * 2usize,
4400 );
4401 self.cvt_from_bytes_mask16x16(u8x32 {
4402 val: crate::support::Aligned256(result),
4403 simd: self,
4404 })
4405 }
4406 }
4407 #[inline(always)]
4408 fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4409 unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
4410 }
4411 #[inline(always)]
4412 fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4413 unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
4414 }
4415 #[inline(always)]
4416 fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4417 unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
4418 }
    /// Bitwise complement of `a`, written as XOR with `!0` (an all-ones operand).
    ///
    /// Relies on the `^` operator implementation for `mask16x16`, which is defined elsewhere.
    #[inline(always)]
    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
        a ^ !0
    }
4423 #[inline(always)]
4424 fn select_mask16x16(
4425 self,
4426 a: mask16x16<Self>,
4427 b: mask16x16<Self>,
4428 c: mask16x16<Self>,
4429 ) -> mask16x16<Self> {
4430 unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
4431 }
4432 #[inline(always)]
4433 fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
4434 unsafe { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
4435 }
4436 #[inline(always)]
4437 fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
4438 unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0 }
4439 }
4440 #[inline(always)]
4441 fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
4442 unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff }
4443 }
4444 #[inline(always)]
4445 fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
4446 unsafe { _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff }
4447 }
4448 #[inline(always)]
4449 fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
4450 unsafe { _mm256_movemask_epi8(a.into()) as u32 == 0 }
4451 }
4452 #[inline(always)]
4453 fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
4454 mask16x32 {
4455 val: crate::support::Aligned512([a.val.0, b.val.0]),
4456 simd: self,
4457 }
4458 }
4459 #[inline(always)]
4460 fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
4461 unsafe {
4462 (
4463 _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
4464 _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
4465 )
4466 }
4467 }
4468 #[inline(always)]
4469 fn splat_i32x8(self, val: i32) -> i32x8<Self> {
4470 unsafe { _mm256_set1_epi32(val).simd_into(self) }
4471 }
4472 #[inline(always)]
4473 fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
4474 i32x8 {
4475 val: unsafe { core::mem::transmute_copy(&val) },
4476 simd: self,
4477 }
4478 }
4479 #[inline(always)]
4480 fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
4481 i32x8 {
4482 val: unsafe { core::mem::transmute_copy(val) },
4483 simd: self,
4484 }
4485 }
4486 #[inline(always)]
4487 fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
4488 unsafe { core::mem::transmute::<__m256i, [i32; 8usize]>(a.val.0) }
4489 }
4490 #[inline(always)]
4491 fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
4492 unsafe { core::mem::transmute::<&__m256i, &[i32; 8usize]>(&a.val.0) }
4493 }
4494 #[inline(always)]
4495 fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
4496 unsafe { core::mem::transmute::<&mut __m256i, &mut [i32; 8usize]>(&mut a.val.0) }
4497 }
4498 #[inline(always)]
4499 fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
4500 unsafe {
4501 core::ptr::copy_nonoverlapping(
4502 (&raw const a.val.0) as *const i32,
4503 dest.as_mut_ptr(),
4504 8usize,
4505 );
4506 }
4507 }
4508 #[inline(always)]
4509 fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
4510 unsafe {
4511 i32x8 {
4512 val: core::mem::transmute(a.val),
4513 simd: self,
4514 }
4515 }
4516 }
4517 #[inline(always)]
4518 fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
4519 unsafe {
4520 u8x32 {
4521 val: core::mem::transmute(a.val),
4522 simd: self,
4523 }
4524 }
4525 }
4526 #[inline(always)]
4527 fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4528 unsafe {
4529 if SHIFT >= 8usize {
4530 return b;
4531 }
4532 let result = cross_block_alignr_256x1(
4533 self.cvt_to_bytes_i32x8(b).val.0,
4534 self.cvt_to_bytes_i32x8(a).val.0,
4535 SHIFT * 4usize,
4536 );
4537 self.cvt_from_bytes_i32x8(u8x32 {
4538 val: crate::support::Aligned256(result),
4539 simd: self,
4540 })
4541 }
4542 }
4543 #[inline(always)]
4544 fn slide_within_blocks_i32x8<const SHIFT: usize>(
4545 self,
4546 a: i32x8<Self>,
4547 b: i32x8<Self>,
4548 ) -> i32x8<Self> {
4549 unsafe {
4550 if SHIFT >= 4usize {
4551 return b;
4552 }
4553 let result = dyn_alignr_256(
4554 self.cvt_to_bytes_i32x8(b).val.0,
4555 self.cvt_to_bytes_i32x8(a).val.0,
4556 SHIFT * 4usize,
4557 );
4558 self.cvt_from_bytes_i32x8(u8x32 {
4559 val: crate::support::Aligned256(result),
4560 simd: self,
4561 })
4562 }
4563 }
4564 #[inline(always)]
4565 fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4566 unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) }
4567 }
4568 #[inline(always)]
4569 fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4570 unsafe { _mm256_sub_epi32(a.into(), b.into()).simd_into(self) }
4571 }
4572 #[inline(always)]
4573 fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4574 unsafe { _mm256_mullo_epi32(a.into(), b.into()).simd_into(self) }
4575 }
4576 #[inline(always)]
4577 fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4578 unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
4579 }
4580 #[inline(always)]
4581 fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4582 unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
4583 }
4584 #[inline(always)]
4585 fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4586 unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
4587 }
4588 #[inline(always)]
4589 fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
4590 a ^ !0
4591 }
4592 #[inline(always)]
4593 fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
4594 unsafe {
4595 _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
4596 }
4597 }
4598 #[inline(always)]
4599 fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4600 unsafe { _mm256_sllv_epi32(a.into(), b.into()).simd_into(self) }
4601 }
4602 #[inline(always)]
4603 fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
4604 unsafe {
4605 _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
4606 }
4607 }
4608 #[inline(always)]
4609 fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4610 unsafe { _mm256_srav_epi32(a.into(), b.into()).simd_into(self) }
4611 }
4612 #[inline(always)]
4613 fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4614 unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
4615 }
4616 #[inline(always)]
4617 fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4618 unsafe { _mm256_cmpgt_epi32(b.into(), a.into()).simd_into(self) }
4619 }
4620 #[inline(always)]
4621 fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4622 unsafe {
4623 _mm256_cmpeq_epi32(_mm256_min_epi32(a.into(), b.into()), a.into()).simd_into(self)
4624 }
4625 }
4626 #[inline(always)]
4627 fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4628 unsafe {
4629 _mm256_cmpeq_epi32(_mm256_max_epi32(a.into(), b.into()), a.into()).simd_into(self)
4630 }
4631 }
4632 #[inline(always)]
4633 fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
4634 unsafe { _mm256_cmpgt_epi32(a.into(), b.into()).simd_into(self) }
4635 }
4636 #[inline(always)]
4637 fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4638 unsafe {
4639 let lo = _mm256_unpacklo_epi32(a.into(), b.into());
4640 let hi = _mm256_unpackhi_epi32(a.into(), b.into());
4641 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self)
4642 }
4643 }
4644 #[inline(always)]
4645 fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4646 unsafe {
4647 let lo = _mm256_unpacklo_epi32(a.into(), b.into());
4648 let hi = _mm256_unpackhi_epi32(a.into(), b.into());
4649 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self)
4650 }
4651 }
4652 #[inline(always)]
4653 fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4654 unsafe {
4655 let t1 =
4656 _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4657 let t2 =
4658 _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4659 _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self)
4660 }
4661 }
4662 #[inline(always)]
4663 fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4664 unsafe {
4665 let t1 =
4666 _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4667 let t2 =
4668 _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4669 _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self)
4670 }
4671 }
4672 #[inline(always)]
4673 fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
4674 unsafe {
4675 let lo = _mm256_unpacklo_epi32(a.into(), b.into());
4676 let hi = _mm256_unpackhi_epi32(a.into(), b.into());
4677 (
4678 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self),
4679 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self),
4680 )
4681 }
4682 }
4683 #[inline(always)]
4684 fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
4685 unsafe {
4686 let t1 =
4687 _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4688 let t2 =
4689 _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4690 (
4691 _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self),
4692 _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self),
4693 )
4694 }
4695 }
4696 #[inline(always)]
4697 fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
4698 unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
4699 }
4700 #[inline(always)]
4701 fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4702 unsafe { _mm256_min_epi32(a.into(), b.into()).simd_into(self) }
4703 }
4704 #[inline(always)]
4705 fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
4706 unsafe { _mm256_max_epi32(a.into(), b.into()).simd_into(self) }
4707 }
4708 #[inline(always)]
4709 fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
4710 i32x16 {
4711 val: crate::support::Aligned512([a.val.0, b.val.0]),
4712 simd: self,
4713 }
4714 }
4715 #[inline(always)]
4716 fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
4717 unsafe {
4718 (
4719 _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
4720 _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
4721 )
4722 }
4723 }
4724 #[inline(always)]
4725 fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
4726 unsafe { _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(self) }
4727 }
4728 #[inline(always)]
4729 fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
4730 __m256i::from(a).simd_into(self)
4731 }
4732 #[inline(always)]
4733 fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
4734 __m256i::from(a).simd_into(self)
4735 }
4736 #[inline(always)]
4737 fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
4738 unsafe { _mm256_cvtepi32_ps(a.into()).simd_into(self) }
4739 }
4740 #[inline(always)]
4741 fn splat_u32x8(self, val: u32) -> u32x8<Self> {
4742 unsafe { _mm256_set1_epi32(val.cast_signed()).simd_into(self) }
4743 }
4744 #[inline(always)]
4745 fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
4746 u32x8 {
4747 val: unsafe { core::mem::transmute_copy(&val) },
4748 simd: self,
4749 }
4750 }
4751 #[inline(always)]
4752 fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
4753 u32x8 {
4754 val: unsafe { core::mem::transmute_copy(val) },
4755 simd: self,
4756 }
4757 }
4758 #[inline(always)]
4759 fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
4760 unsafe { core::mem::transmute::<__m256i, [u32; 8usize]>(a.val.0) }
4761 }
4762 #[inline(always)]
4763 fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
4764 unsafe { core::mem::transmute::<&__m256i, &[u32; 8usize]>(&a.val.0) }
4765 }
4766 #[inline(always)]
4767 fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
4768 unsafe { core::mem::transmute::<&mut __m256i, &mut [u32; 8usize]>(&mut a.val.0) }
4769 }
4770 #[inline(always)]
4771 fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
4772 unsafe {
4773 core::ptr::copy_nonoverlapping(
4774 (&raw const a.val.0) as *const u32,
4775 dest.as_mut_ptr(),
4776 8usize,
4777 );
4778 }
4779 }
4780 #[inline(always)]
4781 fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
4782 unsafe {
4783 u32x8 {
4784 val: core::mem::transmute(a.val),
4785 simd: self,
4786 }
4787 }
4788 }
4789 #[inline(always)]
4790 fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
4791 unsafe {
4792 u8x32 {
4793 val: core::mem::transmute(a.val),
4794 simd: self,
4795 }
4796 }
4797 }
4798 #[inline(always)]
4799 fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4800 unsafe {
4801 if SHIFT >= 8usize {
4802 return b;
4803 }
4804 let result = cross_block_alignr_256x1(
4805 self.cvt_to_bytes_u32x8(b).val.0,
4806 self.cvt_to_bytes_u32x8(a).val.0,
4807 SHIFT * 4usize,
4808 );
4809 self.cvt_from_bytes_u32x8(u8x32 {
4810 val: crate::support::Aligned256(result),
4811 simd: self,
4812 })
4813 }
4814 }
4815 #[inline(always)]
4816 fn slide_within_blocks_u32x8<const SHIFT: usize>(
4817 self,
4818 a: u32x8<Self>,
4819 b: u32x8<Self>,
4820 ) -> u32x8<Self> {
4821 unsafe {
4822 if SHIFT >= 4usize {
4823 return b;
4824 }
4825 let result = dyn_alignr_256(
4826 self.cvt_to_bytes_u32x8(b).val.0,
4827 self.cvt_to_bytes_u32x8(a).val.0,
4828 SHIFT * 4usize,
4829 );
4830 self.cvt_from_bytes_u32x8(u8x32 {
4831 val: crate::support::Aligned256(result),
4832 simd: self,
4833 })
4834 }
4835 }
4836 #[inline(always)]
4837 fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4838 unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) }
4839 }
4840 #[inline(always)]
4841 fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4842 unsafe { _mm256_sub_epi32(a.into(), b.into()).simd_into(self) }
4843 }
4844 #[inline(always)]
4845 fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4846 unsafe { _mm256_mullo_epi32(a.into(), b.into()).simd_into(self) }
4847 }
4848 #[inline(always)]
4849 fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4850 unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
4851 }
4852 #[inline(always)]
4853 fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4854 unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
4855 }
4856 #[inline(always)]
4857 fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4858 unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
4859 }
4860 #[inline(always)]
4861 fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
4862 a ^ !0
4863 }
4864 #[inline(always)]
4865 fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
4866 unsafe {
4867 _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
4868 }
4869 }
4870 #[inline(always)]
4871 fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4872 unsafe { _mm256_sllv_epi32(a.into(), b.into()).simd_into(self) }
4873 }
4874 #[inline(always)]
4875 fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
4876 unsafe {
4877 _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self)
4878 }
4879 }
4880 #[inline(always)]
4881 fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4882 unsafe { _mm256_srlv_epi32(a.into(), b.into()).simd_into(self) }
4883 }
4884 #[inline(always)]
4885 fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4886 unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
4887 }
4888 #[inline(always)]
4889 fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4890 unsafe {
4891 let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed());
4892 let a_signed = _mm256_xor_si256(a.into(), sign_bit);
4893 let b_signed = _mm256_xor_si256(b.into(), sign_bit);
4894 _mm256_cmpgt_epi32(b_signed, a_signed).simd_into(self)
4895 }
4896 }
4897 #[inline(always)]
4898 fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4899 unsafe {
4900 _mm256_cmpeq_epi32(_mm256_min_epu32(a.into(), b.into()), a.into()).simd_into(self)
4901 }
4902 }
4903 #[inline(always)]
4904 fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4905 unsafe {
4906 _mm256_cmpeq_epi32(_mm256_max_epu32(a.into(), b.into()), a.into()).simd_into(self)
4907 }
4908 }
4909 #[inline(always)]
4910 fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
4911 unsafe {
4912 let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed());
4913 let a_signed = _mm256_xor_si256(a.into(), sign_bit);
4914 let b_signed = _mm256_xor_si256(b.into(), sign_bit);
4915 _mm256_cmpgt_epi32(a_signed, b_signed).simd_into(self)
4916 }
4917 }
4918 #[inline(always)]
4919 fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4920 unsafe {
4921 let lo = _mm256_unpacklo_epi32(a.into(), b.into());
4922 let hi = _mm256_unpackhi_epi32(a.into(), b.into());
4923 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self)
4924 }
4925 }
4926 #[inline(always)]
4927 fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4928 unsafe {
4929 let lo = _mm256_unpacklo_epi32(a.into(), b.into());
4930 let hi = _mm256_unpackhi_epi32(a.into(), b.into());
4931 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self)
4932 }
4933 }
4934 #[inline(always)]
4935 fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4936 unsafe {
4937 let t1 =
4938 _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4939 let t2 =
4940 _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4941 _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self)
4942 }
4943 }
4944 #[inline(always)]
4945 fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4946 unsafe {
4947 let t1 =
4948 _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4949 let t2 =
4950 _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4951 _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self)
4952 }
4953 }
4954 #[inline(always)]
4955 fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
4956 unsafe {
4957 let lo = _mm256_unpacklo_epi32(a.into(), b.into());
4958 let hi = _mm256_unpackhi_epi32(a.into(), b.into());
4959 (
4960 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(self),
4961 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(self),
4962 )
4963 }
4964 }
4965 #[inline(always)]
4966 fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
4967 unsafe {
4968 let t1 =
4969 _mm256_permutevar8x32_epi32(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4970 let t2 =
4971 _mm256_permutevar8x32_epi32(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
4972 (
4973 _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(self),
4974 _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(self),
4975 )
4976 }
4977 }
4978 #[inline(always)]
4979 fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
4980 unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
4981 }
4982 #[inline(always)]
4983 fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4984 unsafe { _mm256_min_epu32(a.into(), b.into()).simd_into(self) }
4985 }
4986 #[inline(always)]
4987 fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
4988 unsafe { _mm256_max_epu32(a.into(), b.into()).simd_into(self) }
4989 }
4990 #[inline(always)]
4991 fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
4992 u32x16 {
4993 val: crate::support::Aligned512([a.val.0, b.val.0]),
4994 simd: self,
4995 }
4996 }
4997 #[inline(always)]
4998 fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
4999 unsafe {
5000 (
5001 _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
5002 _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
5003 )
5004 }
5005 }
5006 #[inline(always)]
5007 fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
5008 __m256i::from(a).simd_into(self)
5009 }
5010 #[inline(always)]
5011 fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
5012 unsafe {
5013 let a = a.into();
5014 let lo = _mm256_blend_epi16::<0xAA>(a, _mm256_set1_epi32(0x4B000000));
5015 let hi = _mm256_blend_epi16::<0xAA>(
5016 _mm256_srli_epi32::<16>(a),
5017 _mm256_set1_epi32(0x53000000),
5018 );
5019 let fhi = _mm256_sub_ps(
5020 _mm256_castsi256_ps(hi),
5021 _mm256_set1_ps(f32::from_bits(0x53000080)),
5022 );
5023 let result = _mm256_add_ps(_mm256_castsi256_ps(lo), fhi);
5024 result.simd_into(self)
5025 }
5026 }
5027 #[inline(always)]
5028 fn splat_mask32x8(self, val: i32) -> mask32x8<Self> {
5029 unsafe { _mm256_set1_epi32(val).simd_into(self) }
5030 }
5031 #[inline(always)]
5032 fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
5033 mask32x8 {
5034 val: unsafe { core::mem::transmute_copy(&val) },
5035 simd: self,
5036 }
5037 }
5038 #[inline(always)]
5039 fn load_array_ref_mask32x8(self, val: &[i32; 8usize]) -> mask32x8<Self> {
5040 mask32x8 {
5041 val: unsafe { core::mem::transmute_copy(val) },
5042 simd: self,
5043 }
5044 }
5045 #[inline(always)]
5046 fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
5047 unsafe { core::mem::transmute::<__m256i, [i32; 8usize]>(a.val.0) }
5048 }
5049 #[inline(always)]
5050 fn as_array_ref_mask32x8(self, a: &mask32x8<Self>) -> &[i32; 8usize] {
5051 unsafe { core::mem::transmute::<&__m256i, &[i32; 8usize]>(&a.val.0) }
5052 }
5053 #[inline(always)]
5054 fn as_array_mut_mask32x8(self, a: &mut mask32x8<Self>) -> &mut [i32; 8usize] {
5055 unsafe { core::mem::transmute::<&mut __m256i, &mut [i32; 8usize]>(&mut a.val.0) }
5056 }
5057 #[inline(always)]
5058 fn store_array_mask32x8(self, a: mask32x8<Self>, dest: &mut [i32; 8usize]) -> () {
5059 unsafe {
5060 core::ptr::copy_nonoverlapping(
5061 (&raw const a.val.0) as *const i32,
5062 dest.as_mut_ptr(),
5063 8usize,
5064 );
5065 }
5066 }
5067 #[inline(always)]
5068 fn cvt_from_bytes_mask32x8(self, a: u8x32<Self>) -> mask32x8<Self> {
5069 unsafe {
5070 mask32x8 {
5071 val: core::mem::transmute(a.val),
5072 simd: self,
5073 }
5074 }
5075 }
5076 #[inline(always)]
5077 fn cvt_to_bytes_mask32x8(self, a: mask32x8<Self>) -> u8x32<Self> {
5078 unsafe {
5079 u8x32 {
5080 val: core::mem::transmute(a.val),
5081 simd: self,
5082 }
5083 }
5084 }
5085 #[inline(always)]
5086 fn slide_mask32x8<const SHIFT: usize>(
5087 self,
5088 a: mask32x8<Self>,
5089 b: mask32x8<Self>,
5090 ) -> mask32x8<Self> {
5091 unsafe {
5092 if SHIFT >= 8usize {
5093 return b;
5094 }
5095 let result = cross_block_alignr_256x1(
5096 self.cvt_to_bytes_mask32x8(b).val.0,
5097 self.cvt_to_bytes_mask32x8(a).val.0,
5098 SHIFT * 4usize,
5099 );
5100 self.cvt_from_bytes_mask32x8(u8x32 {
5101 val: crate::support::Aligned256(result),
5102 simd: self,
5103 })
5104 }
5105 }
5106 #[inline(always)]
5107 fn slide_within_blocks_mask32x8<const SHIFT: usize>(
5108 self,
5109 a: mask32x8<Self>,
5110 b: mask32x8<Self>,
5111 ) -> mask32x8<Self> {
5112 unsafe {
5113 if SHIFT >= 4usize {
5114 return b;
5115 }
5116 let result = dyn_alignr_256(
5117 self.cvt_to_bytes_mask32x8(b).val.0,
5118 self.cvt_to_bytes_mask32x8(a).val.0,
5119 SHIFT * 4usize,
5120 );
5121 self.cvt_from_bytes_mask32x8(u8x32 {
5122 val: crate::support::Aligned256(result),
5123 simd: self,
5124 })
5125 }
5126 }
5127 #[inline(always)]
5128 fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5129 unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
5130 }
5131 #[inline(always)]
5132 fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5133 unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
5134 }
5135 #[inline(always)]
5136 fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5137 unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
5138 }
5139 #[inline(always)]
5140 fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
5141 a ^ !0
5142 }
5143 #[inline(always)]
5144 fn select_mask32x8(
5145 self,
5146 a: mask32x8<Self>,
5147 b: mask32x8<Self>,
5148 c: mask32x8<Self>,
5149 ) -> mask32x8<Self> {
5150 unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
5151 }
5152 #[inline(always)]
5153 fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
5154 unsafe { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
5155 }
5156 #[inline(always)]
5157 fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
5158 unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0 }
5159 }
5160 #[inline(always)]
5161 fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
5162 unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0b11111111 }
5163 }
5164 #[inline(always)]
5165 fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
5166 unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0b11111111 }
5167 }
5168 #[inline(always)]
5169 fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
5170 unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0 }
5171 }
5172 #[inline(always)]
5173 fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
5174 mask32x16 {
5175 val: crate::support::Aligned512([a.val.0, b.val.0]),
5176 simd: self,
5177 }
5178 }
5179 #[inline(always)]
5180 fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
5181 unsafe {
5182 (
5183 _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
5184 _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
5185 )
5186 }
5187 }
5188 #[inline(always)]
5189 fn splat_f64x4(self, val: f64) -> f64x4<Self> {
5190 unsafe { _mm256_set1_pd(val).simd_into(self) }
5191 }
5192 #[inline(always)]
5193 fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
5194 f64x4 {
5195 val: unsafe { core::mem::transmute_copy(&val) },
5196 simd: self,
5197 }
5198 }
5199 #[inline(always)]
5200 fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
5201 f64x4 {
5202 val: unsafe { core::mem::transmute_copy(val) },
5203 simd: self,
5204 }
5205 }
5206 #[inline(always)]
5207 fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
5208 unsafe { core::mem::transmute::<__m256d, [f64; 4usize]>(a.val.0) }
5209 }
5210 #[inline(always)]
5211 fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
5212 unsafe { core::mem::transmute::<&__m256d, &[f64; 4usize]>(&a.val.0) }
5213 }
5214 #[inline(always)]
5215 fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
5216 unsafe { core::mem::transmute::<&mut __m256d, &mut [f64; 4usize]>(&mut a.val.0) }
5217 }
5218 #[inline(always)]
5219 fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
5220 unsafe {
5221 core::ptr::copy_nonoverlapping(
5222 (&raw const a.val.0) as *const f64,
5223 dest.as_mut_ptr(),
5224 4usize,
5225 );
5226 }
5227 }
5228 #[inline(always)]
5229 fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
5230 unsafe {
5231 f64x4 {
5232 val: core::mem::transmute(a.val),
5233 simd: self,
5234 }
5235 }
5236 }
5237 #[inline(always)]
5238 fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
5239 unsafe {
5240 u8x32 {
5241 val: core::mem::transmute(a.val),
5242 simd: self,
5243 }
5244 }
5245 }
5246 #[inline(always)]
5247 fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5248 unsafe {
5249 if SHIFT >= 4usize {
5250 return b;
5251 }
5252 let result = cross_block_alignr_256x1(
5253 self.cvt_to_bytes_f64x4(b).val.0,
5254 self.cvt_to_bytes_f64x4(a).val.0,
5255 SHIFT * 8usize,
5256 );
5257 self.cvt_from_bytes_f64x4(u8x32 {
5258 val: crate::support::Aligned256(result),
5259 simd: self,
5260 })
5261 }
5262 }
5263 #[inline(always)]
5264 fn slide_within_blocks_f64x4<const SHIFT: usize>(
5265 self,
5266 a: f64x4<Self>,
5267 b: f64x4<Self>,
5268 ) -> f64x4<Self> {
5269 unsafe {
5270 if SHIFT >= 2usize {
5271 return b;
5272 }
5273 let result = dyn_alignr_256(
5274 self.cvt_to_bytes_f64x4(b).val.0,
5275 self.cvt_to_bytes_f64x4(a).val.0,
5276 SHIFT * 8usize,
5277 );
5278 self.cvt_from_bytes_f64x4(u8x32 {
5279 val: crate::support::Aligned256(result),
5280 simd: self,
5281 })
5282 }
5283 }
5284 #[inline(always)]
5285 fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5286 unsafe { _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(self) }
5287 }
5288 #[inline(always)]
5289 fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5290 unsafe { _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(self) }
5291 }
5292 #[inline(always)]
5293 fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5294 unsafe { _mm256_sqrt_pd(a.into()).simd_into(self) }
5295 }
5296 #[inline(always)]
5297 fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5298 unsafe { _mm256_add_pd(a.into(), b.into()).simd_into(self) }
5299 }
5300 #[inline(always)]
5301 fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5302 unsafe { _mm256_sub_pd(a.into(), b.into()).simd_into(self) }
5303 }
5304 #[inline(always)]
5305 fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5306 unsafe { _mm256_mul_pd(a.into(), b.into()).simd_into(self) }
5307 }
5308 #[inline(always)]
5309 fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5310 unsafe { _mm256_div_pd(a.into(), b.into()).simd_into(self) }
5311 }
5312 #[inline(always)]
5313 fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5314 unsafe {
5315 let mask = _mm256_set1_pd(-0.0);
5316 _mm256_or_pd(
5317 _mm256_and_pd(mask, b.into()),
5318 _mm256_andnot_pd(mask, a.into()),
5319 )
5320 .simd_into(self)
5321 }
5322 }
5323 #[inline(always)]
5324 fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5325 unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<0i32>(a.into(), b.into())).simd_into(self) }
5326 }
5327 #[inline(always)]
5328 fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5329 unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<17i32>(a.into(), b.into())).simd_into(self) }
5330 }
5331 #[inline(always)]
5332 fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5333 unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<18i32>(a.into(), b.into())).simd_into(self) }
5334 }
5335 #[inline(always)]
5336 fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5337 unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<29i32>(a.into(), b.into())).simd_into(self) }
5338 }
5339 #[inline(always)]
5340 fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
5341 unsafe { _mm256_castpd_si256(_mm256_cmp_pd::<30i32>(a.into(), b.into())).simd_into(self) }
5342 }
5343 #[inline(always)]
5344 fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5345 unsafe {
5346 let lo = _mm256_unpacklo_pd(a.into(), b.into());
5347 let hi = _mm256_unpackhi_pd(a.into(), b.into());
5348 _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(self)
5349 }
5350 }
5351 #[inline(always)]
5352 fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5353 unsafe {
5354 let lo = _mm256_unpacklo_pd(a.into(), b.into());
5355 let hi = _mm256_unpackhi_pd(a.into(), b.into());
5356 _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(self)
5357 }
5358 }
5359 #[inline(always)]
5360 fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5361 unsafe {
5362 let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into());
5363 let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into());
5364 _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(self)
5365 }
5366 }
5367 #[inline(always)]
5368 fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5369 unsafe {
5370 let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into());
5371 let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into());
5372 _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(self)
5373 }
5374 }
5375 #[inline(always)]
5376 fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
5377 unsafe {
5378 let lo = _mm256_unpacklo_pd(a.into(), b.into());
5379 let hi = _mm256_unpackhi_pd(a.into(), b.into());
5380 (
5381 _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(self),
5382 _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(self),
5383 )
5384 }
5385 }
5386 #[inline(always)]
5387 fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
5388 unsafe {
5389 let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into());
5390 let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into());
5391 (
5392 _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(self),
5393 _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(self),
5394 )
5395 }
5396 }
5397 #[inline(always)]
5398 fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5399 unsafe { _mm256_max_pd(a.into(), b.into()).simd_into(self) }
5400 }
5401 #[inline(always)]
5402 fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5403 unsafe { _mm256_min_pd(a.into(), b.into()).simd_into(self) }
5404 }
5405 #[inline(always)]
5406 fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5407 unsafe {
5408 let intermediate = _mm256_max_pd(a.into(), b.into());
5409 let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into());
5410 _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self)
5411 }
5412 }
5413 #[inline(always)]
5414 fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
5415 unsafe {
5416 let intermediate = _mm256_min_pd(a.into(), b.into());
5417 let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into());
5418 _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self)
5419 }
5420 }
5421 #[inline(always)]
5422 fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
5423 unsafe { _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) }
5424 }
5425 #[inline(always)]
5426 fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
5427 unsafe { _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) }
5428 }
5429 #[inline(always)]
5430 fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5431 unsafe {
5432 _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
5433 .simd_into(self)
5434 }
5435 }
5436 #[inline(always)]
5437 fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5438 unsafe {
5439 _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
5440 .simd_into(self)
5441 }
5442 }
5443 #[inline(always)]
5444 fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5445 unsafe {
5446 _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
5447 .simd_into(self)
5448 }
5449 }
5450 #[inline(always)]
5451 fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5452 a - self.trunc_f64x4(a)
5453 }
5454 #[inline(always)]
5455 fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
5456 unsafe {
5457 _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
5458 }
5459 }
5460 #[inline(always)]
5461 fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
5462 unsafe {
5463 _mm256_blendv_pd(c.into(), b.into(), _mm256_castsi256_pd(a.into())).simd_into(self)
5464 }
5465 }
5466 #[inline(always)]
5467 fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
5468 f64x8 {
5469 val: crate::support::Aligned512([a.val.0, b.val.0]),
5470 simd: self,
5471 }
5472 }
5473 #[inline(always)]
5474 fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
5475 unsafe {
5476 (
5477 _mm256_extractf128_pd::<0>(a.into()).simd_into(self),
5478 _mm256_extractf128_pd::<1>(a.into()).simd_into(self),
5479 )
5480 }
5481 }
5482 #[inline(always)]
5483 fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
5484 unsafe { _mm256_castpd_ps(a.into()).simd_into(self) }
5485 }
5486 #[inline(always)]
5487 fn splat_mask64x4(self, val: i64) -> mask64x4<Self> {
5488 unsafe { _mm256_set1_epi64x(val).simd_into(self) }
5489 }
5490 #[inline(always)]
5491 fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
5492 mask64x4 {
5493 val: unsafe { core::mem::transmute_copy(&val) },
5494 simd: self,
5495 }
5496 }
5497 #[inline(always)]
5498 fn load_array_ref_mask64x4(self, val: &[i64; 4usize]) -> mask64x4<Self> {
5499 mask64x4 {
5500 val: unsafe { core::mem::transmute_copy(val) },
5501 simd: self,
5502 }
5503 }
5504 #[inline(always)]
5505 fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
5506 unsafe { core::mem::transmute::<__m256i, [i64; 4usize]>(a.val.0) }
5507 }
5508 #[inline(always)]
5509 fn as_array_ref_mask64x4(self, a: &mask64x4<Self>) -> &[i64; 4usize] {
5510 unsafe { core::mem::transmute::<&__m256i, &[i64; 4usize]>(&a.val.0) }
5511 }
5512 #[inline(always)]
5513 fn as_array_mut_mask64x4(self, a: &mut mask64x4<Self>) -> &mut [i64; 4usize] {
5514 unsafe { core::mem::transmute::<&mut __m256i, &mut [i64; 4usize]>(&mut a.val.0) }
5515 }
5516 #[inline(always)]
5517 fn store_array_mask64x4(self, a: mask64x4<Self>, dest: &mut [i64; 4usize]) -> () {
5518 unsafe {
5519 core::ptr::copy_nonoverlapping(
5520 (&raw const a.val.0) as *const i64,
5521 dest.as_mut_ptr(),
5522 4usize,
5523 );
5524 }
5525 }
5526 #[inline(always)]
5527 fn cvt_from_bytes_mask64x4(self, a: u8x32<Self>) -> mask64x4<Self> {
5528 unsafe {
5529 mask64x4 {
5530 val: core::mem::transmute(a.val),
5531 simd: self,
5532 }
5533 }
5534 }
5535 #[inline(always)]
5536 fn cvt_to_bytes_mask64x4(self, a: mask64x4<Self>) -> u8x32<Self> {
5537 unsafe {
5538 u8x32 {
5539 val: core::mem::transmute(a.val),
5540 simd: self,
5541 }
5542 }
5543 }
5544 #[inline(always)]
5545 fn slide_mask64x4<const SHIFT: usize>(
5546 self,
5547 a: mask64x4<Self>,
5548 b: mask64x4<Self>,
5549 ) -> mask64x4<Self> {
5550 unsafe {
5551 if SHIFT >= 4usize {
5552 return b;
5553 }
5554 let result = cross_block_alignr_256x1(
5555 self.cvt_to_bytes_mask64x4(b).val.0,
5556 self.cvt_to_bytes_mask64x4(a).val.0,
5557 SHIFT * 8usize,
5558 );
5559 self.cvt_from_bytes_mask64x4(u8x32 {
5560 val: crate::support::Aligned256(result),
5561 simd: self,
5562 })
5563 }
5564 }
5565 #[inline(always)]
5566 fn slide_within_blocks_mask64x4<const SHIFT: usize>(
5567 self,
5568 a: mask64x4<Self>,
5569 b: mask64x4<Self>,
5570 ) -> mask64x4<Self> {
5571 unsafe {
5572 if SHIFT >= 2usize {
5573 return b;
5574 }
5575 let result = dyn_alignr_256(
5576 self.cvt_to_bytes_mask64x4(b).val.0,
5577 self.cvt_to_bytes_mask64x4(a).val.0,
5578 SHIFT * 8usize,
5579 );
5580 self.cvt_from_bytes_mask64x4(u8x32 {
5581 val: crate::support::Aligned256(result),
5582 simd: self,
5583 })
5584 }
5585 }
5586 #[inline(always)]
5587 fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5588 unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
5589 }
5590 #[inline(always)]
5591 fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5592 unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
5593 }
5594 #[inline(always)]
5595 fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5596 unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
5597 }
5598 #[inline(always)]
5599 fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
5600 a ^ !0
5601 }
5602 #[inline(always)]
5603 fn select_mask64x4(
5604 self,
5605 a: mask64x4<Self>,
5606 b: mask64x4<Self>,
5607 c: mask64x4<Self>,
5608 ) -> mask64x4<Self> {
5609 unsafe { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
5610 }
5611 #[inline(always)]
5612 fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
5613 unsafe { _mm256_cmpeq_epi64(a.into(), b.into()).simd_into(self) }
5614 }
5615 #[inline(always)]
5616 fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
5617 unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0 }
5618 }
5619 #[inline(always)]
5620 fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
5621 unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0b1111 }
5622 }
5623 #[inline(always)]
5624 fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
5625 unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0b1111 }
5626 }
5627 #[inline(always)]
5628 fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
5629 unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0 }
5630 }
5631 #[inline(always)]
5632 fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
5633 mask64x8 {
5634 val: crate::support::Aligned512([a.val.0, b.val.0]),
5635 simd: self,
5636 }
5637 }
5638 #[inline(always)]
5639 fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
5640 unsafe {
5641 (
5642 _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
5643 _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
5644 )
5645 }
5646 }
5647 #[inline(always)]
5648 fn splat_f32x16(self, val: f32) -> f32x16<Self> {
5649 let half = self.splat_f32x8(val);
5650 self.combine_f32x8(half, half)
5651 }
5652 #[inline(always)]
5653 fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
5654 f32x16 {
5655 val: unsafe { core::mem::transmute_copy(&val) },
5656 simd: self,
5657 }
5658 }
5659 #[inline(always)]
5660 fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
5661 f32x16 {
5662 val: unsafe { core::mem::transmute_copy(val) },
5663 simd: self,
5664 }
5665 }
5666 #[inline(always)]
5667 fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
5668 unsafe { core::mem::transmute::<[__m256; 2usize], [f32; 16usize]>(a.val.0) }
5669 }
5670 #[inline(always)]
5671 fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
5672 unsafe { core::mem::transmute::<&[__m256; 2usize], &[f32; 16usize]>(&a.val.0) }
5673 }
5674 #[inline(always)]
5675 fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
5676 unsafe { core::mem::transmute::<&mut [__m256; 2usize], &mut [f32; 16usize]>(&mut a.val.0) }
5677 }
5678 #[inline(always)]
5679 fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
5680 unsafe {
5681 core::ptr::copy_nonoverlapping(
5682 (&raw const a.val.0) as *const f32,
5683 dest.as_mut_ptr(),
5684 16usize,
5685 );
5686 }
5687 }
5688 #[inline(always)]
5689 fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
5690 unsafe {
5691 f32x16 {
5692 val: core::mem::transmute(a.val),
5693 simd: self,
5694 }
5695 }
5696 }
5697 #[inline(always)]
5698 fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
5699 unsafe {
5700 u8x64 {
5701 val: core::mem::transmute(a.val),
5702 simd: self,
5703 }
5704 }
5705 }
5706 #[inline(always)]
5707 fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5708 unsafe {
5709 if SHIFT >= 16usize {
5710 return b;
5711 }
5712 let result = cross_block_alignr_256x2(
5713 self.cvt_to_bytes_f32x16(b).val.0,
5714 self.cvt_to_bytes_f32x16(a).val.0,
5715 SHIFT * 4usize,
5716 );
5717 self.cvt_from_bytes_f32x16(u8x64 {
5718 val: crate::support::Aligned512(result),
5719 simd: self,
5720 })
5721 }
5722 }
5723 #[inline(always)]
5724 fn slide_within_blocks_f32x16<const SHIFT: usize>(
5725 self,
5726 a: f32x16<Self>,
5727 b: f32x16<Self>,
5728 ) -> f32x16<Self> {
5729 let (a0, a1) = self.split_f32x16(a);
5730 let (b0, b1) = self.split_f32x16(b);
5731 self.combine_f32x8(
5732 self.slide_within_blocks_f32x8::<SHIFT>(a0, b0),
5733 self.slide_within_blocks_f32x8::<SHIFT>(a1, b1),
5734 )
5735 }
5736 #[inline(always)]
5737 fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5738 let (a0, a1) = self.split_f32x16(a);
5739 self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
5740 }
5741 #[inline(always)]
5742 fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5743 let (a0, a1) = self.split_f32x16(a);
5744 self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
5745 }
5746 #[inline(always)]
5747 fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5748 let (a0, a1) = self.split_f32x16(a);
5749 self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
5750 }
5751 #[inline(always)]
5752 fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5753 let (a0, a1) = self.split_f32x16(a);
5754 let (b0, b1) = self.split_f32x16(b);
5755 self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
5756 }
5757 #[inline(always)]
5758 fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5759 let (a0, a1) = self.split_f32x16(a);
5760 let (b0, b1) = self.split_f32x16(b);
5761 self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
5762 }
5763 #[inline(always)]
5764 fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5765 let (a0, a1) = self.split_f32x16(a);
5766 let (b0, b1) = self.split_f32x16(b);
5767 self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
5768 }
5769 #[inline(always)]
5770 fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5771 let (a0, a1) = self.split_f32x16(a);
5772 let (b0, b1) = self.split_f32x16(b);
5773 self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
5774 }
5775 #[inline(always)]
5776 fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5777 let (a0, a1) = self.split_f32x16(a);
5778 let (b0, b1) = self.split_f32x16(b);
5779 self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
5780 }
5781 #[inline(always)]
5782 fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5783 let (a0, a1) = self.split_f32x16(a);
5784 let (b0, b1) = self.split_f32x16(b);
5785 self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
5786 }
5787 #[inline(always)]
5788 fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5789 let (a0, a1) = self.split_f32x16(a);
5790 let (b0, b1) = self.split_f32x16(b);
5791 self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
5792 }
5793 #[inline(always)]
5794 fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5795 let (a0, a1) = self.split_f32x16(a);
5796 let (b0, b1) = self.split_f32x16(b);
5797 self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
5798 }
5799 #[inline(always)]
5800 fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5801 let (a0, a1) = self.split_f32x16(a);
5802 let (b0, b1) = self.split_f32x16(b);
5803 self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
5804 }
5805 #[inline(always)]
5806 fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
5807 let (a0, a1) = self.split_f32x16(a);
5808 let (b0, b1) = self.split_f32x16(b);
5809 self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
5810 }
5811 #[inline(always)]
5812 fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5813 let (a0, _) = self.split_f32x16(a);
5814 let (b0, _) = self.split_f32x16(b);
5815 self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
5816 }
5817 #[inline(always)]
5818 fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5819 let (_, a1) = self.split_f32x16(a);
5820 let (_, b1) = self.split_f32x16(b);
5821 self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
5822 }
5823 #[inline(always)]
5824 fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5825 let (a0, a1) = self.split_f32x16(a);
5826 let (b0, b1) = self.split_f32x16(b);
5827 self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
5828 }
5829 #[inline(always)]
5830 fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5831 let (a0, a1) = self.split_f32x16(a);
5832 let (b0, b1) = self.split_f32x16(b);
5833 self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
5834 }
5835 #[inline(always)]
5836 fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
5837 let (a0, a1) = self.split_f32x16(a);
5838 let (b0, b1) = self.split_f32x16(b);
5839 let lo_lo = self.zip_low_f32x8(a0, b0);
5840 let lo_hi = self.zip_high_f32x8(a0, b0);
5841 let hi_lo = self.zip_low_f32x8(a1, b1);
5842 let hi_hi = self.zip_high_f32x8(a1, b1);
5843 (
5844 self.combine_f32x8(lo_lo, lo_hi),
5845 self.combine_f32x8(hi_lo, hi_hi),
5846 )
5847 }
5848 #[inline(always)]
5849 fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
5850 let (a0, a1) = self.split_f32x16(a);
5851 let (b0, b1) = self.split_f32x16(b);
5852 let lo_even = self.unzip_low_f32x8(a0, a1);
5853 let lo_odd = self.unzip_high_f32x8(a0, a1);
5854 let hi_even = self.unzip_low_f32x8(b0, b1);
5855 let hi_odd = self.unzip_high_f32x8(b0, b1);
5856 (
5857 self.combine_f32x8(lo_even, hi_even),
5858 self.combine_f32x8(lo_odd, hi_odd),
5859 )
5860 }
5861 #[inline(always)]
5862 fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5863 let (a0, a1) = self.split_f32x16(a);
5864 let (b0, b1) = self.split_f32x16(b);
5865 self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
5866 }
5867 #[inline(always)]
5868 fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5869 let (a0, a1) = self.split_f32x16(a);
5870 let (b0, b1) = self.split_f32x16(b);
5871 self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
5872 }
5873 #[inline(always)]
5874 fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5875 let (a0, a1) = self.split_f32x16(a);
5876 let (b0, b1) = self.split_f32x16(b);
5877 self.combine_f32x8(
5878 self.max_precise_f32x8(a0, b0),
5879 self.max_precise_f32x8(a1, b1),
5880 )
5881 }
5882 #[inline(always)]
5883 fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
5884 let (a0, a1) = self.split_f32x16(a);
5885 let (b0, b1) = self.split_f32x16(b);
5886 self.combine_f32x8(
5887 self.min_precise_f32x8(a0, b0),
5888 self.min_precise_f32x8(a1, b1),
5889 )
5890 }
5891 #[inline(always)]
5892 fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
5893 let (a0, a1) = self.split_f32x16(a);
5894 let (b0, b1) = self.split_f32x16(b);
5895 let (c0, c1) = self.split_f32x16(c);
5896 self.combine_f32x8(
5897 self.mul_add_f32x8(a0, b0, c0),
5898 self.mul_add_f32x8(a1, b1, c1),
5899 )
5900 }
5901 #[inline(always)]
5902 fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
5903 let (a0, a1) = self.split_f32x16(a);
5904 let (b0, b1) = self.split_f32x16(b);
5905 let (c0, c1) = self.split_f32x16(c);
5906 self.combine_f32x8(
5907 self.mul_sub_f32x8(a0, b0, c0),
5908 self.mul_sub_f32x8(a1, b1, c1),
5909 )
5910 }
5911 #[inline(always)]
5912 fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5913 let (a0, a1) = self.split_f32x16(a);
5914 self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
5915 }
5916 #[inline(always)]
5917 fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5918 let (a0, a1) = self.split_f32x16(a);
5919 self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1))
5920 }
5921 #[inline(always)]
5922 fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5923 let (a0, a1) = self.split_f32x16(a);
5924 self.combine_f32x8(
5925 self.round_ties_even_f32x8(a0),
5926 self.round_ties_even_f32x8(a1),
5927 )
5928 }
5929 #[inline(always)]
5930 fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5931 let (a0, a1) = self.split_f32x16(a);
5932 self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
5933 }
5934 #[inline(always)]
5935 fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
5936 let (a0, a1) = self.split_f32x16(a);
5937 self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
5938 }
5939 #[inline(always)]
5940 fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
5941 let (a0, a1) = self.split_mask32x16(a);
5942 let (b0, b1) = self.split_f32x16(b);
5943 let (c0, c1) = self.split_f32x16(c);
5944 self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
5945 }
5946 #[inline(always)]
5947 fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
5948 (
5949 f32x8 {
5950 val: crate::support::Aligned256(a.val.0[0]),
5951 simd: self,
5952 },
5953 f32x8 {
5954 val: crate::support::Aligned256(a.val.0[1]),
5955 simd: self,
5956 },
5957 )
5958 }
5959 #[inline(always)]
5960 fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
5961 let (a0, a1) = self.split_f32x16(a);
5962 self.combine_f64x4(
5963 self.reinterpret_f64_f32x8(a0),
5964 self.reinterpret_f64_f32x8(a1),
5965 )
5966 }
5967 #[inline(always)]
5968 fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
5969 let (a0, a1) = self.split_f32x16(a);
5970 self.combine_i32x8(
5971 self.reinterpret_i32_f32x8(a0),
5972 self.reinterpret_i32_f32x8(a1),
5973 )
5974 }
5975 #[inline(always)]
5976 fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
5977 unsafe {
5978 let v0 = _mm_loadu_ps(src.as_ptr() as *const _);
5979 let v1 = _mm_loadu_ps(src.as_ptr().add(4usize) as *const _);
5980 let v2 = _mm_loadu_ps(src.as_ptr().add(2 * 4usize) as *const _);
5981 let v3 = _mm_loadu_ps(src.as_ptr().add(3 * 4usize) as *const _);
5982 let tmp0 = _mm_unpacklo_ps(v0, v1);
5983 let tmp1 = _mm_unpackhi_ps(v0, v1);
5984 let tmp2 = _mm_unpacklo_ps(v2, v3);
5985 let tmp3 = _mm_unpackhi_ps(v2, v3);
5986 let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
5987 let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
5988 let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
5989 let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
5990 self.combine_f32x8(
5991 self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)),
5992 self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)),
5993 )
5994 }
5995 }
5996 #[inline(always)]
5997 fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
5998 let (v01, v23) = self.split_f32x16(a);
5999 let (v0, v1) = self.split_f32x8(v01);
6000 let (v2, v3) = self.split_f32x8(v23);
6001 let v0 = v0.into();
6002 let v1 = v1.into();
6003 let v2 = v2.into();
6004 let v3 = v3.into();
6005 unsafe {
6006 let tmp0 = _mm_unpacklo_ps(v0, v1);
6007 let tmp1 = _mm_unpackhi_ps(v0, v1);
6008 let tmp2 = _mm_unpacklo_ps(v2, v3);
6009 let tmp3 = _mm_unpackhi_ps(v2, v3);
6010 let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
6011 let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
6012 let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
6013 let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
6014 _mm_storeu_ps(dest.as_mut_ptr() as *mut _, out0);
6015 _mm_storeu_ps(dest.as_mut_ptr().add(4usize) as *mut _, out1);
6016 _mm_storeu_ps(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
6017 _mm_storeu_ps(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
6018 }
6019 }
6020 #[inline(always)]
6021 fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
6022 let (a0, a1) = self.split_f32x16(a);
6023 self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
6024 }
6025 #[inline(always)]
6026 fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
6027 let (a0, a1) = self.split_f32x16(a);
6028 self.combine_u32x8(
6029 self.reinterpret_u32_f32x8(a0),
6030 self.reinterpret_u32_f32x8(a1),
6031 )
6032 }
6033 #[inline(always)]
6034 fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
6035 let (a0, a1) = self.split_f32x16(a);
6036 self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
6037 }
6038 #[inline(always)]
6039 fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
6040 let (a0, a1) = self.split_f32x16(a);
6041 self.combine_u32x8(
6042 self.cvt_u32_precise_f32x8(a0),
6043 self.cvt_u32_precise_f32x8(a1),
6044 )
6045 }
6046 #[inline(always)]
6047 fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
6048 let (a0, a1) = self.split_f32x16(a);
6049 self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
6050 }
6051 #[inline(always)]
6052 fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
6053 let (a0, a1) = self.split_f32x16(a);
6054 self.combine_i32x8(
6055 self.cvt_i32_precise_f32x8(a0),
6056 self.cvt_i32_precise_f32x8(a1),
6057 )
6058 }
6059 #[inline(always)]
6060 fn splat_i8x64(self, val: i8) -> i8x64<Self> {
6061 let half = self.splat_i8x32(val);
6062 self.combine_i8x32(half, half)
6063 }
6064 #[inline(always)]
6065 fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
6066 i8x64 {
6067 val: unsafe { core::mem::transmute_copy(&val) },
6068 simd: self,
6069 }
6070 }
6071 #[inline(always)]
6072 fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
6073 i8x64 {
6074 val: unsafe { core::mem::transmute_copy(val) },
6075 simd: self,
6076 }
6077 }
6078 #[inline(always)]
6079 fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
6080 unsafe { core::mem::transmute::<[__m256i; 2usize], [i8; 64usize]>(a.val.0) }
6081 }
6082 #[inline(always)]
6083 fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
6084 unsafe { core::mem::transmute::<&[__m256i; 2usize], &[i8; 64usize]>(&a.val.0) }
6085 }
6086 #[inline(always)]
6087 fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
6088 unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [i8; 64usize]>(&mut a.val.0) }
6089 }
6090 #[inline(always)]
6091 fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
6092 unsafe {
6093 core::ptr::copy_nonoverlapping(
6094 (&raw const a.val.0) as *const i8,
6095 dest.as_mut_ptr(),
6096 64usize,
6097 );
6098 }
6099 }
6100 #[inline(always)]
6101 fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
6102 unsafe {
6103 i8x64 {
6104 val: core::mem::transmute(a.val),
6105 simd: self,
6106 }
6107 }
6108 }
6109 #[inline(always)]
6110 fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
6111 unsafe {
6112 u8x64 {
6113 val: core::mem::transmute(a.val),
6114 simd: self,
6115 }
6116 }
6117 }
6118 #[inline(always)]
6119 fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6120 unsafe {
6121 if SHIFT >= 64usize {
6122 return b;
6123 }
6124 let result = cross_block_alignr_256x2(
6125 self.cvt_to_bytes_i8x64(b).val.0,
6126 self.cvt_to_bytes_i8x64(a).val.0,
6127 SHIFT,
6128 );
6129 self.cvt_from_bytes_i8x64(u8x64 {
6130 val: crate::support::Aligned512(result),
6131 simd: self,
6132 })
6133 }
6134 }
6135 #[inline(always)]
6136 fn slide_within_blocks_i8x64<const SHIFT: usize>(
6137 self,
6138 a: i8x64<Self>,
6139 b: i8x64<Self>,
6140 ) -> i8x64<Self> {
6141 let (a0, a1) = self.split_i8x64(a);
6142 let (b0, b1) = self.split_i8x64(b);
6143 self.combine_i8x32(
6144 self.slide_within_blocks_i8x32::<SHIFT>(a0, b0),
6145 self.slide_within_blocks_i8x32::<SHIFT>(a1, b1),
6146 )
6147 }
6148 #[inline(always)]
6149 fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6150 let (a0, a1) = self.split_i8x64(a);
6151 let (b0, b1) = self.split_i8x64(b);
6152 self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
6153 }
6154 #[inline(always)]
6155 fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6156 let (a0, a1) = self.split_i8x64(a);
6157 let (b0, b1) = self.split_i8x64(b);
6158 self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
6159 }
6160 #[inline(always)]
6161 fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6162 let (a0, a1) = self.split_i8x64(a);
6163 let (b0, b1) = self.split_i8x64(b);
6164 self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
6165 }
6166 #[inline(always)]
6167 fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6168 let (a0, a1) = self.split_i8x64(a);
6169 let (b0, b1) = self.split_i8x64(b);
6170 self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
6171 }
6172 #[inline(always)]
6173 fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6174 let (a0, a1) = self.split_i8x64(a);
6175 let (b0, b1) = self.split_i8x64(b);
6176 self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
6177 }
6178 #[inline(always)]
6179 fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6180 let (a0, a1) = self.split_i8x64(a);
6181 let (b0, b1) = self.split_i8x64(b);
6182 self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
6183 }
6184 #[inline(always)]
6185 fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
6186 let (a0, a1) = self.split_i8x64(a);
6187 self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
6188 }
6189 #[inline(always)]
6190 fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
6191 let (a0, a1) = self.split_i8x64(a);
6192 self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift))
6193 }
6194 #[inline(always)]
6195 fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6196 let (a0, a1) = self.split_i8x64(a);
6197 let (b0, b1) = self.split_i8x64(b);
6198 self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1))
6199 }
6200 #[inline(always)]
6201 fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
6202 let (a0, a1) = self.split_i8x64(a);
6203 self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift))
6204 }
6205 #[inline(always)]
6206 fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6207 let (a0, a1) = self.split_i8x64(a);
6208 let (b0, b1) = self.split_i8x64(b);
6209 self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
6210 }
6211 #[inline(always)]
6212 fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6213 let (a0, a1) = self.split_i8x64(a);
6214 let (b0, b1) = self.split_i8x64(b);
6215 self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
6216 }
6217 #[inline(always)]
6218 fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6219 let (a0, a1) = self.split_i8x64(a);
6220 let (b0, b1) = self.split_i8x64(b);
6221 self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
6222 }
6223 #[inline(always)]
6224 fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6225 let (a0, a1) = self.split_i8x64(a);
6226 let (b0, b1) = self.split_i8x64(b);
6227 self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
6228 }
6229 #[inline(always)]
6230 fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6231 let (a0, a1) = self.split_i8x64(a);
6232 let (b0, b1) = self.split_i8x64(b);
6233 self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
6234 }
6235 #[inline(always)]
6236 fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
6237 let (a0, a1) = self.split_i8x64(a);
6238 let (b0, b1) = self.split_i8x64(b);
6239 self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
6240 }
6241 #[inline(always)]
6242 fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6243 let (a0, _) = self.split_i8x64(a);
6244 let (b0, _) = self.split_i8x64(b);
6245 self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
6246 }
6247 #[inline(always)]
6248 fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6249 let (_, a1) = self.split_i8x64(a);
6250 let (_, b1) = self.split_i8x64(b);
6251 self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
6252 }
6253 #[inline(always)]
6254 fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6255 let (a0, a1) = self.split_i8x64(a);
6256 let (b0, b1) = self.split_i8x64(b);
6257 self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
6258 }
6259 #[inline(always)]
6260 fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6261 let (a0, a1) = self.split_i8x64(a);
6262 let (b0, b1) = self.split_i8x64(b);
6263 self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
6264 }
6265 #[inline(always)]
6266 fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
6267 let (a0, a1) = self.split_i8x64(a);
6268 let (b0, b1) = self.split_i8x64(b);
6269 let lo_lo = self.zip_low_i8x32(a0, b0);
6270 let lo_hi = self.zip_high_i8x32(a0, b0);
6271 let hi_lo = self.zip_low_i8x32(a1, b1);
6272 let hi_hi = self.zip_high_i8x32(a1, b1);
6273 (
6274 self.combine_i8x32(lo_lo, lo_hi),
6275 self.combine_i8x32(hi_lo, hi_hi),
6276 )
6277 }
6278 #[inline(always)]
6279 fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
6280 let (a0, a1) = self.split_i8x64(a);
6281 let (b0, b1) = self.split_i8x64(b);
6282 let lo_even = self.unzip_low_i8x32(a0, a1);
6283 let lo_odd = self.unzip_high_i8x32(a0, a1);
6284 let hi_even = self.unzip_low_i8x32(b0, b1);
6285 let hi_odd = self.unzip_high_i8x32(b0, b1);
6286 (
6287 self.combine_i8x32(lo_even, hi_even),
6288 self.combine_i8x32(lo_odd, hi_odd),
6289 )
6290 }
6291 #[inline(always)]
6292 fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
6293 let (a0, a1) = self.split_mask8x64(a);
6294 let (b0, b1) = self.split_i8x64(b);
6295 let (c0, c1) = self.split_i8x64(c);
6296 self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
6297 }
6298 #[inline(always)]
6299 fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6300 let (a0, a1) = self.split_i8x64(a);
6301 let (b0, b1) = self.split_i8x64(b);
6302 self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
6303 }
6304 #[inline(always)]
6305 fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
6306 let (a0, a1) = self.split_i8x64(a);
6307 let (b0, b1) = self.split_i8x64(b);
6308 self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
6309 }
6310 #[inline(always)]
6311 fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
6312 (
6313 i8x32 {
6314 val: crate::support::Aligned256(a.val.0[0]),
6315 simd: self,
6316 },
6317 i8x32 {
6318 val: crate::support::Aligned256(a.val.0[1]),
6319 simd: self,
6320 },
6321 )
6322 }
6323 #[inline(always)]
6324 fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
6325 let (a0, a1) = self.split_i8x64(a);
6326 self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
6327 }
6328 #[inline(always)]
6329 fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
6330 let (a0, a1) = self.split_i8x64(a);
6331 self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
6332 }
6333 #[inline(always)]
6334 fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
6335 let (a0, a1) = self.split_i8x64(a);
6336 self.combine_u32x8(
6337 self.reinterpret_u32_i8x32(a0),
6338 self.reinterpret_u32_i8x32(a1),
6339 )
6340 }
6341 #[inline(always)]
6342 fn splat_u8x64(self, val: u8) -> u8x64<Self> {
6343 let half = self.splat_u8x32(val);
6344 self.combine_u8x32(half, half)
6345 }
6346 #[inline(always)]
6347 fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
6348 u8x64 {
6349 val: unsafe { core::mem::transmute_copy(&val) },
6350 simd: self,
6351 }
6352 }
6353 #[inline(always)]
6354 fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
6355 u8x64 {
6356 val: unsafe { core::mem::transmute_copy(val) },
6357 simd: self,
6358 }
6359 }
6360 #[inline(always)]
6361 fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
6362 unsafe { core::mem::transmute::<[__m256i; 2usize], [u8; 64usize]>(a.val.0) }
6363 }
6364 #[inline(always)]
6365 fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
6366 unsafe { core::mem::transmute::<&[__m256i; 2usize], &[u8; 64usize]>(&a.val.0) }
6367 }
6368 #[inline(always)]
6369 fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
6370 unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [u8; 64usize]>(&mut a.val.0) }
6371 }
6372 #[inline(always)]
6373 fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
6374 unsafe {
6375 core::ptr::copy_nonoverlapping(
6376 (&raw const a.val.0) as *const u8,
6377 dest.as_mut_ptr(),
6378 64usize,
6379 );
6380 }
6381 }
6382 #[inline(always)]
6383 fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
6384 unsafe {
6385 u8x64 {
6386 val: core::mem::transmute(a.val),
6387 simd: self,
6388 }
6389 }
6390 }
6391 #[inline(always)]
6392 fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
6393 unsafe {
6394 u8x64 {
6395 val: core::mem::transmute(a.val),
6396 simd: self,
6397 }
6398 }
6399 }
6400 #[inline(always)]
6401 fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6402 unsafe {
6403 if SHIFT >= 64usize {
6404 return b;
6405 }
6406 let result = cross_block_alignr_256x2(
6407 self.cvt_to_bytes_u8x64(b).val.0,
6408 self.cvt_to_bytes_u8x64(a).val.0,
6409 SHIFT,
6410 );
6411 self.cvt_from_bytes_u8x64(u8x64 {
6412 val: crate::support::Aligned512(result),
6413 simd: self,
6414 })
6415 }
6416 }
6417 #[inline(always)]
6418 fn slide_within_blocks_u8x64<const SHIFT: usize>(
6419 self,
6420 a: u8x64<Self>,
6421 b: u8x64<Self>,
6422 ) -> u8x64<Self> {
6423 let (a0, a1) = self.split_u8x64(a);
6424 let (b0, b1) = self.split_u8x64(b);
6425 self.combine_u8x32(
6426 self.slide_within_blocks_u8x32::<SHIFT>(a0, b0),
6427 self.slide_within_blocks_u8x32::<SHIFT>(a1, b1),
6428 )
6429 }
6430 #[inline(always)]
6431 fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6432 let (a0, a1) = self.split_u8x64(a);
6433 let (b0, b1) = self.split_u8x64(b);
6434 self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
6435 }
6436 #[inline(always)]
6437 fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6438 let (a0, a1) = self.split_u8x64(a);
6439 let (b0, b1) = self.split_u8x64(b);
6440 self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
6441 }
6442 #[inline(always)]
6443 fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6444 let (a0, a1) = self.split_u8x64(a);
6445 let (b0, b1) = self.split_u8x64(b);
6446 self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
6447 }
6448 #[inline(always)]
6449 fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6450 let (a0, a1) = self.split_u8x64(a);
6451 let (b0, b1) = self.split_u8x64(b);
6452 self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
6453 }
6454 #[inline(always)]
6455 fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6456 let (a0, a1) = self.split_u8x64(a);
6457 let (b0, b1) = self.split_u8x64(b);
6458 self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
6459 }
6460 #[inline(always)]
6461 fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6462 let (a0, a1) = self.split_u8x64(a);
6463 let (b0, b1) = self.split_u8x64(b);
6464 self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
6465 }
6466 #[inline(always)]
6467 fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
6468 let (a0, a1) = self.split_u8x64(a);
6469 self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
6470 }
6471 #[inline(always)]
6472 fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
6473 let (a0, a1) = self.split_u8x64(a);
6474 self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift))
6475 }
6476 #[inline(always)]
6477 fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6478 let (a0, a1) = self.split_u8x64(a);
6479 let (b0, b1) = self.split_u8x64(b);
6480 self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1))
6481 }
6482 #[inline(always)]
6483 fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
6484 let (a0, a1) = self.split_u8x64(a);
6485 self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift))
6486 }
6487 #[inline(always)]
6488 fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6489 let (a0, a1) = self.split_u8x64(a);
6490 let (b0, b1) = self.split_u8x64(b);
6491 self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
6492 }
6493 #[inline(always)]
6494 fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6495 let (a0, a1) = self.split_u8x64(a);
6496 let (b0, b1) = self.split_u8x64(b);
6497 self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
6498 }
6499 #[inline(always)]
6500 fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6501 let (a0, a1) = self.split_u8x64(a);
6502 let (b0, b1) = self.split_u8x64(b);
6503 self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
6504 }
6505 #[inline(always)]
6506 fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6507 let (a0, a1) = self.split_u8x64(a);
6508 let (b0, b1) = self.split_u8x64(b);
6509 self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
6510 }
6511 #[inline(always)]
6512 fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6513 let (a0, a1) = self.split_u8x64(a);
6514 let (b0, b1) = self.split_u8x64(b);
6515 self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
6516 }
6517 #[inline(always)]
6518 fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
6519 let (a0, a1) = self.split_u8x64(a);
6520 let (b0, b1) = self.split_u8x64(b);
6521 self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
6522 }
6523 #[inline(always)]
6524 fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6525 let (a0, _) = self.split_u8x64(a);
6526 let (b0, _) = self.split_u8x64(b);
6527 self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
6528 }
6529 #[inline(always)]
6530 fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6531 let (_, a1) = self.split_u8x64(a);
6532 let (_, b1) = self.split_u8x64(b);
6533 self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
6534 }
6535 #[inline(always)]
6536 fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6537 let (a0, a1) = self.split_u8x64(a);
6538 let (b0, b1) = self.split_u8x64(b);
6539 self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
6540 }
6541 #[inline(always)]
6542 fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6543 let (a0, a1) = self.split_u8x64(a);
6544 let (b0, b1) = self.split_u8x64(b);
6545 self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
6546 }
6547 #[inline(always)]
6548 fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
6549 let (a0, a1) = self.split_u8x64(a);
6550 let (b0, b1) = self.split_u8x64(b);
6551 let lo_lo = self.zip_low_u8x32(a0, b0);
6552 let lo_hi = self.zip_high_u8x32(a0, b0);
6553 let hi_lo = self.zip_low_u8x32(a1, b1);
6554 let hi_hi = self.zip_high_u8x32(a1, b1);
6555 (
6556 self.combine_u8x32(lo_lo, lo_hi),
6557 self.combine_u8x32(hi_lo, hi_hi),
6558 )
6559 }
6560 #[inline(always)]
6561 fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
6562 let (a0, a1) = self.split_u8x64(a);
6563 let (b0, b1) = self.split_u8x64(b);
6564 let lo_even = self.unzip_low_u8x32(a0, a1);
6565 let lo_odd = self.unzip_high_u8x32(a0, a1);
6566 let hi_even = self.unzip_low_u8x32(b0, b1);
6567 let hi_odd = self.unzip_high_u8x32(b0, b1);
6568 (
6569 self.combine_u8x32(lo_even, hi_even),
6570 self.combine_u8x32(lo_odd, hi_odd),
6571 )
6572 }
6573 #[inline(always)]
6574 fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
6575 let (a0, a1) = self.split_mask8x64(a);
6576 let (b0, b1) = self.split_u8x64(b);
6577 let (c0, c1) = self.split_u8x64(c);
6578 self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
6579 }
6580 #[inline(always)]
6581 fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6582 let (a0, a1) = self.split_u8x64(a);
6583 let (b0, b1) = self.split_u8x64(b);
6584 self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
6585 }
6586 #[inline(always)]
6587 fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
6588 let (a0, a1) = self.split_u8x64(a);
6589 let (b0, b1) = self.split_u8x64(b);
6590 self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
6591 }
6592 #[inline(always)]
6593 fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
6594 (
6595 u8x32 {
6596 val: crate::support::Aligned256(a.val.0[0]),
6597 simd: self,
6598 },
6599 u8x32 {
6600 val: crate::support::Aligned256(a.val.0[1]),
6601 simd: self,
6602 },
6603 )
6604 }
6605 #[inline(always)]
6606 fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
6607 unsafe {
6608 let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
6609 let v1 = _mm_loadu_si128(src.as_ptr().add(16usize) as *const _);
6610 let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 16usize) as *const _);
6611 let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 16usize) as *const _);
6612 let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
6613 let v0 = _mm_shuffle_epi8(v0, mask);
6614 let v1 = _mm_shuffle_epi8(v1, mask);
6615 let v2 = _mm_shuffle_epi8(v2, mask);
6616 let v3 = _mm_shuffle_epi8(v3, mask);
6617 let tmp0 = _mm_unpacklo_epi32(v0, v1);
6618 let tmp1 = _mm_unpackhi_epi32(v0, v1);
6619 let tmp2 = _mm_unpacklo_epi32(v2, v3);
6620 let tmp3 = _mm_unpackhi_epi32(v2, v3);
6621 let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
6622 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
6623 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
6624 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
6625 self.combine_u8x32(
6626 self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)),
6627 self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)),
6628 )
6629 }
6630 }
6631 #[inline(always)]
6632 fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
6633 let (v01, v23) = self.split_u8x64(a);
6634 let (v0, v1) = self.split_u8x32(v01);
6635 let (v2, v3) = self.split_u8x32(v23);
6636 let v0 = v0.into();
6637 let v1 = v1.into();
6638 let v2 = v2.into();
6639 let v3 = v3.into();
6640 unsafe {
6641 let tmp0 = _mm_unpacklo_epi32(v0, v1);
6642 let tmp1 = _mm_unpackhi_epi32(v0, v1);
6643 let tmp2 = _mm_unpacklo_epi32(v2, v3);
6644 let tmp3 = _mm_unpackhi_epi32(v2, v3);
6645 let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
6646 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
6647 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
6648 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
6649 let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
6650 let out0 = _mm_shuffle_epi8(out0, mask);
6651 let out1 = _mm_shuffle_epi8(out1, mask);
6652 let out2 = _mm_shuffle_epi8(out2, mask);
6653 let out3 = _mm_shuffle_epi8(out3, mask);
6654 _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
6655 _mm_storeu_si128(dest.as_mut_ptr().add(16usize) as *mut _, out1);
6656 _mm_storeu_si128(dest.as_mut_ptr().add(2 * 16usize) as *mut _, out2);
6657 _mm_storeu_si128(dest.as_mut_ptr().add(3 * 16usize) as *mut _, out3);
6658 }
6659 }
6660 #[inline(always)]
6661 fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
6662 let (a0, a1) = self.split_u8x64(a);
6663 self.combine_u32x8(
6664 self.reinterpret_u32_u8x32(a0),
6665 self.reinterpret_u32_u8x32(a1),
6666 )
6667 }
6668 #[inline(always)]
6669 fn splat_mask8x64(self, val: i8) -> mask8x64<Self> {
6670 let half = self.splat_mask8x32(val);
6671 self.combine_mask8x32(half, half)
6672 }
6673 #[inline(always)]
6674 fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
6675 mask8x64 {
6676 val: unsafe { core::mem::transmute_copy(&val) },
6677 simd: self,
6678 }
6679 }
6680 #[inline(always)]
6681 fn load_array_ref_mask8x64(self, val: &[i8; 64usize]) -> mask8x64<Self> {
6682 mask8x64 {
6683 val: unsafe { core::mem::transmute_copy(val) },
6684 simd: self,
6685 }
6686 }
6687 #[inline(always)]
6688 fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
6689 unsafe { core::mem::transmute::<[__m256i; 2usize], [i8; 64usize]>(a.val.0) }
6690 }
6691 #[inline(always)]
6692 fn as_array_ref_mask8x64(self, a: &mask8x64<Self>) -> &[i8; 64usize] {
6693 unsafe { core::mem::transmute::<&[__m256i; 2usize], &[i8; 64usize]>(&a.val.0) }
6694 }
6695 #[inline(always)]
6696 fn as_array_mut_mask8x64(self, a: &mut mask8x64<Self>) -> &mut [i8; 64usize] {
6697 unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [i8; 64usize]>(&mut a.val.0) }
6698 }
6699 #[inline(always)]
6700 fn store_array_mask8x64(self, a: mask8x64<Self>, dest: &mut [i8; 64usize]) -> () {
6701 unsafe {
6702 core::ptr::copy_nonoverlapping(
6703 (&raw const a.val.0) as *const i8,
6704 dest.as_mut_ptr(),
6705 64usize,
6706 );
6707 }
6708 }
6709 #[inline(always)]
6710 fn cvt_from_bytes_mask8x64(self, a: u8x64<Self>) -> mask8x64<Self> {
6711 unsafe {
6712 mask8x64 {
6713 val: core::mem::transmute(a.val),
6714 simd: self,
6715 }
6716 }
6717 }
6718 #[inline(always)]
6719 fn cvt_to_bytes_mask8x64(self, a: mask8x64<Self>) -> u8x64<Self> {
6720 unsafe {
6721 u8x64 {
6722 val: core::mem::transmute(a.val),
6723 simd: self,
6724 }
6725 }
6726 }
6727 #[inline(always)]
6728 fn slide_mask8x64<const SHIFT: usize>(
6729 self,
6730 a: mask8x64<Self>,
6731 b: mask8x64<Self>,
6732 ) -> mask8x64<Self> {
6733 unsafe {
6734 if SHIFT >= 64usize {
6735 return b;
6736 }
6737 let result = cross_block_alignr_256x2(
6738 self.cvt_to_bytes_mask8x64(b).val.0,
6739 self.cvt_to_bytes_mask8x64(a).val.0,
6740 SHIFT,
6741 );
6742 self.cvt_from_bytes_mask8x64(u8x64 {
6743 val: crate::support::Aligned512(result),
6744 simd: self,
6745 })
6746 }
6747 }
6748 #[inline(always)]
6749 fn slide_within_blocks_mask8x64<const SHIFT: usize>(
6750 self,
6751 a: mask8x64<Self>,
6752 b: mask8x64<Self>,
6753 ) -> mask8x64<Self> {
6754 let (a0, a1) = self.split_mask8x64(a);
6755 let (b0, b1) = self.split_mask8x64(b);
6756 self.combine_mask8x32(
6757 self.slide_within_blocks_mask8x32::<SHIFT>(a0, b0),
6758 self.slide_within_blocks_mask8x32::<SHIFT>(a1, b1),
6759 )
6760 }
6761 #[inline(always)]
6762 fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6763 let (a0, a1) = self.split_mask8x64(a);
6764 let (b0, b1) = self.split_mask8x64(b);
6765 self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
6766 }
6767 #[inline(always)]
6768 fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6769 let (a0, a1) = self.split_mask8x64(a);
6770 let (b0, b1) = self.split_mask8x64(b);
6771 self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
6772 }
6773 #[inline(always)]
6774 fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6775 let (a0, a1) = self.split_mask8x64(a);
6776 let (b0, b1) = self.split_mask8x64(b);
6777 self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
6778 }
6779 #[inline(always)]
6780 fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
6781 let (a0, a1) = self.split_mask8x64(a);
6782 self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
6783 }
6784 #[inline(always)]
6785 fn select_mask8x64(
6786 self,
6787 a: mask8x64<Self>,
6788 b: mask8x64<Self>,
6789 c: mask8x64<Self>,
6790 ) -> mask8x64<Self> {
6791 let (a0, a1) = self.split_mask8x64(a);
6792 let (b0, b1) = self.split_mask8x64(b);
6793 let (c0, c1) = self.split_mask8x64(c);
6794 self.combine_mask8x32(
6795 self.select_mask8x32(a0, b0, c0),
6796 self.select_mask8x32(a1, b1, c1),
6797 )
6798 }
6799 #[inline(always)]
6800 fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
6801 let (a0, a1) = self.split_mask8x64(a);
6802 let (b0, b1) = self.split_mask8x64(b);
6803 self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
6804 }
6805 #[inline(always)]
6806 fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
6807 let (a0, a1) = self.split_mask8x64(a);
6808 self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1)
6809 }
6810 #[inline(always)]
6811 fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
6812 let (a0, a1) = self.split_mask8x64(a);
6813 self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1)
6814 }
6815 #[inline(always)]
6816 fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
6817 let (a0, a1) = self.split_mask8x64(a);
6818 self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1)
6819 }
6820 #[inline(always)]
6821 fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
6822 let (a0, a1) = self.split_mask8x64(a);
6823 self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1)
6824 }
6825 #[inline(always)]
6826 fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
6827 (
6828 mask8x32 {
6829 val: crate::support::Aligned256(a.val.0[0]),
6830 simd: self,
6831 },
6832 mask8x32 {
6833 val: crate::support::Aligned256(a.val.0[1]),
6834 simd: self,
6835 },
6836 )
6837 }
6838 #[inline(always)]
6839 fn splat_i16x32(self, val: i16) -> i16x32<Self> {
6840 let half = self.splat_i16x16(val);
6841 self.combine_i16x16(half, half)
6842 }
6843 #[inline(always)]
6844 fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
6845 i16x32 {
6846 val: unsafe { core::mem::transmute_copy(&val) },
6847 simd: self,
6848 }
6849 }
6850 #[inline(always)]
6851 fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
6852 i16x32 {
6853 val: unsafe { core::mem::transmute_copy(val) },
6854 simd: self,
6855 }
6856 }
6857 #[inline(always)]
6858 fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
6859 unsafe { core::mem::transmute::<[__m256i; 2usize], [i16; 32usize]>(a.val.0) }
6860 }
6861 #[inline(always)]
6862 fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
6863 unsafe { core::mem::transmute::<&[__m256i; 2usize], &[i16; 32usize]>(&a.val.0) }
6864 }
6865 #[inline(always)]
6866 fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
6867 unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [i16; 32usize]>(&mut a.val.0) }
6868 }
6869 #[inline(always)]
6870 fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
6871 unsafe {
6872 core::ptr::copy_nonoverlapping(
6873 (&raw const a.val.0) as *const i16,
6874 dest.as_mut_ptr(),
6875 32usize,
6876 );
6877 }
6878 }
6879 #[inline(always)]
6880 fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
6881 unsafe {
6882 i16x32 {
6883 val: core::mem::transmute(a.val),
6884 simd: self,
6885 }
6886 }
6887 }
6888 #[inline(always)]
6889 fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
6890 unsafe {
6891 u8x64 {
6892 val: core::mem::transmute(a.val),
6893 simd: self,
6894 }
6895 }
6896 }
6897 #[inline(always)]
6898 fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6899 unsafe {
6900 if SHIFT >= 32usize {
6901 return b;
6902 }
6903 let result = cross_block_alignr_256x2(
6904 self.cvt_to_bytes_i16x32(b).val.0,
6905 self.cvt_to_bytes_i16x32(a).val.0,
6906 SHIFT * 2usize,
6907 );
6908 self.cvt_from_bytes_i16x32(u8x64 {
6909 val: crate::support::Aligned512(result),
6910 simd: self,
6911 })
6912 }
6913 }
6914 #[inline(always)]
6915 fn slide_within_blocks_i16x32<const SHIFT: usize>(
6916 self,
6917 a: i16x32<Self>,
6918 b: i16x32<Self>,
6919 ) -> i16x32<Self> {
6920 let (a0, a1) = self.split_i16x32(a);
6921 let (b0, b1) = self.split_i16x32(b);
6922 self.combine_i16x16(
6923 self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
6924 self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
6925 )
6926 }
6927 #[inline(always)]
6928 fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6929 let (a0, a1) = self.split_i16x32(a);
6930 let (b0, b1) = self.split_i16x32(b);
6931 self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
6932 }
6933 #[inline(always)]
6934 fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6935 let (a0, a1) = self.split_i16x32(a);
6936 let (b0, b1) = self.split_i16x32(b);
6937 self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
6938 }
6939 #[inline(always)]
6940 fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6941 let (a0, a1) = self.split_i16x32(a);
6942 let (b0, b1) = self.split_i16x32(b);
6943 self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
6944 }
6945 #[inline(always)]
6946 fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6947 let (a0, a1) = self.split_i16x32(a);
6948 let (b0, b1) = self.split_i16x32(b);
6949 self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
6950 }
6951 #[inline(always)]
6952 fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6953 let (a0, a1) = self.split_i16x32(a);
6954 let (b0, b1) = self.split_i16x32(b);
6955 self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
6956 }
6957 #[inline(always)]
6958 fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6959 let (a0, a1) = self.split_i16x32(a);
6960 let (b0, b1) = self.split_i16x32(b);
6961 self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
6962 }
6963 #[inline(always)]
6964 fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
6965 let (a0, a1) = self.split_i16x32(a);
6966 self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
6967 }
6968 #[inline(always)]
6969 fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
6970 let (a0, a1) = self.split_i16x32(a);
6971 self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift))
6972 }
6973 #[inline(always)]
6974 fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6975 let (a0, a1) = self.split_i16x32(a);
6976 let (b0, b1) = self.split_i16x32(b);
6977 self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1))
6978 }
6979 #[inline(always)]
6980 fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
6981 let (a0, a1) = self.split_i16x32(a);
6982 self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift))
6983 }
6984 #[inline(always)]
6985 fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
6986 let (a0, a1) = self.split_i16x32(a);
6987 let (b0, b1) = self.split_i16x32(b);
6988 self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
6989 }
6990 #[inline(always)]
6991 fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
6992 let (a0, a1) = self.split_i16x32(a);
6993 let (b0, b1) = self.split_i16x32(b);
6994 self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
6995 }
6996 #[inline(always)]
6997 fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
6998 let (a0, a1) = self.split_i16x32(a);
6999 let (b0, b1) = self.split_i16x32(b);
7000 self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
7001 }
7002 #[inline(always)]
7003 fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7004 let (a0, a1) = self.split_i16x32(a);
7005 let (b0, b1) = self.split_i16x32(b);
7006 self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
7007 }
7008 #[inline(always)]
7009 fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7010 let (a0, a1) = self.split_i16x32(a);
7011 let (b0, b1) = self.split_i16x32(b);
7012 self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
7013 }
7014 #[inline(always)]
7015 fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
7016 let (a0, a1) = self.split_i16x32(a);
7017 let (b0, b1) = self.split_i16x32(b);
7018 self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
7019 }
7020 #[inline(always)]
7021 fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7022 let (a0, _) = self.split_i16x32(a);
7023 let (b0, _) = self.split_i16x32(b);
7024 self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
7025 }
7026 #[inline(always)]
7027 fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7028 let (_, a1) = self.split_i16x32(a);
7029 let (_, b1) = self.split_i16x32(b);
7030 self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
7031 }
7032 #[inline(always)]
7033 fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7034 let (a0, a1) = self.split_i16x32(a);
7035 let (b0, b1) = self.split_i16x32(b);
7036 self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
7037 }
7038 #[inline(always)]
7039 fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7040 let (a0, a1) = self.split_i16x32(a);
7041 let (b0, b1) = self.split_i16x32(b);
7042 self.combine_i16x16(
7043 self.unzip_high_i16x16(a0, a1),
7044 self.unzip_high_i16x16(b0, b1),
7045 )
7046 }
7047 #[inline(always)]
7048 fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
7049 let (a0, a1) = self.split_i16x32(a);
7050 let (b0, b1) = self.split_i16x32(b);
7051 let lo_lo = self.zip_low_i16x16(a0, b0);
7052 let lo_hi = self.zip_high_i16x16(a0, b0);
7053 let hi_lo = self.zip_low_i16x16(a1, b1);
7054 let hi_hi = self.zip_high_i16x16(a1, b1);
7055 (
7056 self.combine_i16x16(lo_lo, lo_hi),
7057 self.combine_i16x16(hi_lo, hi_hi),
7058 )
7059 }
7060 #[inline(always)]
7061 fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
7062 let (a0, a1) = self.split_i16x32(a);
7063 let (b0, b1) = self.split_i16x32(b);
7064 let lo_even = self.unzip_low_i16x16(a0, a1);
7065 let lo_odd = self.unzip_high_i16x16(a0, a1);
7066 let hi_even = self.unzip_low_i16x16(b0, b1);
7067 let hi_odd = self.unzip_high_i16x16(b0, b1);
7068 (
7069 self.combine_i16x16(lo_even, hi_even),
7070 self.combine_i16x16(lo_odd, hi_odd),
7071 )
7072 }
7073 #[inline(always)]
7074 fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
7075 let (a0, a1) = self.split_mask16x32(a);
7076 let (b0, b1) = self.split_i16x32(b);
7077 let (c0, c1) = self.split_i16x32(c);
7078 self.combine_i16x16(
7079 self.select_i16x16(a0, b0, c0),
7080 self.select_i16x16(a1, b1, c1),
7081 )
7082 }
7083 #[inline(always)]
7084 fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7085 let (a0, a1) = self.split_i16x32(a);
7086 let (b0, b1) = self.split_i16x32(b);
7087 self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
7088 }
7089 #[inline(always)]
7090 fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
7091 let (a0, a1) = self.split_i16x32(a);
7092 let (b0, b1) = self.split_i16x32(b);
7093 self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
7094 }
7095 #[inline(always)]
7096 fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
7097 (
7098 i16x16 {
7099 val: crate::support::Aligned256(a.val.0[0]),
7100 simd: self,
7101 },
7102 i16x16 {
7103 val: crate::support::Aligned256(a.val.0[1]),
7104 simd: self,
7105 },
7106 )
7107 }
7108 #[inline(always)]
7109 fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
7110 let (a0, a1) = self.split_i16x32(a);
7111 self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
7112 }
7113 #[inline(always)]
7114 fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
7115 let (a0, a1) = self.split_i16x32(a);
7116 self.combine_u8x32(
7117 self.reinterpret_u8_i16x16(a0),
7118 self.reinterpret_u8_i16x16(a1),
7119 )
7120 }
7121 #[inline(always)]
7122 fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
7123 let (a0, a1) = self.split_i16x32(a);
7124 self.combine_u32x8(
7125 self.reinterpret_u32_i16x16(a0),
7126 self.reinterpret_u32_i16x16(a1),
7127 )
7128 }
7129 #[inline(always)]
7130 fn splat_u16x32(self, val: u16) -> u16x32<Self> {
7131 let half = self.splat_u16x16(val);
7132 self.combine_u16x16(half, half)
7133 }
7134 #[inline(always)]
7135 fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
7136 u16x32 {
7137 val: unsafe { core::mem::transmute_copy(&val) },
7138 simd: self,
7139 }
7140 }
7141 #[inline(always)]
7142 fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
7143 u16x32 {
7144 val: unsafe { core::mem::transmute_copy(val) },
7145 simd: self,
7146 }
7147 }
7148 #[inline(always)]
7149 fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
7150 unsafe { core::mem::transmute::<[__m256i; 2usize], [u16; 32usize]>(a.val.0) }
7151 }
7152 #[inline(always)]
7153 fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
7154 unsafe { core::mem::transmute::<&[__m256i; 2usize], &[u16; 32usize]>(&a.val.0) }
7155 }
7156 #[inline(always)]
7157 fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
7158 unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [u16; 32usize]>(&mut a.val.0) }
7159 }
7160 #[inline(always)]
7161 fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
7162 unsafe {
7163 core::ptr::copy_nonoverlapping(
7164 (&raw const a.val.0) as *const u16,
7165 dest.as_mut_ptr(),
7166 32usize,
7167 );
7168 }
7169 }
7170 #[inline(always)]
7171 fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
7172 unsafe {
7173 u16x32 {
7174 val: core::mem::transmute(a.val),
7175 simd: self,
7176 }
7177 }
7178 }
7179 #[inline(always)]
7180 fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
7181 unsafe {
7182 u8x64 {
7183 val: core::mem::transmute(a.val),
7184 simd: self,
7185 }
7186 }
7187 }
7188 #[inline(always)]
7189 fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7190 unsafe {
7191 if SHIFT >= 32usize {
7192 return b;
7193 }
7194 let result = cross_block_alignr_256x2(
7195 self.cvt_to_bytes_u16x32(b).val.0,
7196 self.cvt_to_bytes_u16x32(a).val.0,
7197 SHIFT * 2usize,
7198 );
7199 self.cvt_from_bytes_u16x32(u8x64 {
7200 val: crate::support::Aligned512(result),
7201 simd: self,
7202 })
7203 }
7204 }
7205 #[inline(always)]
7206 fn slide_within_blocks_u16x32<const SHIFT: usize>(
7207 self,
7208 a: u16x32<Self>,
7209 b: u16x32<Self>,
7210 ) -> u16x32<Self> {
7211 let (a0, a1) = self.split_u16x32(a);
7212 let (b0, b1) = self.split_u16x32(b);
7213 self.combine_u16x16(
7214 self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
7215 self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
7216 )
7217 }
7218 #[inline(always)]
7219 fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7220 let (a0, a1) = self.split_u16x32(a);
7221 let (b0, b1) = self.split_u16x32(b);
7222 self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
7223 }
7224 #[inline(always)]
7225 fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7226 let (a0, a1) = self.split_u16x32(a);
7227 let (b0, b1) = self.split_u16x32(b);
7228 self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
7229 }
7230 #[inline(always)]
7231 fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7232 let (a0, a1) = self.split_u16x32(a);
7233 let (b0, b1) = self.split_u16x32(b);
7234 self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
7235 }
7236 #[inline(always)]
7237 fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7238 let (a0, a1) = self.split_u16x32(a);
7239 let (b0, b1) = self.split_u16x32(b);
7240 self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
7241 }
7242 #[inline(always)]
7243 fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7244 let (a0, a1) = self.split_u16x32(a);
7245 let (b0, b1) = self.split_u16x32(b);
7246 self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
7247 }
7248 #[inline(always)]
7249 fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7250 let (a0, a1) = self.split_u16x32(a);
7251 let (b0, b1) = self.split_u16x32(b);
7252 self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
7253 }
7254 #[inline(always)]
7255 fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
7256 let (a0, a1) = self.split_u16x32(a);
7257 self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
7258 }
7259 #[inline(always)]
7260 fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
7261 let (a0, a1) = self.split_u16x32(a);
7262 self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift))
7263 }
7264 #[inline(always)]
7265 fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7266 let (a0, a1) = self.split_u16x32(a);
7267 let (b0, b1) = self.split_u16x32(b);
7268 self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1))
7269 }
7270 #[inline(always)]
7271 fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
7272 let (a0, a1) = self.split_u16x32(a);
7273 self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift))
7274 }
7275 #[inline(always)]
7276 fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7277 let (a0, a1) = self.split_u16x32(a);
7278 let (b0, b1) = self.split_u16x32(b);
7279 self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
7280 }
7281 #[inline(always)]
7282 fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7283 let (a0, a1) = self.split_u16x32(a);
7284 let (b0, b1) = self.split_u16x32(b);
7285 self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
7286 }
7287 #[inline(always)]
7288 fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7289 let (a0, a1) = self.split_u16x32(a);
7290 let (b0, b1) = self.split_u16x32(b);
7291 self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
7292 }
7293 #[inline(always)]
7294 fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7295 let (a0, a1) = self.split_u16x32(a);
7296 let (b0, b1) = self.split_u16x32(b);
7297 self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
7298 }
7299 #[inline(always)]
7300 fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7301 let (a0, a1) = self.split_u16x32(a);
7302 let (b0, b1) = self.split_u16x32(b);
7303 self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
7304 }
7305 #[inline(always)]
7306 fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
7307 let (a0, a1) = self.split_u16x32(a);
7308 let (b0, b1) = self.split_u16x32(b);
7309 self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
7310 }
7311 #[inline(always)]
7312 fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7313 let (a0, _) = self.split_u16x32(a);
7314 let (b0, _) = self.split_u16x32(b);
7315 self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
7316 }
7317 #[inline(always)]
7318 fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7319 let (_, a1) = self.split_u16x32(a);
7320 let (_, b1) = self.split_u16x32(b);
7321 self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
7322 }
7323 #[inline(always)]
7324 fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7325 let (a0, a1) = self.split_u16x32(a);
7326 let (b0, b1) = self.split_u16x32(b);
7327 self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
7328 }
7329 #[inline(always)]
7330 fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7331 let (a0, a1) = self.split_u16x32(a);
7332 let (b0, b1) = self.split_u16x32(b);
7333 self.combine_u16x16(
7334 self.unzip_high_u16x16(a0, a1),
7335 self.unzip_high_u16x16(b0, b1),
7336 )
7337 }
7338 #[inline(always)]
7339 fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
7340 let (a0, a1) = self.split_u16x32(a);
7341 let (b0, b1) = self.split_u16x32(b);
7342 let lo_lo = self.zip_low_u16x16(a0, b0);
7343 let lo_hi = self.zip_high_u16x16(a0, b0);
7344 let hi_lo = self.zip_low_u16x16(a1, b1);
7345 let hi_hi = self.zip_high_u16x16(a1, b1);
7346 (
7347 self.combine_u16x16(lo_lo, lo_hi),
7348 self.combine_u16x16(hi_lo, hi_hi),
7349 )
7350 }
7351 #[inline(always)]
7352 fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
7353 let (a0, a1) = self.split_u16x32(a);
7354 let (b0, b1) = self.split_u16x32(b);
7355 let lo_even = self.unzip_low_u16x16(a0, a1);
7356 let lo_odd = self.unzip_high_u16x16(a0, a1);
7357 let hi_even = self.unzip_low_u16x16(b0, b1);
7358 let hi_odd = self.unzip_high_u16x16(b0, b1);
7359 (
7360 self.combine_u16x16(lo_even, hi_even),
7361 self.combine_u16x16(lo_odd, hi_odd),
7362 )
7363 }
7364 #[inline(always)]
7365 fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
7366 let (a0, a1) = self.split_mask16x32(a);
7367 let (b0, b1) = self.split_u16x32(b);
7368 let (c0, c1) = self.split_u16x32(c);
7369 self.combine_u16x16(
7370 self.select_u16x16(a0, b0, c0),
7371 self.select_u16x16(a1, b1, c1),
7372 )
7373 }
7374 #[inline(always)]
7375 fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7376 let (a0, a1) = self.split_u16x32(a);
7377 let (b0, b1) = self.split_u16x32(b);
7378 self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
7379 }
7380 #[inline(always)]
7381 fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
7382 let (a0, a1) = self.split_u16x32(a);
7383 let (b0, b1) = self.split_u16x32(b);
7384 self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
7385 }
7386 #[inline(always)]
7387 fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
7388 (
7389 u16x16 {
7390 val: crate::support::Aligned256(a.val.0[0]),
7391 simd: self,
7392 },
7393 u16x16 {
7394 val: crate::support::Aligned256(a.val.0[1]),
7395 simd: self,
7396 },
7397 )
7398 }
7399 #[inline(always)]
7400 fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
7401 unsafe {
7402 let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
7403 let v1 = _mm_loadu_si128(src.as_ptr().add(8usize) as *const _);
7404 let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 8usize) as *const _);
7405 let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 8usize) as *const _);
7406 let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
7407 let v0 = _mm_shuffle_epi8(v0, mask);
7408 let v1 = _mm_shuffle_epi8(v1, mask);
7409 let v2 = _mm_shuffle_epi8(v2, mask);
7410 let v3 = _mm_shuffle_epi8(v3, mask);
7411 let tmp0 = _mm_unpacklo_epi32(v0, v1);
7412 let tmp1 = _mm_unpackhi_epi32(v0, v1);
7413 let tmp2 = _mm_unpacklo_epi32(v2, v3);
7414 let tmp3 = _mm_unpackhi_epi32(v2, v3);
7415 let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
7416 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
7417 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
7418 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
7419 self.combine_u16x16(
7420 self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)),
7421 self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)),
7422 )
7423 }
7424 }
7425 #[inline(always)]
7426 fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
7427 let (v01, v23) = self.split_u16x32(a);
7428 let (v0, v1) = self.split_u16x16(v01);
7429 let (v2, v3) = self.split_u16x16(v23);
7430 let v0 = v0.into();
7431 let v1 = v1.into();
7432 let v2 = v2.into();
7433 let v3 = v3.into();
7434 unsafe {
7435 let tmp0 = _mm_unpacklo_epi32(v0, v1);
7436 let tmp1 = _mm_unpackhi_epi32(v0, v1);
7437 let tmp2 = _mm_unpacklo_epi32(v2, v3);
7438 let tmp3 = _mm_unpackhi_epi32(v2, v3);
7439 let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
7440 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
7441 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
7442 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
7443 let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
7444 let out0 = _mm_shuffle_epi8(out0, mask);
7445 let out1 = _mm_shuffle_epi8(out1, mask);
7446 let out2 = _mm_shuffle_epi8(out2, mask);
7447 let out3 = _mm_shuffle_epi8(out3, mask);
7448 _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
7449 _mm_storeu_si128(dest.as_mut_ptr().add(8usize) as *mut _, out1);
7450 _mm_storeu_si128(dest.as_mut_ptr().add(2 * 8usize) as *mut _, out2);
7451 _mm_storeu_si128(dest.as_mut_ptr().add(3 * 8usize) as *mut _, out3);
7452 }
7453 }
7454 #[inline(always)]
7455 fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
7456 let (a, b) = self.split_u16x32(a);
7457 unsafe {
7458 let mask = _mm256_set1_epi16(0xFF);
7459 let lo_masked = _mm256_and_si256(a.into(), mask);
7460 let hi_masked = _mm256_and_si256(b.into(), mask);
7461 let result = _mm256_permute4x64_epi64::<0b_11_01_10_00>(_mm256_packus_epi16(
7462 lo_masked, hi_masked,
7463 ));
7464 result.simd_into(self)
7465 }
7466 }
7467 #[inline(always)]
7468 fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
7469 let (a0, a1) = self.split_u16x32(a);
7470 self.combine_u8x32(
7471 self.reinterpret_u8_u16x16(a0),
7472 self.reinterpret_u8_u16x16(a1),
7473 )
7474 }
7475 #[inline(always)]
7476 fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
7477 let (a0, a1) = self.split_u16x32(a);
7478 self.combine_u32x8(
7479 self.reinterpret_u32_u16x16(a0),
7480 self.reinterpret_u32_u16x16(a1),
7481 )
7482 }
7483 #[inline(always)]
7484 fn splat_mask16x32(self, val: i16) -> mask16x32<Self> {
7485 let half = self.splat_mask16x16(val);
7486 self.combine_mask16x16(half, half)
7487 }
7488 #[inline(always)]
7489 fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
7490 mask16x32 {
7491 val: unsafe { core::mem::transmute_copy(&val) },
7492 simd: self,
7493 }
7494 }
7495 #[inline(always)]
7496 fn load_array_ref_mask16x32(self, val: &[i16; 32usize]) -> mask16x32<Self> {
7497 mask16x32 {
7498 val: unsafe { core::mem::transmute_copy(val) },
7499 simd: self,
7500 }
7501 }
7502 #[inline(always)]
7503 fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
7504 unsafe { core::mem::transmute::<[__m256i; 2usize], [i16; 32usize]>(a.val.0) }
7505 }
7506 #[inline(always)]
7507 fn as_array_ref_mask16x32(self, a: &mask16x32<Self>) -> &[i16; 32usize] {
7508 unsafe { core::mem::transmute::<&[__m256i; 2usize], &[i16; 32usize]>(&a.val.0) }
7509 }
7510 #[inline(always)]
7511 fn as_array_mut_mask16x32(self, a: &mut mask16x32<Self>) -> &mut [i16; 32usize] {
7512 unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [i16; 32usize]>(&mut a.val.0) }
7513 }
7514 #[inline(always)]
7515 fn store_array_mask16x32(self, a: mask16x32<Self>, dest: &mut [i16; 32usize]) -> () {
7516 unsafe {
7517 core::ptr::copy_nonoverlapping(
7518 (&raw const a.val.0) as *const i16,
7519 dest.as_mut_ptr(),
7520 32usize,
7521 );
7522 }
7523 }
7524 #[inline(always)]
7525 fn cvt_from_bytes_mask16x32(self, a: u8x64<Self>) -> mask16x32<Self> {
7526 unsafe {
7527 mask16x32 {
7528 val: core::mem::transmute(a.val),
7529 simd: self,
7530 }
7531 }
7532 }
7533 #[inline(always)]
7534 fn cvt_to_bytes_mask16x32(self, a: mask16x32<Self>) -> u8x64<Self> {
7535 unsafe {
7536 u8x64 {
7537 val: core::mem::transmute(a.val),
7538 simd: self,
7539 }
7540 }
7541 }
7542 #[inline(always)]
7543 fn slide_mask16x32<const SHIFT: usize>(
7544 self,
7545 a: mask16x32<Self>,
7546 b: mask16x32<Self>,
7547 ) -> mask16x32<Self> {
7548 unsafe {
7549 if SHIFT >= 32usize {
7550 return b;
7551 }
7552 let result = cross_block_alignr_256x2(
7553 self.cvt_to_bytes_mask16x32(b).val.0,
7554 self.cvt_to_bytes_mask16x32(a).val.0,
7555 SHIFT * 2usize,
7556 );
7557 self.cvt_from_bytes_mask16x32(u8x64 {
7558 val: crate::support::Aligned512(result),
7559 simd: self,
7560 })
7561 }
7562 }
7563 #[inline(always)]
7564 fn slide_within_blocks_mask16x32<const SHIFT: usize>(
7565 self,
7566 a: mask16x32<Self>,
7567 b: mask16x32<Self>,
7568 ) -> mask16x32<Self> {
7569 let (a0, a1) = self.split_mask16x32(a);
7570 let (b0, b1) = self.split_mask16x32(b);
7571 self.combine_mask16x16(
7572 self.slide_within_blocks_mask16x16::<SHIFT>(a0, b0),
7573 self.slide_within_blocks_mask16x16::<SHIFT>(a1, b1),
7574 )
7575 }
7576 #[inline(always)]
7577 fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7578 let (a0, a1) = self.split_mask16x32(a);
7579 let (b0, b1) = self.split_mask16x32(b);
7580 self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
7581 }
7582 #[inline(always)]
7583 fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7584 let (a0, a1) = self.split_mask16x32(a);
7585 let (b0, b1) = self.split_mask16x32(b);
7586 self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
7587 }
7588 #[inline(always)]
7589 fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7590 let (a0, a1) = self.split_mask16x32(a);
7591 let (b0, b1) = self.split_mask16x32(b);
7592 self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
7593 }
7594 #[inline(always)]
7595 fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
7596 let (a0, a1) = self.split_mask16x32(a);
7597 self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
7598 }
7599 #[inline(always)]
7600 fn select_mask16x32(
7601 self,
7602 a: mask16x32<Self>,
7603 b: mask16x32<Self>,
7604 c: mask16x32<Self>,
7605 ) -> mask16x32<Self> {
7606 let (a0, a1) = self.split_mask16x32(a);
7607 let (b0, b1) = self.split_mask16x32(b);
7608 let (c0, c1) = self.split_mask16x32(c);
7609 self.combine_mask16x16(
7610 self.select_mask16x16(a0, b0, c0),
7611 self.select_mask16x16(a1, b1, c1),
7612 )
7613 }
7614 #[inline(always)]
7615 fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
7616 let (a0, a1) = self.split_mask16x32(a);
7617 let (b0, b1) = self.split_mask16x32(b);
7618 self.combine_mask16x16(
7619 self.simd_eq_mask16x16(a0, b0),
7620 self.simd_eq_mask16x16(a1, b1),
7621 )
7622 }
7623 #[inline(always)]
7624 fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
7625 let (a0, a1) = self.split_mask16x32(a);
7626 self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1)
7627 }
7628 #[inline(always)]
7629 fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
7630 let (a0, a1) = self.split_mask16x32(a);
7631 self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1)
7632 }
7633 #[inline(always)]
7634 fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
7635 let (a0, a1) = self.split_mask16x32(a);
7636 self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1)
7637 }
7638 #[inline(always)]
7639 fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
7640 let (a0, a1) = self.split_mask16x32(a);
7641 self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1)
7642 }
7643 #[inline(always)]
7644 fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
7645 (
7646 mask16x16 {
7647 val: crate::support::Aligned256(a.val.0[0]),
7648 simd: self,
7649 },
7650 mask16x16 {
7651 val: crate::support::Aligned256(a.val.0[1]),
7652 simd: self,
7653 },
7654 )
7655 }
7656 #[inline(always)]
7657 fn splat_i32x16(self, val: i32) -> i32x16<Self> {
7658 let half = self.splat_i32x8(val);
7659 self.combine_i32x8(half, half)
7660 }
7661 #[inline(always)]
7662 fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
7663 i32x16 {
7664 val: unsafe { core::mem::transmute_copy(&val) },
7665 simd: self,
7666 }
7667 }
7668 #[inline(always)]
7669 fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
7670 i32x16 {
7671 val: unsafe { core::mem::transmute_copy(val) },
7672 simd: self,
7673 }
7674 }
7675 #[inline(always)]
7676 fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
7677 unsafe { core::mem::transmute::<[__m256i; 2usize], [i32; 16usize]>(a.val.0) }
7678 }
7679 #[inline(always)]
7680 fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
7681 unsafe { core::mem::transmute::<&[__m256i; 2usize], &[i32; 16usize]>(&a.val.0) }
7682 }
7683 #[inline(always)]
7684 fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
7685 unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [i32; 16usize]>(&mut a.val.0) }
7686 }
7687 #[inline(always)]
7688 fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
7689 unsafe {
7690 core::ptr::copy_nonoverlapping(
7691 (&raw const a.val.0) as *const i32,
7692 dest.as_mut_ptr(),
7693 16usize,
7694 );
7695 }
7696 }
7697 #[inline(always)]
7698 fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
7699 unsafe {
7700 i32x16 {
7701 val: core::mem::transmute(a.val),
7702 simd: self,
7703 }
7704 }
7705 }
7706 #[inline(always)]
7707 fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
7708 unsafe {
7709 u8x64 {
7710 val: core::mem::transmute(a.val),
7711 simd: self,
7712 }
7713 }
7714 }
7715 #[inline(always)]
7716 fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7717 unsafe {
7718 if SHIFT >= 16usize {
7719 return b;
7720 }
7721 let result = cross_block_alignr_256x2(
7722 self.cvt_to_bytes_i32x16(b).val.0,
7723 self.cvt_to_bytes_i32x16(a).val.0,
7724 SHIFT * 4usize,
7725 );
7726 self.cvt_from_bytes_i32x16(u8x64 {
7727 val: crate::support::Aligned512(result),
7728 simd: self,
7729 })
7730 }
7731 }
7732 #[inline(always)]
7733 fn slide_within_blocks_i32x16<const SHIFT: usize>(
7734 self,
7735 a: i32x16<Self>,
7736 b: i32x16<Self>,
7737 ) -> i32x16<Self> {
7738 let (a0, a1) = self.split_i32x16(a);
7739 let (b0, b1) = self.split_i32x16(b);
7740 self.combine_i32x8(
7741 self.slide_within_blocks_i32x8::<SHIFT>(a0, b0),
7742 self.slide_within_blocks_i32x8::<SHIFT>(a1, b1),
7743 )
7744 }
7745 #[inline(always)]
7746 fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7747 let (a0, a1) = self.split_i32x16(a);
7748 let (b0, b1) = self.split_i32x16(b);
7749 self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
7750 }
7751 #[inline(always)]
7752 fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7753 let (a0, a1) = self.split_i32x16(a);
7754 let (b0, b1) = self.split_i32x16(b);
7755 self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
7756 }
7757 #[inline(always)]
7758 fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7759 let (a0, a1) = self.split_i32x16(a);
7760 let (b0, b1) = self.split_i32x16(b);
7761 self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
7762 }
7763 #[inline(always)]
7764 fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7765 let (a0, a1) = self.split_i32x16(a);
7766 let (b0, b1) = self.split_i32x16(b);
7767 self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
7768 }
7769 #[inline(always)]
7770 fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7771 let (a0, a1) = self.split_i32x16(a);
7772 let (b0, b1) = self.split_i32x16(b);
7773 self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
7774 }
7775 #[inline(always)]
7776 fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7777 let (a0, a1) = self.split_i32x16(a);
7778 let (b0, b1) = self.split_i32x16(b);
7779 self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
7780 }
7781 #[inline(always)]
7782 fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
7783 let (a0, a1) = self.split_i32x16(a);
7784 self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
7785 }
7786 #[inline(always)]
7787 fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
7788 let (a0, a1) = self.split_i32x16(a);
7789 self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift))
7790 }
7791 #[inline(always)]
7792 fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7793 let (a0, a1) = self.split_i32x16(a);
7794 let (b0, b1) = self.split_i32x16(b);
7795 self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1))
7796 }
7797 #[inline(always)]
7798 fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
7799 let (a0, a1) = self.split_i32x16(a);
7800 self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift))
7801 }
7802 #[inline(always)]
7803 fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7804 let (a0, a1) = self.split_i32x16(a);
7805 let (b0, b1) = self.split_i32x16(b);
7806 self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1))
7807 }
7808 #[inline(always)]
7809 fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7810 let (a0, a1) = self.split_i32x16(a);
7811 let (b0, b1) = self.split_i32x16(b);
7812 self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
7813 }
7814 #[inline(always)]
7815 fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7816 let (a0, a1) = self.split_i32x16(a);
7817 let (b0, b1) = self.split_i32x16(b);
7818 self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
7819 }
7820 #[inline(always)]
7821 fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7822 let (a0, a1) = self.split_i32x16(a);
7823 let (b0, b1) = self.split_i32x16(b);
7824 self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
7825 }
7826 #[inline(always)]
7827 fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7828 let (a0, a1) = self.split_i32x16(a);
7829 let (b0, b1) = self.split_i32x16(b);
7830 self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
7831 }
7832 #[inline(always)]
7833 fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
7834 let (a0, a1) = self.split_i32x16(a);
7835 let (b0, b1) = self.split_i32x16(b);
7836 self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
7837 }
7838 #[inline(always)]
7839 fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7840 let (a0, _) = self.split_i32x16(a);
7841 let (b0, _) = self.split_i32x16(b);
7842 self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
7843 }
7844 #[inline(always)]
7845 fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7846 let (_, a1) = self.split_i32x16(a);
7847 let (_, b1) = self.split_i32x16(b);
7848 self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
7849 }
7850 #[inline(always)]
7851 fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7852 let (a0, a1) = self.split_i32x16(a);
7853 let (b0, b1) = self.split_i32x16(b);
7854 self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
7855 }
7856 #[inline(always)]
7857 fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7858 let (a0, a1) = self.split_i32x16(a);
7859 let (b0, b1) = self.split_i32x16(b);
7860 self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
7861 }
7862 #[inline(always)]
7863 fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
7864 let (a0, a1) = self.split_i32x16(a);
7865 let (b0, b1) = self.split_i32x16(b);
7866 let lo_lo = self.zip_low_i32x8(a0, b0);
7867 let lo_hi = self.zip_high_i32x8(a0, b0);
7868 let hi_lo = self.zip_low_i32x8(a1, b1);
7869 let hi_hi = self.zip_high_i32x8(a1, b1);
7870 (
7871 self.combine_i32x8(lo_lo, lo_hi),
7872 self.combine_i32x8(hi_lo, hi_hi),
7873 )
7874 }
7875 #[inline(always)]
7876 fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
7877 let (a0, a1) = self.split_i32x16(a);
7878 let (b0, b1) = self.split_i32x16(b);
7879 let lo_even = self.unzip_low_i32x8(a0, a1);
7880 let lo_odd = self.unzip_high_i32x8(a0, a1);
7881 let hi_even = self.unzip_low_i32x8(b0, b1);
7882 let hi_odd = self.unzip_high_i32x8(b0, b1);
7883 (
7884 self.combine_i32x8(lo_even, hi_even),
7885 self.combine_i32x8(lo_odd, hi_odd),
7886 )
7887 }
7888 #[inline(always)]
7889 fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
7890 let (a0, a1) = self.split_mask32x16(a);
7891 let (b0, b1) = self.split_i32x16(b);
7892 let (c0, c1) = self.split_i32x16(c);
7893 self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
7894 }
7895 #[inline(always)]
7896 fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7897 let (a0, a1) = self.split_i32x16(a);
7898 let (b0, b1) = self.split_i32x16(b);
7899 self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
7900 }
7901 #[inline(always)]
7902 fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
7903 let (a0, a1) = self.split_i32x16(a);
7904 let (b0, b1) = self.split_i32x16(b);
7905 self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
7906 }
7907 #[inline(always)]
7908 fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
7909 (
7910 i32x8 {
7911 val: crate::support::Aligned256(a.val.0[0]),
7912 simd: self,
7913 },
7914 i32x8 {
7915 val: crate::support::Aligned256(a.val.0[1]),
7916 simd: self,
7917 },
7918 )
7919 }
7920 #[inline(always)]
7921 fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
7922 let (a0, a1) = self.split_i32x16(a);
7923 self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1))
7924 }
7925 #[inline(always)]
7926 fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
7927 let (a0, a1) = self.split_i32x16(a);
7928 self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
7929 }
7930 #[inline(always)]
7931 fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
7932 let (a0, a1) = self.split_i32x16(a);
7933 self.combine_u32x8(
7934 self.reinterpret_u32_i32x8(a0),
7935 self.reinterpret_u32_i32x8(a1),
7936 )
7937 }
7938 #[inline(always)]
7939 fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
7940 let (a0, a1) = self.split_i32x16(a);
7941 self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
7942 }
7943 #[inline(always)]
7944 fn splat_u32x16(self, val: u32) -> u32x16<Self> {
7945 let half = self.splat_u32x8(val);
7946 self.combine_u32x8(half, half)
7947 }
7948 #[inline(always)]
7949 fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
7950 u32x16 {
7951 val: unsafe { core::mem::transmute_copy(&val) },
7952 simd: self,
7953 }
7954 }
7955 #[inline(always)]
7956 fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
7957 u32x16 {
7958 val: unsafe { core::mem::transmute_copy(val) },
7959 simd: self,
7960 }
7961 }
7962 #[inline(always)]
7963 fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
7964 unsafe { core::mem::transmute::<[__m256i; 2usize], [u32; 16usize]>(a.val.0) }
7965 }
7966 #[inline(always)]
7967 fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
7968 unsafe { core::mem::transmute::<&[__m256i; 2usize], &[u32; 16usize]>(&a.val.0) }
7969 }
7970 #[inline(always)]
7971 fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
7972 unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [u32; 16usize]>(&mut a.val.0) }
7973 }
7974 #[inline(always)]
7975 fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
7976 unsafe {
7977 core::ptr::copy_nonoverlapping(
7978 (&raw const a.val.0) as *const u32,
7979 dest.as_mut_ptr(),
7980 16usize,
7981 );
7982 }
7983 }
7984 #[inline(always)]
7985 fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
7986 unsafe {
7987 u32x16 {
7988 val: core::mem::transmute(a.val),
7989 simd: self,
7990 }
7991 }
7992 }
7993 #[inline(always)]
7994 fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
7995 unsafe {
7996 u8x64 {
7997 val: core::mem::transmute(a.val),
7998 simd: self,
7999 }
8000 }
8001 }
8002 #[inline(always)]
8003 fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8004 unsafe {
8005 if SHIFT >= 16usize {
8006 return b;
8007 }
8008 let result = cross_block_alignr_256x2(
8009 self.cvt_to_bytes_u32x16(b).val.0,
8010 self.cvt_to_bytes_u32x16(a).val.0,
8011 SHIFT * 4usize,
8012 );
8013 self.cvt_from_bytes_u32x16(u8x64 {
8014 val: crate::support::Aligned512(result),
8015 simd: self,
8016 })
8017 }
8018 }
8019 #[inline(always)]
8020 fn slide_within_blocks_u32x16<const SHIFT: usize>(
8021 self,
8022 a: u32x16<Self>,
8023 b: u32x16<Self>,
8024 ) -> u32x16<Self> {
8025 let (a0, a1) = self.split_u32x16(a);
8026 let (b0, b1) = self.split_u32x16(b);
8027 self.combine_u32x8(
8028 self.slide_within_blocks_u32x8::<SHIFT>(a0, b0),
8029 self.slide_within_blocks_u32x8::<SHIFT>(a1, b1),
8030 )
8031 }
8032 #[inline(always)]
8033 fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8034 let (a0, a1) = self.split_u32x16(a);
8035 let (b0, b1) = self.split_u32x16(b);
8036 self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
8037 }
8038 #[inline(always)]
8039 fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8040 let (a0, a1) = self.split_u32x16(a);
8041 let (b0, b1) = self.split_u32x16(b);
8042 self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
8043 }
8044 #[inline(always)]
8045 fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8046 let (a0, a1) = self.split_u32x16(a);
8047 let (b0, b1) = self.split_u32x16(b);
8048 self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
8049 }
8050 #[inline(always)]
8051 fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8052 let (a0, a1) = self.split_u32x16(a);
8053 let (b0, b1) = self.split_u32x16(b);
8054 self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
8055 }
8056 #[inline(always)]
8057 fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8058 let (a0, a1) = self.split_u32x16(a);
8059 let (b0, b1) = self.split_u32x16(b);
8060 self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
8061 }
8062 #[inline(always)]
8063 fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8064 let (a0, a1) = self.split_u32x16(a);
8065 let (b0, b1) = self.split_u32x16(b);
8066 self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
8067 }
8068 #[inline(always)]
8069 fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
8070 let (a0, a1) = self.split_u32x16(a);
8071 self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
8072 }
8073 #[inline(always)]
8074 fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
8075 let (a0, a1) = self.split_u32x16(a);
8076 self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift))
8077 }
8078 #[inline(always)]
8079 fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8080 let (a0, a1) = self.split_u32x16(a);
8081 let (b0, b1) = self.split_u32x16(b);
8082 self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1))
8083 }
8084 #[inline(always)]
8085 fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
8086 let (a0, a1) = self.split_u32x16(a);
8087 self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift))
8088 }
8089 #[inline(always)]
8090 fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8091 let (a0, a1) = self.split_u32x16(a);
8092 let (b0, b1) = self.split_u32x16(b);
8093 self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
8094 }
8095 #[inline(always)]
8096 fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8097 let (a0, a1) = self.split_u32x16(a);
8098 let (b0, b1) = self.split_u32x16(b);
8099 self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
8100 }
8101 #[inline(always)]
8102 fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8103 let (a0, a1) = self.split_u32x16(a);
8104 let (b0, b1) = self.split_u32x16(b);
8105 self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
8106 }
8107 #[inline(always)]
8108 fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8109 let (a0, a1) = self.split_u32x16(a);
8110 let (b0, b1) = self.split_u32x16(b);
8111 self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
8112 }
8113 #[inline(always)]
8114 fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8115 let (a0, a1) = self.split_u32x16(a);
8116 let (b0, b1) = self.split_u32x16(b);
8117 self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
8118 }
8119 #[inline(always)]
8120 fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
8121 let (a0, a1) = self.split_u32x16(a);
8122 let (b0, b1) = self.split_u32x16(b);
8123 self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
8124 }
8125 #[inline(always)]
8126 fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8127 let (a0, _) = self.split_u32x16(a);
8128 let (b0, _) = self.split_u32x16(b);
8129 self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
8130 }
8131 #[inline(always)]
8132 fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8133 let (_, a1) = self.split_u32x16(a);
8134 let (_, b1) = self.split_u32x16(b);
8135 self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
8136 }
8137 #[inline(always)]
8138 fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8139 let (a0, a1) = self.split_u32x16(a);
8140 let (b0, b1) = self.split_u32x16(b);
8141 self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
8142 }
8143 #[inline(always)]
8144 fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8145 let (a0, a1) = self.split_u32x16(a);
8146 let (b0, b1) = self.split_u32x16(b);
8147 self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
8148 }
8149 #[inline(always)]
8150 fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
8151 let (a0, a1) = self.split_u32x16(a);
8152 let (b0, b1) = self.split_u32x16(b);
8153 let lo_lo = self.zip_low_u32x8(a0, b0);
8154 let lo_hi = self.zip_high_u32x8(a0, b0);
8155 let hi_lo = self.zip_low_u32x8(a1, b1);
8156 let hi_hi = self.zip_high_u32x8(a1, b1);
8157 (
8158 self.combine_u32x8(lo_lo, lo_hi),
8159 self.combine_u32x8(hi_lo, hi_hi),
8160 )
8161 }
8162 #[inline(always)]
8163 fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
8164 let (a0, a1) = self.split_u32x16(a);
8165 let (b0, b1) = self.split_u32x16(b);
8166 let lo_even = self.unzip_low_u32x8(a0, a1);
8167 let lo_odd = self.unzip_high_u32x8(a0, a1);
8168 let hi_even = self.unzip_low_u32x8(b0, b1);
8169 let hi_odd = self.unzip_high_u32x8(b0, b1);
8170 (
8171 self.combine_u32x8(lo_even, hi_even),
8172 self.combine_u32x8(lo_odd, hi_odd),
8173 )
8174 }
8175 #[inline(always)]
8176 fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
8177 let (a0, a1) = self.split_mask32x16(a);
8178 let (b0, b1) = self.split_u32x16(b);
8179 let (c0, c1) = self.split_u32x16(c);
8180 self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
8181 }
8182 #[inline(always)]
8183 fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8184 let (a0, a1) = self.split_u32x16(a);
8185 let (b0, b1) = self.split_u32x16(b);
8186 self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
8187 }
8188 #[inline(always)]
8189 fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
8190 let (a0, a1) = self.split_u32x16(a);
8191 let (b0, b1) = self.split_u32x16(b);
8192 self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
8193 }
8194 #[inline(always)]
8195 fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
8196 (
8197 u32x8 {
8198 val: crate::support::Aligned256(a.val.0[0]),
8199 simd: self,
8200 },
8201 u32x8 {
8202 val: crate::support::Aligned256(a.val.0[1]),
8203 simd: self,
8204 },
8205 )
8206 }
8207 #[inline(always)]
8208 fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
8209 unsafe {
8210 let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
8211 let v1 = _mm_loadu_si128(src.as_ptr().add(4usize) as *const _);
8212 let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 4usize) as *const _);
8213 let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 4usize) as *const _);
8214 let tmp0 = _mm_unpacklo_epi32(v0, v1);
8215 let tmp1 = _mm_unpackhi_epi32(v0, v1);
8216 let tmp2 = _mm_unpacklo_epi32(v2, v3);
8217 let tmp3 = _mm_unpackhi_epi32(v2, v3);
8218 let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
8219 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
8220 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
8221 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
8222 self.combine_u32x8(
8223 self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)),
8224 self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)),
8225 )
8226 }
8227 }
8228 #[inline(always)]
8229 fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
8230 let (v01, v23) = self.split_u32x16(a);
8231 let (v0, v1) = self.split_u32x8(v01);
8232 let (v2, v3) = self.split_u32x8(v23);
8233 let v0 = v0.into();
8234 let v1 = v1.into();
8235 let v2 = v2.into();
8236 let v3 = v3.into();
8237 unsafe {
8238 let tmp0 = _mm_unpacklo_epi32(v0, v1);
8239 let tmp1 = _mm_unpackhi_epi32(v0, v1);
8240 let tmp2 = _mm_unpacklo_epi32(v2, v3);
8241 let tmp3 = _mm_unpackhi_epi32(v2, v3);
8242 let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
8243 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
8244 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
8245 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
8246 _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
8247 _mm_storeu_si128(dest.as_mut_ptr().add(4usize) as *mut _, out1);
8248 _mm_storeu_si128(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
8249 _mm_storeu_si128(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
8250 }
8251 }
8252 #[inline(always)]
8253 fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
8254 let (a0, a1) = self.split_u32x16(a);
8255 self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
8256 }
8257 #[inline(always)]
8258 fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
8259 let (a0, a1) = self.split_u32x16(a);
8260 self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
8261 }
8262 #[inline(always)]
8263 fn splat_mask32x16(self, val: i32) -> mask32x16<Self> {
8264 let half = self.splat_mask32x8(val);
8265 self.combine_mask32x8(half, half)
8266 }
8267 #[inline(always)]
8268 fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
8269 mask32x16 {
8270 val: unsafe { core::mem::transmute_copy(&val) },
8271 simd: self,
8272 }
8273 }
8274 #[inline(always)]
8275 fn load_array_ref_mask32x16(self, val: &[i32; 16usize]) -> mask32x16<Self> {
8276 mask32x16 {
8277 val: unsafe { core::mem::transmute_copy(val) },
8278 simd: self,
8279 }
8280 }
8281 #[inline(always)]
8282 fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
8283 unsafe { core::mem::transmute::<[__m256i; 2usize], [i32; 16usize]>(a.val.0) }
8284 }
8285 #[inline(always)]
8286 fn as_array_ref_mask32x16(self, a: &mask32x16<Self>) -> &[i32; 16usize] {
8287 unsafe { core::mem::transmute::<&[__m256i; 2usize], &[i32; 16usize]>(&a.val.0) }
8288 }
8289 #[inline(always)]
8290 fn as_array_mut_mask32x16(self, a: &mut mask32x16<Self>) -> &mut [i32; 16usize] {
8291 unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [i32; 16usize]>(&mut a.val.0) }
8292 }
8293 #[inline(always)]
8294 fn store_array_mask32x16(self, a: mask32x16<Self>, dest: &mut [i32; 16usize]) -> () {
8295 unsafe {
8296 core::ptr::copy_nonoverlapping(
8297 (&raw const a.val.0) as *const i32,
8298 dest.as_mut_ptr(),
8299 16usize,
8300 );
8301 }
8302 }
8303 #[inline(always)]
8304 fn cvt_from_bytes_mask32x16(self, a: u8x64<Self>) -> mask32x16<Self> {
8305 unsafe {
8306 mask32x16 {
8307 val: core::mem::transmute(a.val),
8308 simd: self,
8309 }
8310 }
8311 }
8312 #[inline(always)]
8313 fn cvt_to_bytes_mask32x16(self, a: mask32x16<Self>) -> u8x64<Self> {
8314 unsafe {
8315 u8x64 {
8316 val: core::mem::transmute(a.val),
8317 simd: self,
8318 }
8319 }
8320 }
8321 #[inline(always)]
8322 fn slide_mask32x16<const SHIFT: usize>(
8323 self,
8324 a: mask32x16<Self>,
8325 b: mask32x16<Self>,
8326 ) -> mask32x16<Self> {
8327 unsafe {
8328 if SHIFT >= 16usize {
8329 return b;
8330 }
8331 let result = cross_block_alignr_256x2(
8332 self.cvt_to_bytes_mask32x16(b).val.0,
8333 self.cvt_to_bytes_mask32x16(a).val.0,
8334 SHIFT * 4usize,
8335 );
8336 self.cvt_from_bytes_mask32x16(u8x64 {
8337 val: crate::support::Aligned512(result),
8338 simd: self,
8339 })
8340 }
8341 }
8342 #[inline(always)]
8343 fn slide_within_blocks_mask32x16<const SHIFT: usize>(
8344 self,
8345 a: mask32x16<Self>,
8346 b: mask32x16<Self>,
8347 ) -> mask32x16<Self> {
8348 let (a0, a1) = self.split_mask32x16(a);
8349 let (b0, b1) = self.split_mask32x16(b);
8350 self.combine_mask32x8(
8351 self.slide_within_blocks_mask32x8::<SHIFT>(a0, b0),
8352 self.slide_within_blocks_mask32x8::<SHIFT>(a1, b1),
8353 )
8354 }
8355 #[inline(always)]
8356 fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8357 let (a0, a1) = self.split_mask32x16(a);
8358 let (b0, b1) = self.split_mask32x16(b);
8359 self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
8360 }
8361 #[inline(always)]
8362 fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8363 let (a0, a1) = self.split_mask32x16(a);
8364 let (b0, b1) = self.split_mask32x16(b);
8365 self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
8366 }
8367 #[inline(always)]
8368 fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8369 let (a0, a1) = self.split_mask32x16(a);
8370 let (b0, b1) = self.split_mask32x16(b);
8371 self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
8372 }
8373 #[inline(always)]
8374 fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
8375 let (a0, a1) = self.split_mask32x16(a);
8376 self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
8377 }
8378 #[inline(always)]
8379 fn select_mask32x16(
8380 self,
8381 a: mask32x16<Self>,
8382 b: mask32x16<Self>,
8383 c: mask32x16<Self>,
8384 ) -> mask32x16<Self> {
8385 let (a0, a1) = self.split_mask32x16(a);
8386 let (b0, b1) = self.split_mask32x16(b);
8387 let (c0, c1) = self.split_mask32x16(c);
8388 self.combine_mask32x8(
8389 self.select_mask32x8(a0, b0, c0),
8390 self.select_mask32x8(a1, b1, c1),
8391 )
8392 }
8393 #[inline(always)]
8394 fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
8395 let (a0, a1) = self.split_mask32x16(a);
8396 let (b0, b1) = self.split_mask32x16(b);
8397 self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
8398 }
8399 #[inline(always)]
8400 fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
8401 let (a0, a1) = self.split_mask32x16(a);
8402 self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1)
8403 }
8404 #[inline(always)]
8405 fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
8406 let (a0, a1) = self.split_mask32x16(a);
8407 self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1)
8408 }
8409 #[inline(always)]
8410 fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
8411 let (a0, a1) = self.split_mask32x16(a);
8412 self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1)
8413 }
8414 #[inline(always)]
8415 fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
8416 let (a0, a1) = self.split_mask32x16(a);
8417 self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1)
8418 }
8419 #[inline(always)]
8420 fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
8421 (
8422 mask32x8 {
8423 val: crate::support::Aligned256(a.val.0[0]),
8424 simd: self,
8425 },
8426 mask32x8 {
8427 val: crate::support::Aligned256(a.val.0[1]),
8428 simd: self,
8429 },
8430 )
8431 }
8432 #[inline(always)]
8433 fn splat_f64x8(self, val: f64) -> f64x8<Self> {
8434 let half = self.splat_f64x4(val);
8435 self.combine_f64x4(half, half)
8436 }
8437 #[inline(always)]
8438 fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
8439 f64x8 {
8440 val: unsafe { core::mem::transmute_copy(&val) },
8441 simd: self,
8442 }
8443 }
8444 #[inline(always)]
8445 fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
8446 f64x8 {
8447 val: unsafe { core::mem::transmute_copy(val) },
8448 simd: self,
8449 }
8450 }
8451 #[inline(always)]
8452 fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
8453 unsafe { core::mem::transmute::<[__m256d; 2usize], [f64; 8usize]>(a.val.0) }
8454 }
8455 #[inline(always)]
8456 fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
8457 unsafe { core::mem::transmute::<&[__m256d; 2usize], &[f64; 8usize]>(&a.val.0) }
8458 }
8459 #[inline(always)]
8460 fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
8461 unsafe { core::mem::transmute::<&mut [__m256d; 2usize], &mut [f64; 8usize]>(&mut a.val.0) }
8462 }
8463 #[inline(always)]
8464 fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
8465 unsafe {
8466 core::ptr::copy_nonoverlapping(
8467 (&raw const a.val.0) as *const f64,
8468 dest.as_mut_ptr(),
8469 8usize,
8470 );
8471 }
8472 }
8473 #[inline(always)]
8474 fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
8475 unsafe {
8476 f64x8 {
8477 val: core::mem::transmute(a.val),
8478 simd: self,
8479 }
8480 }
8481 }
8482 #[inline(always)]
8483 fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
8484 unsafe {
8485 u8x64 {
8486 val: core::mem::transmute(a.val),
8487 simd: self,
8488 }
8489 }
8490 }
8491 #[inline(always)]
8492 fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8493 unsafe {
8494 if SHIFT >= 8usize {
8495 return b;
8496 }
8497 let result = cross_block_alignr_256x2(
8498 self.cvt_to_bytes_f64x8(b).val.0,
8499 self.cvt_to_bytes_f64x8(a).val.0,
8500 SHIFT * 8usize,
8501 );
8502 self.cvt_from_bytes_f64x8(u8x64 {
8503 val: crate::support::Aligned512(result),
8504 simd: self,
8505 })
8506 }
8507 }
8508 #[inline(always)]
8509 fn slide_within_blocks_f64x8<const SHIFT: usize>(
8510 self,
8511 a: f64x8<Self>,
8512 b: f64x8<Self>,
8513 ) -> f64x8<Self> {
8514 let (a0, a1) = self.split_f64x8(a);
8515 let (b0, b1) = self.split_f64x8(b);
8516 self.combine_f64x4(
8517 self.slide_within_blocks_f64x4::<SHIFT>(a0, b0),
8518 self.slide_within_blocks_f64x4::<SHIFT>(a1, b1),
8519 )
8520 }
8521 #[inline(always)]
8522 fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8523 let (a0, a1) = self.split_f64x8(a);
8524 self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
8525 }
8526 #[inline(always)]
8527 fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8528 let (a0, a1) = self.split_f64x8(a);
8529 self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
8530 }
8531 #[inline(always)]
8532 fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8533 let (a0, a1) = self.split_f64x8(a);
8534 self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
8535 }
8536 #[inline(always)]
8537 fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8538 let (a0, a1) = self.split_f64x8(a);
8539 let (b0, b1) = self.split_f64x8(b);
8540 self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
8541 }
8542 #[inline(always)]
8543 fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8544 let (a0, a1) = self.split_f64x8(a);
8545 let (b0, b1) = self.split_f64x8(b);
8546 self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
8547 }
8548 #[inline(always)]
8549 fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8550 let (a0, a1) = self.split_f64x8(a);
8551 let (b0, b1) = self.split_f64x8(b);
8552 self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
8553 }
8554 #[inline(always)]
8555 fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8556 let (a0, a1) = self.split_f64x8(a);
8557 let (b0, b1) = self.split_f64x8(b);
8558 self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
8559 }
8560 #[inline(always)]
8561 fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8562 let (a0, a1) = self.split_f64x8(a);
8563 let (b0, b1) = self.split_f64x8(b);
8564 self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
8565 }
8566 #[inline(always)]
8567 fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8568 let (a0, a1) = self.split_f64x8(a);
8569 let (b0, b1) = self.split_f64x8(b);
8570 self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
8571 }
8572 #[inline(always)]
8573 fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8574 let (a0, a1) = self.split_f64x8(a);
8575 let (b0, b1) = self.split_f64x8(b);
8576 self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
8577 }
8578 #[inline(always)]
8579 fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8580 let (a0, a1) = self.split_f64x8(a);
8581 let (b0, b1) = self.split_f64x8(b);
8582 self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
8583 }
8584 #[inline(always)]
8585 fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8586 let (a0, a1) = self.split_f64x8(a);
8587 let (b0, b1) = self.split_f64x8(b);
8588 self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
8589 }
8590 #[inline(always)]
8591 fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
8592 let (a0, a1) = self.split_f64x8(a);
8593 let (b0, b1) = self.split_f64x8(b);
8594 self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
8595 }
8596 #[inline(always)]
8597 fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8598 let (a0, _) = self.split_f64x8(a);
8599 let (b0, _) = self.split_f64x8(b);
8600 self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
8601 }
8602 #[inline(always)]
8603 fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8604 let (_, a1) = self.split_f64x8(a);
8605 let (_, b1) = self.split_f64x8(b);
8606 self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
8607 }
8608 #[inline(always)]
8609 fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8610 let (a0, a1) = self.split_f64x8(a);
8611 let (b0, b1) = self.split_f64x8(b);
8612 self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
8613 }
8614 #[inline(always)]
8615 fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8616 let (a0, a1) = self.split_f64x8(a);
8617 let (b0, b1) = self.split_f64x8(b);
8618 self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
8619 }
8620 #[inline(always)]
8621 fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
8622 let (a0, a1) = self.split_f64x8(a);
8623 let (b0, b1) = self.split_f64x8(b);
8624 let lo_lo = self.zip_low_f64x4(a0, b0);
8625 let lo_hi = self.zip_high_f64x4(a0, b0);
8626 let hi_lo = self.zip_low_f64x4(a1, b1);
8627 let hi_hi = self.zip_high_f64x4(a1, b1);
8628 (
8629 self.combine_f64x4(lo_lo, lo_hi),
8630 self.combine_f64x4(hi_lo, hi_hi),
8631 )
8632 }
8633 #[inline(always)]
8634 fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
8635 let (a0, a1) = self.split_f64x8(a);
8636 let (b0, b1) = self.split_f64x8(b);
8637 let lo_even = self.unzip_low_f64x4(a0, a1);
8638 let lo_odd = self.unzip_high_f64x4(a0, a1);
8639 let hi_even = self.unzip_low_f64x4(b0, b1);
8640 let hi_odd = self.unzip_high_f64x4(b0, b1);
8641 (
8642 self.combine_f64x4(lo_even, hi_even),
8643 self.combine_f64x4(lo_odd, hi_odd),
8644 )
8645 }
8646 #[inline(always)]
8647 fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8648 let (a0, a1) = self.split_f64x8(a);
8649 let (b0, b1) = self.split_f64x8(b);
8650 self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
8651 }
8652 #[inline(always)]
8653 fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8654 let (a0, a1) = self.split_f64x8(a);
8655 let (b0, b1) = self.split_f64x8(b);
8656 self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
8657 }
8658 #[inline(always)]
8659 fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8660 let (a0, a1) = self.split_f64x8(a);
8661 let (b0, b1) = self.split_f64x8(b);
8662 self.combine_f64x4(
8663 self.max_precise_f64x4(a0, b0),
8664 self.max_precise_f64x4(a1, b1),
8665 )
8666 }
8667 #[inline(always)]
8668 fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
8669 let (a0, a1) = self.split_f64x8(a);
8670 let (b0, b1) = self.split_f64x8(b);
8671 self.combine_f64x4(
8672 self.min_precise_f64x4(a0, b0),
8673 self.min_precise_f64x4(a1, b1),
8674 )
8675 }
8676 #[inline(always)]
8677 fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
8678 let (a0, a1) = self.split_f64x8(a);
8679 let (b0, b1) = self.split_f64x8(b);
8680 let (c0, c1) = self.split_f64x8(c);
8681 self.combine_f64x4(
8682 self.mul_add_f64x4(a0, b0, c0),
8683 self.mul_add_f64x4(a1, b1, c1),
8684 )
8685 }
8686 #[inline(always)]
8687 fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
8688 let (a0, a1) = self.split_f64x8(a);
8689 let (b0, b1) = self.split_f64x8(b);
8690 let (c0, c1) = self.split_f64x8(c);
8691 self.combine_f64x4(
8692 self.mul_sub_f64x4(a0, b0, c0),
8693 self.mul_sub_f64x4(a1, b1, c1),
8694 )
8695 }
8696 #[inline(always)]
8697 fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8698 let (a0, a1) = self.split_f64x8(a);
8699 self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
8700 }
8701 #[inline(always)]
8702 fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8703 let (a0, a1) = self.split_f64x8(a);
8704 self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1))
8705 }
8706 #[inline(always)]
8707 fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8708 let (a0, a1) = self.split_f64x8(a);
8709 self.combine_f64x4(
8710 self.round_ties_even_f64x4(a0),
8711 self.round_ties_even_f64x4(a1),
8712 )
8713 }
8714 #[inline(always)]
8715 fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8716 let (a0, a1) = self.split_f64x8(a);
8717 self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
8718 }
8719 #[inline(always)]
8720 fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
8721 let (a0, a1) = self.split_f64x8(a);
8722 self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
8723 }
8724 #[inline(always)]
8725 fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
8726 let (a0, a1) = self.split_mask64x8(a);
8727 let (b0, b1) = self.split_f64x8(b);
8728 let (c0, c1) = self.split_f64x8(c);
8729 self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
8730 }
8731 #[inline(always)]
8732 fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
8733 (
8734 f64x4 {
8735 val: crate::support::Aligned256(a.val.0[0]),
8736 simd: self,
8737 },
8738 f64x4 {
8739 val: crate::support::Aligned256(a.val.0[1]),
8740 simd: self,
8741 },
8742 )
8743 }
8744 #[inline(always)]
8745 fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
8746 let (a0, a1) = self.split_f64x8(a);
8747 self.combine_f32x8(
8748 self.reinterpret_f32_f64x4(a0),
8749 self.reinterpret_f32_f64x4(a1),
8750 )
8751 }
8752 #[inline(always)]
8753 fn splat_mask64x8(self, val: i64) -> mask64x8<Self> {
8754 let half = self.splat_mask64x4(val);
8755 self.combine_mask64x4(half, half)
8756 }
8757 #[inline(always)]
8758 fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
8759 mask64x8 {
8760 val: unsafe { core::mem::transmute_copy(&val) },
8761 simd: self,
8762 }
8763 }
8764 #[inline(always)]
8765 fn load_array_ref_mask64x8(self, val: &[i64; 8usize]) -> mask64x8<Self> {
8766 mask64x8 {
8767 val: unsafe { core::mem::transmute_copy(val) },
8768 simd: self,
8769 }
8770 }
8771 #[inline(always)]
8772 fn as_array_mask64x8(self, a: mask64x8<Self>) -> [i64; 8usize] {
8773 unsafe { core::mem::transmute::<[__m256i; 2usize], [i64; 8usize]>(a.val.0) }
8774 }
8775 #[inline(always)]
8776 fn as_array_ref_mask64x8(self, a: &mask64x8<Self>) -> &[i64; 8usize] {
8777 unsafe { core::mem::transmute::<&[__m256i; 2usize], &[i64; 8usize]>(&a.val.0) }
8778 }
8779 #[inline(always)]
8780 fn as_array_mut_mask64x8(self, a: &mut mask64x8<Self>) -> &mut [i64; 8usize] {
8781 unsafe { core::mem::transmute::<&mut [__m256i; 2usize], &mut [i64; 8usize]>(&mut a.val.0) }
8782 }
8783 #[inline(always)]
8784 fn store_array_mask64x8(self, a: mask64x8<Self>, dest: &mut [i64; 8usize]) -> () {
8785 unsafe {
8786 core::ptr::copy_nonoverlapping(
8787 (&raw const a.val.0) as *const i64,
8788 dest.as_mut_ptr(),
8789 8usize,
8790 );
8791 }
8792 }
8793 #[inline(always)]
8794 fn cvt_from_bytes_mask64x8(self, a: u8x64<Self>) -> mask64x8<Self> {
8795 unsafe {
8796 mask64x8 {
8797 val: core::mem::transmute(a.val),
8798 simd: self,
8799 }
8800 }
8801 }
8802 #[inline(always)]
8803 fn cvt_to_bytes_mask64x8(self, a: mask64x8<Self>) -> u8x64<Self> {
8804 unsafe {
8805 u8x64 {
8806 val: core::mem::transmute(a.val),
8807 simd: self,
8808 }
8809 }
8810 }
8811 #[inline(always)]
8812 fn slide_mask64x8<const SHIFT: usize>(
8813 self,
8814 a: mask64x8<Self>,
8815 b: mask64x8<Self>,
8816 ) -> mask64x8<Self> {
8817 unsafe {
8818 if SHIFT >= 8usize {
8819 return b;
8820 }
8821 let result = cross_block_alignr_256x2(
8822 self.cvt_to_bytes_mask64x8(b).val.0,
8823 self.cvt_to_bytes_mask64x8(a).val.0,
8824 SHIFT * 8usize,
8825 );
8826 self.cvt_from_bytes_mask64x8(u8x64 {
8827 val: crate::support::Aligned512(result),
8828 simd: self,
8829 })
8830 }
8831 }
8832 #[inline(always)]
8833 fn slide_within_blocks_mask64x8<const SHIFT: usize>(
8834 self,
8835 a: mask64x8<Self>,
8836 b: mask64x8<Self>,
8837 ) -> mask64x8<Self> {
8838 let (a0, a1) = self.split_mask64x8(a);
8839 let (b0, b1) = self.split_mask64x8(b);
8840 self.combine_mask64x4(
8841 self.slide_within_blocks_mask64x4::<SHIFT>(a0, b0),
8842 self.slide_within_blocks_mask64x4::<SHIFT>(a1, b1),
8843 )
8844 }
8845 #[inline(always)]
8846 fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8847 let (a0, a1) = self.split_mask64x8(a);
8848 let (b0, b1) = self.split_mask64x8(b);
8849 self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1))
8850 }
8851 #[inline(always)]
8852 fn or_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8853 let (a0, a1) = self.split_mask64x8(a);
8854 let (b0, b1) = self.split_mask64x8(b);
8855 self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, b1))
8856 }
8857 #[inline(always)]
8858 fn xor_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8859 let (a0, a1) = self.split_mask64x8(a);
8860 let (b0, b1) = self.split_mask64x8(b);
8861 self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, b1))
8862 }
8863 #[inline(always)]
8864 fn not_mask64x8(self, a: mask64x8<Self>) -> mask64x8<Self> {
8865 let (a0, a1) = self.split_mask64x8(a);
8866 self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1))
8867 }
8868 #[inline(always)]
8869 fn select_mask64x8(
8870 self,
8871 a: mask64x8<Self>,
8872 b: mask64x8<Self>,
8873 c: mask64x8<Self>,
8874 ) -> mask64x8<Self> {
8875 let (a0, a1) = self.split_mask64x8(a);
8876 let (b0, b1) = self.split_mask64x8(b);
8877 let (c0, c1) = self.split_mask64x8(c);
8878 self.combine_mask64x4(
8879 self.select_mask64x4(a0, b0, c0),
8880 self.select_mask64x4(a1, b1, c1),
8881 )
8882 }
8883 #[inline(always)]
8884 fn simd_eq_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
8885 let (a0, a1) = self.split_mask64x8(a);
8886 let (b0, b1) = self.split_mask64x8(b);
8887 self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1))
8888 }
8889 #[inline(always)]
8890 fn any_true_mask64x8(self, a: mask64x8<Self>) -> bool {
8891 let (a0, a1) = self.split_mask64x8(a);
8892 self.any_true_mask64x4(a0) || self.any_true_mask64x4(a1)
8893 }
8894 #[inline(always)]
8895 fn all_true_mask64x8(self, a: mask64x8<Self>) -> bool {
8896 let (a0, a1) = self.split_mask64x8(a);
8897 self.all_true_mask64x4(a0) && self.all_true_mask64x4(a1)
8898 }
8899 #[inline(always)]
8900 fn any_false_mask64x8(self, a: mask64x8<Self>) -> bool {
8901 let (a0, a1) = self.split_mask64x8(a);
8902 self.any_false_mask64x4(a0) || self.any_false_mask64x4(a1)
8903 }
8904 #[inline(always)]
8905 fn all_false_mask64x8(self, a: mask64x8<Self>) -> bool {
8906 let (a0, a1) = self.split_mask64x8(a);
8907 self.all_false_mask64x4(a0) && self.all_false_mask64x4(a1)
8908 }
8909 #[inline(always)]
8910 fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) {
8911 (
8912 mask64x4 {
8913 val: crate::support::Aligned256(a.val.0[0]),
8914 simd: self,
8915 },
8916 mask64x4 {
8917 val: crate::support::Aligned256(a.val.0[1]),
8918 simd: self,
8919 },
8920 )
8921 }
8922}
8923impl<S: Simd> SimdFrom<__m256, S> for f32x8<S> {
8924 #[inline(always)]
8925 fn simd_from(simd: S, arch: __m256) -> Self {
8926 Self {
8927 val: unsafe { core::mem::transmute_copy(&arch) },
8928 simd,
8929 }
8930 }
8931}
8932impl<S: Simd> From<f32x8<S>> for __m256 {
8933 #[inline(always)]
8934 fn from(value: f32x8<S>) -> Self {
8935 unsafe { core::mem::transmute_copy(&value.val) }
8936 }
8937}
8938impl<S: Simd> SimdFrom<__m256i, S> for i8x32<S> {
8939 #[inline(always)]
8940 fn simd_from(simd: S, arch: __m256i) -> Self {
8941 Self {
8942 val: unsafe { core::mem::transmute_copy(&arch) },
8943 simd,
8944 }
8945 }
8946}
8947impl<S: Simd> From<i8x32<S>> for __m256i {
8948 #[inline(always)]
8949 fn from(value: i8x32<S>) -> Self {
8950 unsafe { core::mem::transmute_copy(&value.val) }
8951 }
8952}
8953impl<S: Simd> SimdFrom<__m256i, S> for u8x32<S> {
8954 #[inline(always)]
8955 fn simd_from(simd: S, arch: __m256i) -> Self {
8956 Self {
8957 val: unsafe { core::mem::transmute_copy(&arch) },
8958 simd,
8959 }
8960 }
8961}
8962impl<S: Simd> From<u8x32<S>> for __m256i {
8963 #[inline(always)]
8964 fn from(value: u8x32<S>) -> Self {
8965 unsafe { core::mem::transmute_copy(&value.val) }
8966 }
8967}
8968impl<S: Simd> SimdFrom<__m256i, S> for mask8x32<S> {
8969 #[inline(always)]
8970 fn simd_from(simd: S, arch: __m256i) -> Self {
8971 Self {
8972 val: unsafe { core::mem::transmute_copy(&arch) },
8973 simd,
8974 }
8975 }
8976}
8977impl<S: Simd> From<mask8x32<S>> for __m256i {
8978 #[inline(always)]
8979 fn from(value: mask8x32<S>) -> Self {
8980 unsafe { core::mem::transmute_copy(&value.val) }
8981 }
8982}
8983impl<S: Simd> SimdFrom<__m256i, S> for i16x16<S> {
8984 #[inline(always)]
8985 fn simd_from(simd: S, arch: __m256i) -> Self {
8986 Self {
8987 val: unsafe { core::mem::transmute_copy(&arch) },
8988 simd,
8989 }
8990 }
8991}
8992impl<S: Simd> From<i16x16<S>> for __m256i {
8993 #[inline(always)]
8994 fn from(value: i16x16<S>) -> Self {
8995 unsafe { core::mem::transmute_copy(&value.val) }
8996 }
8997}
8998impl<S: Simd> SimdFrom<__m256i, S> for u16x16<S> {
8999 #[inline(always)]
9000 fn simd_from(simd: S, arch: __m256i) -> Self {
9001 Self {
9002 val: unsafe { core::mem::transmute_copy(&arch) },
9003 simd,
9004 }
9005 }
9006}
9007impl<S: Simd> From<u16x16<S>> for __m256i {
9008 #[inline(always)]
9009 fn from(value: u16x16<S>) -> Self {
9010 unsafe { core::mem::transmute_copy(&value.val) }
9011 }
9012}
9013impl<S: Simd> SimdFrom<__m256i, S> for mask16x16<S> {
9014 #[inline(always)]
9015 fn simd_from(simd: S, arch: __m256i) -> Self {
9016 Self {
9017 val: unsafe { core::mem::transmute_copy(&arch) },
9018 simd,
9019 }
9020 }
9021}
9022impl<S: Simd> From<mask16x16<S>> for __m256i {
9023 #[inline(always)]
9024 fn from(value: mask16x16<S>) -> Self {
9025 unsafe { core::mem::transmute_copy(&value.val) }
9026 }
9027}
9028impl<S: Simd> SimdFrom<__m256i, S> for i32x8<S> {
9029 #[inline(always)]
9030 fn simd_from(simd: S, arch: __m256i) -> Self {
9031 Self {
9032 val: unsafe { core::mem::transmute_copy(&arch) },
9033 simd,
9034 }
9035 }
9036}
9037impl<S: Simd> From<i32x8<S>> for __m256i {
9038 #[inline(always)]
9039 fn from(value: i32x8<S>) -> Self {
9040 unsafe { core::mem::transmute_copy(&value.val) }
9041 }
9042}
9043impl<S: Simd> SimdFrom<__m256i, S> for u32x8<S> {
9044 #[inline(always)]
9045 fn simd_from(simd: S, arch: __m256i) -> Self {
9046 Self {
9047 val: unsafe { core::mem::transmute_copy(&arch) },
9048 simd,
9049 }
9050 }
9051}
9052impl<S: Simd> From<u32x8<S>> for __m256i {
9053 #[inline(always)]
9054 fn from(value: u32x8<S>) -> Self {
9055 unsafe { core::mem::transmute_copy(&value.val) }
9056 }
9057}
9058impl<S: Simd> SimdFrom<__m256i, S> for mask32x8<S> {
9059 #[inline(always)]
9060 fn simd_from(simd: S, arch: __m256i) -> Self {
9061 Self {
9062 val: unsafe { core::mem::transmute_copy(&arch) },
9063 simd,
9064 }
9065 }
9066}
9067impl<S: Simd> From<mask32x8<S>> for __m256i {
9068 #[inline(always)]
9069 fn from(value: mask32x8<S>) -> Self {
9070 unsafe { core::mem::transmute_copy(&value.val) }
9071 }
9072}
9073impl<S: Simd> SimdFrom<__m256d, S> for f64x4<S> {
9074 #[inline(always)]
9075 fn simd_from(simd: S, arch: __m256d) -> Self {
9076 Self {
9077 val: unsafe { core::mem::transmute_copy(&arch) },
9078 simd,
9079 }
9080 }
9081}
9082impl<S: Simd> From<f64x4<S>> for __m256d {
9083 #[inline(always)]
9084 fn from(value: f64x4<S>) -> Self {
9085 unsafe { core::mem::transmute_copy(&value.val) }
9086 }
9087}
9088impl<S: Simd> SimdFrom<__m256i, S> for mask64x4<S> {
9089 #[inline(always)]
9090 fn simd_from(simd: S, arch: __m256i) -> Self {
9091 Self {
9092 val: unsafe { core::mem::transmute_copy(&arch) },
9093 simd,
9094 }
9095 }
9096}
9097impl<S: Simd> From<mask64x4<S>> for __m256i {
9098 #[inline(always)]
9099 fn from(value: mask64x4<S>) -> Self {
9100 unsafe { core::mem::transmute_copy(&value.val) }
9101 }
9102}
#[doc = r" Runtime-dispatched wrapper around `_mm_alignr_epi8`, whose shift is normally a const generic."]
#[doc = r""]
#[doc = r" Each value of `shift` in `0..=15` is mapped to the intrinsic instantiated with that same constant. Callers"]
#[doc = r" are expected to pass a value the compiler can see through, so the dispatch below should fold away after"]
#[doc = r" inlining. The wrapper is needed because Rust cannot yet do arithmetic on const generic parameters."]
#[doc = r""]
#[doc = r" # Safety"]
#[doc = r""]
#[doc = r" `_mm_alignr_epi8` is an SSSE3 intrinsic, so the CPU must support SSSE3."]
#[doc = r""]
#[doc = r" # Panics"]
#[doc = r""]
#[doc = r" Panics through `unreachable!` when `shift` is greater than 15."]
#[inline(always)]
unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i {
    // SAFETY: the caller guarantees that the target feature required by the intrinsic is present.
    unsafe {
        match shift {
            0 => _mm_alignr_epi8::<0>(a, b),
            1 => _mm_alignr_epi8::<1>(a, b),
            2 => _mm_alignr_epi8::<2>(a, b),
            3 => _mm_alignr_epi8::<3>(a, b),
            4 => _mm_alignr_epi8::<4>(a, b),
            5 => _mm_alignr_epi8::<5>(a, b),
            6 => _mm_alignr_epi8::<6>(a, b),
            7 => _mm_alignr_epi8::<7>(a, b),
            8 => _mm_alignr_epi8::<8>(a, b),
            9 => _mm_alignr_epi8::<9>(a, b),
            10 => _mm_alignr_epi8::<10>(a, b),
            11 => _mm_alignr_epi8::<11>(a, b),
            12 => _mm_alignr_epi8::<12>(a, b),
            13 => _mm_alignr_epi8::<13>(a, b),
            14 => _mm_alignr_epi8::<14>(a, b),
            15 => _mm_alignr_epi8::<15>(a, b),
            // A shift of 16 or more has no matching instantiation.
            _ => unreachable!(),
        }
    }
}
#[doc = r" Runtime-dispatched wrapper around `_mm256_alignr_epi8`, whose shift is normally a const generic."]
#[doc = r""]
#[doc = r" Each value of `shift` in `0..=15` is mapped to the intrinsic instantiated with that same constant. Callers"]
#[doc = r" are expected to pass a value the compiler can see through, so the dispatch below should fold away after"]
#[doc = r" inlining. The wrapper is needed because Rust cannot yet do arithmetic on const generic parameters."]
#[doc = r""]
#[doc = r" # Safety"]
#[doc = r""]
#[doc = r" `_mm256_alignr_epi8` is an AVX2 intrinsic, so the CPU must support AVX2."]
#[doc = r""]
#[doc = r" # Panics"]
#[doc = r""]
#[doc = r" Panics through `unreachable!` when `shift` is greater than 15."]
#[inline(always)]
unsafe fn dyn_alignr_256(a: __m256i, b: __m256i, shift: usize) -> __m256i {
    // SAFETY: the caller guarantees that the target feature required by the intrinsic is present.
    unsafe {
        match shift {
            0 => _mm256_alignr_epi8::<0>(a, b),
            1 => _mm256_alignr_epi8::<1>(a, b),
            2 => _mm256_alignr_epi8::<2>(a, b),
            3 => _mm256_alignr_epi8::<3>(a, b),
            4 => _mm256_alignr_epi8::<4>(a, b),
            5 => _mm256_alignr_epi8::<5>(a, b),
            6 => _mm256_alignr_epi8::<6>(a, b),
            7 => _mm256_alignr_epi8::<7>(a, b),
            8 => _mm256_alignr_epi8::<8>(a, b),
            9 => _mm256_alignr_epi8::<9>(a, b),
            10 => _mm256_alignr_epi8::<10>(a, b),
            11 => _mm256_alignr_epi8::<11>(a, b),
            12 => _mm256_alignr_epi8::<12>(a, b),
            13 => _mm256_alignr_epi8::<13>(a, b),
            14 => _mm256_alignr_epi8::<14>(a, b),
            15 => _mm256_alignr_epi8::<15>(a, b),
            // A shift of 16 or more has no matching instantiation.
            _ => unreachable!(),
        }
    }
}
9157#[doc = r" Computes one output __m256i for `cross_block_alignr_*` operations."]
9158#[doc = r""]
9159#[doc = r" Given an array of registers, each containing two 128-bit blocks, extracts two adjacent blocks (`lo_idx` and"]
9160#[doc = r" `hi_idx` = `lo_idx + 1`) and performs `alignr` with `intra_shift`."]
9161#[inline(always)]
9162unsafe fn cross_block_alignr_one(
9163 regs: &[__m256i],
9164 block_idx: usize,
9165 shift_bytes: usize,
9166) -> __m256i {
9167 let lo_idx = block_idx + (shift_bytes / 16);
9168 let intra_shift = shift_bytes % 16;
9169 let lo_blocks = if lo_idx & 1 == 0 {
9170 regs[lo_idx / 2]
9171 } else {
9172 unsafe { _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1]) }
9173 };
9174 let hi_idx = lo_idx + 1;
9175 let hi_blocks = if hi_idx & 1 == 0 {
9176 regs[hi_idx / 2]
9177 } else {
9178 unsafe { _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1]) }
9179 };
9180 unsafe { dyn_alignr_256(hi_blocks, lo_blocks, intra_shift) }
9181}
9182#[doc = r" Concatenates `b` and `a` (each 2 x __m256i = 4 blocks) and extracts 4 blocks starting at byte offset"]
9183#[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."]
9184#[inline(always)]
9185unsafe fn cross_block_alignr_256x2(
9186 a: [__m256i; 2],
9187 b: [__m256i; 2],
9188 shift_bytes: usize,
9189) -> [__m256i; 2] {
9190 let regs = [b[0], b[1], a[0], a[1]];
9191 unsafe {
9192 [
9193 cross_block_alignr_one(®s, 0, shift_bytes),
9194 cross_block_alignr_one(®s, 2, shift_bytes),
9195 ]
9196 }
9197}
9198#[doc = r" Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset"]
9199#[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."]
9200#[inline(always)]
9201unsafe fn cross_block_alignr_256x1(a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i {
9202 let regs = [b, a];
9203 unsafe { cross_block_alignr_one(®s, 0, shift_bytes) }
9204}