fearless_simd/core_arch/x86/
sse2.rs

1// Copyright 2024 the Fearless_SIMD Authors
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! Access to SSE2 intrinsics.
5
6use crate::impl_macros::delegate;
7#[cfg(target_arch = "x86")]
8use core::arch::x86 as arch;
9#[cfg(target_arch = "x86_64")]
10use core::arch::x86_64 as arch;
11
12use arch::*;
13
/// Zero-sized token that stands for access to the SSE2 intrinsics on `x86` and `x86_64`.
///
/// The only field is private, so code outside this module cannot build the token with a
/// struct literal and has to go through a constructor such as [`Sse2::new_unchecked`].
#[derive(Debug, Clone, Copy)]
pub struct Sse2 {
    // Unit-typed private field: adds no data, only blocks outside construction.
    _private: (),
}
19
20#[expect(
21    clippy::missing_safety_doc,
22    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
23)]
24impl Sse2 {
25    /// Create a SIMD token.
26    ///
27    /// # Safety
28    ///
29    /// The required CPU features must be available.
30    #[inline]
31    pub unsafe fn new_unchecked() -> Self {
32        Self { _private: () }
33    }
34
35    delegate! { arch:
36        fn _mm_pause();
37        #[allow(clippy::not_unsafe_ptr_arg_deref)]
38        fn _mm_clflush(p: *const u8);
39        fn _mm_lfence();
40        fn _mm_mfence();
41        fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i;
42        fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i;
43        fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i;
44        fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i;
45        fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i;
46        fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i;
47        fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i;
48        fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i;
49        fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i;
50        fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i;
51        fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i;
52        fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i;
53        fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i;
54        fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i;
55        fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i;
56        fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i;
57        fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i;
58        fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i;
59        fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i;
60        fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i;
61        fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i;
62        fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i;
63        fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i;
64        fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i;
65        fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i;
66        fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i;
67        fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i;
68        fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i;
69        fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i;
70        fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i;
71        fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i;
72        fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i;
73        fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i;
74        fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i;
75        fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i;
76        fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i;
77        fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i;
78        fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i;
79        fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i;
80        fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i;
81        fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i;
82        fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i;
83        fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i;
84        fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i;
85        fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i;
86        fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i;
87        fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i;
88        fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i;
89        fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i;
90        fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i;
91        fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i;
92        fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i;
93        fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i;
94        fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i;
95        fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i;
96        fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i;
97        fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i;
98        fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i;
99        fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i;
100        fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i;
101        fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i;
102        fn _mm_cvtepi32_pd(a: __m128i) -> __m128d;
103        fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d;
104        fn _mm_cvtepi32_ps(a: __m128i) -> __m128;
105        fn _mm_cvtps_epi32(a: __m128) -> __m128i;
106        fn _mm_cvtsi32_si128(a: i32) -> __m128i;
107        fn _mm_cvtsi128_si32(a: __m128i) -> i32;
108        fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i;
109        fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i;
110        fn _mm_set_epi16(
111            e7: i16,
112            e6: i16,
113            e5: i16,
114            e4: i16,
115            e3: i16,
116            e2: i16,
117            e1: i16,
118            e0: i16,
119        ) -> __m128i;
120        fn _mm_set_epi8(
121            e15: i8,
122            e14: i8,
123            e13: i8,
124            e12: i8,
125            e11: i8,
126            e10: i8,
127            e9: i8,
128            e8: i8,
129            e7: i8,
130            e6: i8,
131            e5: i8,
132            e4: i8,
133            e3: i8,
134            e2: i8,
135            e1: i8,
136            e0: i8,
137        ) -> __m128i;
138        fn _mm_set1_epi64x(a: i64) -> __m128i;
139        fn _mm_set1_epi32(a: i32) -> __m128i;
140        fn _mm_set1_epi16(a: i16) -> __m128i;
141        fn _mm_set1_epi8(a: i8) -> __m128i;
142        fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i;
143        fn _mm_setr_epi16(
144            e7: i16,
145            e6: i16,
146            e5: i16,
147            e4: i16,
148            e3: i16,
149            e2: i16,
150            e1: i16,
151            e0: i16,
152        ) -> __m128i;
153        fn _mm_setr_epi8(
154            e15: i8,
155            e14: i8,
156            e13: i8,
157            e12: i8,
158            e11: i8,
159            e10: i8,
160            e9: i8,
161            e8: i8,
162            e7: i8,
163            e6: i8,
164            e5: i8,
165            e4: i8,
166            e3: i8,
167            e2: i8,
168            e1: i8,
169            e0: i8,
170        ) -> __m128i;
171        fn _mm_setzero_si128() -> __m128i;
172        unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i;
173        unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i;
174        unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i;
175        unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8);
176        unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i);
177        unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i);
178        unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i);
179        unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i);
180        unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32);
181        fn _mm_move_epi64(a: __m128i) -> __m128i;
182        fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i;
183        fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i;
184        fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i;
185        fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32;
186        fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i;
187        fn _mm_movemask_epi8(a: __m128i) -> i32;
188        fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i;
189        fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i;
190        fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i;
191        fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i;
192        fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i;
193        fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i;
194        fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i;
195        fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i;
196        fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i;
197        fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i;
198        fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i;
199        fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d;
200        fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d;
201        fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d;
202        fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d;
203        fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d;
204        fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d;
205        fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d;
206        fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d;
207        fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d;
208        fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d;
209        fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d;
210        fn _mm_sqrt_pd(a: __m128d) -> __m128d;
211        fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d;
212        fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d;
213        fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d;
214        fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d;
215        fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d;
216        fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d;
217        fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d;
218        fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d;
219        fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d;
220        fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d;
221        fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d;
222        fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d;
223        fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d;
224        fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d;
225        fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d;
226        fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d;
227        fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d;
228        fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d;
229        fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d;
230        fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d;
231        fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d;
232        fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d;
233        fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d;
234        fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d;
235        fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d;
236        fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d;
237        fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d;
238        fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d;
239        fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d;
240        fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d;
241        fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32;
242        fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32;
243        fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32;
244        fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32;
245        fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32;
246        fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32;
247        fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32;
248        fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32;
249        fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32;
250        fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32;
251        fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32;
252        fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32;
253        fn _mm_cvtpd_ps(a: __m128d) -> __m128;
254        fn _mm_cvtps_pd(a: __m128) -> __m128d;
255        fn _mm_cvtpd_epi32(a: __m128d) -> __m128i;
256        fn _mm_cvtsd_si32(a: __m128d) -> i32;
257        fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128;
258        fn _mm_cvtsd_f64(a: __m128d) -> f64;
259        fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d;
260        fn _mm_cvttpd_epi32(a: __m128d) -> __m128i;
261        fn _mm_cvttsd_si32(a: __m128d) -> i32;
262        fn _mm_cvttps_epi32(a: __m128) -> __m128i;
263        fn _mm_set_sd(a: f64) -> __m128d;
264        fn _mm_set1_pd(a: f64) -> __m128d;
265        fn _mm_set_pd1(a: f64) -> __m128d;
266        fn _mm_set_pd(a: f64, b: f64) -> __m128d;
267        fn _mm_setr_pd(a: f64, b: f64) -> __m128d;
268        fn _mm_setzero_pd() -> __m128d;
269        fn _mm_movemask_pd(a: __m128d) -> i32;
270        unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d;
271        unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d;
272        unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d;
273        unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d;
274        unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d);
275        unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d);
276        unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d);
277        unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d);
278        unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d);
279        unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d);
280        unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d);
281        unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d);
282        unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d);
283        unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d;
284        unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d;
285        unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d;
286        unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d;
287        fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d;
288        fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d;
289        fn _mm_castpd_ps(a: __m128d) -> __m128;
290        fn _mm_castpd_si128(a: __m128d) -> __m128i;
291        fn _mm_castps_pd(a: __m128) -> __m128d;
292        fn _mm_castps_si128(a: __m128) -> __m128i;
293        fn _mm_castsi128_pd(a: __m128i) -> __m128d;
294        fn _mm_castsi128_ps(a: __m128i) -> __m128;
295        fn _mm_undefined_pd() -> __m128d;
296        fn _mm_undefined_si128() -> __m128i;
297        fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d;
298        fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d;
299    }
300}