polyval/backend/intrinsics/
x86.rs1#![allow(unsafe_op_in_unsafe_fn)]
22
23use super::ExpandedKey;
24use crate::{Block, ParBlocks, field_element::FieldElement};
25
26#[cfg(target_arch = "x86")]
27use core::arch::x86::*;
28#[cfg(target_arch = "x86_64")]
29use core::arch::x86_64::*;
30
/// Constant with bits 63, 62 and 57 set, i.e. the polynomial `x^63 + x^62 + x^57`.
///
/// It is fed to the carry-less multiplier as one operand in `compute_d` and `reduce_rf`.
const P1: u64 = 0xC200_0000_0000_0000;
33
34cpufeatures::new!(clmul, "vpclmulqdq");
35pub(crate) use clmul::InitToken;
36
/// Plain 16-byte buffer used when moving data between byte form and a `__m128i`
/// register (see `load_bytes` and `FieldElement::from_m128i`).
type ByteArray = [u8; 16];
39
40impl FieldElement {
41 #[target_feature(enable = "sse2")]
42 #[inline]
43 unsafe fn from_m128i(reg: __m128i) -> Self {
44 let mut out = ByteArray::default();
45 _mm_storeu_si128(out.as_mut_ptr().cast(), reg);
46 out.into()
47 }
48
49 #[target_feature(enable = "sse2")]
50 #[inline]
51 unsafe fn to_m128i(self) -> __m128i {
52 load_bytes(&self.into())
53 }
54}
55
56#[target_feature(enable = "sse2")]
61#[inline]
62unsafe fn load_bytes(bytes: &ByteArray) -> __m128i {
63 _mm_loadu_si128(bytes.as_ptr().cast())
64}
65
66#[target_feature(enable = "avx", enable = "pclmulqdq")]
72#[inline]
73pub(super) unsafe fn proc_block(
74 key: &ExpandedKey,
75 acc: FieldElement,
76 block: &Block,
77) -> FieldElement {
78 let data = load_bytes(&block.0);
79
80 let y = _mm_xor_si128(acc.to_m128i(), data);
82
83 FieldElement::from_m128i(gf128_mul_rf(y, key.h1.to_m128i(), key.d1.to_m128i()))
85}
86
87#[target_feature(enable = "avx", enable = "pclmulqdq")]
91#[inline]
92pub(super) unsafe fn proc_par_blocks(
93 key: &ExpandedKey,
94 acc: FieldElement,
95 par_blocks: &ParBlocks,
96) -> FieldElement {
97 let m0 = load_bytes(&par_blocks[0].0);
99 let m1 = load_bytes(&par_blocks[1].0);
100 let m2 = load_bytes(&par_blocks[2].0);
101 let m3 = load_bytes(&par_blocks[3].0);
102
103 let y0 = _mm_xor_si128(acc.to_m128i(), m0);
105
106 let (r0, f0) = rf_mul_unreduced(y0, key.h4.to_m128i(), key.d4.to_m128i());
108 let (r1, f1) = rf_mul_unreduced(m1, key.h3.to_m128i(), key.d3.to_m128i());
109 let (r2, f2) = rf_mul_unreduced(m2, key.h2.to_m128i(), key.d2.to_m128i());
110 let (r3, f3) = rf_mul_unreduced(m3, key.h1.to_m128i(), key.d1.to_m128i());
111
112 let r = _mm_xor_si128(_mm_xor_si128(r0, r1), _mm_xor_si128(r2, r3));
114 let f = _mm_xor_si128(_mm_xor_si128(f0, f1), _mm_xor_si128(f2, f3));
115
116 FieldElement::from_m128i(reduce_rf(r, f))
118}
119
120#[target_feature(enable = "avx", enable = "pclmulqdq")]
125pub(super) unsafe fn expand_key(h: &[u8; 16]) -> ExpandedKey {
126 let h1 = load_bytes(h);
127 let d1 = compute_d(h1);
128
129 let h2 = gf128_mul_rf(h1, h1, d1);
131 let d2 = compute_d(h2);
132
133 let h3 = gf128_mul_rf(h2, h1, d1);
134 let d3 = compute_d(h3);
135
136 let h4 = gf128_mul_rf(h2, h2, d2);
137 let d4 = compute_d(h4);
138
139 ExpandedKey {
140 h1: FieldElement::from_m128i(h1),
141 d1: FieldElement::from_m128i(d1),
142 h2: FieldElement::from_m128i(h2),
143 d2: FieldElement::from_m128i(d2),
144 h3: FieldElement::from_m128i(h3),
145 d3: FieldElement::from_m128i(d3),
146 h4: FieldElement::from_m128i(h4),
147 d4: FieldElement::from_m128i(d4),
148 }
149}
150
151#[target_feature(enable = "avx", enable = "pclmulqdq")]
155#[inline]
156unsafe fn compute_d(h: __m128i) -> __m128i {
157 #[allow(clippy::cast_possible_wrap)]
159 let p = _mm_set_epi64x(P1 as i64, 0);
160
161 let h_swap = _mm_shuffle_epi32(h, 0x4e);
163
164 let t = _mm_clmulepi64_si128(h, p, 0x10);
166
167 _mm_xor_si128(h_swap, t)
169}
170
/// Computes the two unreduced 128-bit halves `(R, F)` of a product.
///
/// With `m = (M1, M0)`, `h = (H1, H0)` and `d = (D1, D0)` as (high, low) 64-bit halves:
///
/// * `R = clmul(M0, D1) ^ clmul(M1, H1)`
/// * `F = clmul(M0, D0) ^ clmul(M1, H0)`
///
/// In the selector, bit 0 picks the qword of the first operand and bit 4 picks the
/// qword of the second operand.
///
/// # Safety
/// The executing CPU must support AVX and PCLMULQDQ.
#[target_feature(enable = "avx", enable = "pclmulqdq")]
#[inline]
unsafe fn rf_mul_unreduced(m: __m128i, h: __m128i, d: __m128i) -> (__m128i, __m128i) {
    let m0_d1 = _mm_clmulepi64_si128::<0x10>(m, d);
    let m1_h1 = _mm_clmulepi64_si128::<0x11>(m, h);

    let m0_d0 = _mm_clmulepi64_si128::<0x00>(m, d);
    let m1_h0 = _mm_clmulepi64_si128::<0x01>(m, h);

    (
        _mm_xor_si128(m0_d1, m1_h1),
        _mm_xor_si128(m0_d0, m1_h0),
    )
}
193
194#[target_feature(enable = "avx", enable = "pclmulqdq")]
198#[inline]
199unsafe fn reduce_rf(r: __m128i, f: __m128i) -> __m128i {
200 #[allow(clippy::cast_possible_wrap)]
202 let p1 = _mm_set_epi64x(0, P1 as i64);
203
204 let f1 = _mm_srli_si128(f, 8);
206
207 let f0_shifted = _mm_slli_si128(f, 8);
209
210 let p1_f0 = _mm_clmulepi64_si128(f, p1, 0x00);
212
213 let result = _mm_xor_si128(r, f1);
215 let result = _mm_xor_si128(result, f0_shifted);
216 _mm_xor_si128(result, p1_f0)
217}
218
219#[target_feature(enable = "avx", enable = "pclmulqdq")]
221#[inline]
222unsafe fn gf128_mul_rf(m: __m128i, h: __m128i, d: __m128i) -> __m128i {
223 let (r, f) = rf_mul_unreduced(m, h, d);
224 reduce_rf(r, f)
225}