poly1305/backend/avx2/helpers.rs
1//! AVX2 helpers for implementing Poly1305 using 26-bit limbs.
2#![allow(unsafe_op_in_unsafe_fn)]
3
4use core::fmt;
5use core::ops::{Add, Mul};
6
7#[cfg(target_arch = "x86")]
8use core::arch::x86::*;
9#[cfg(target_arch = "x86_64")]
10use core::arch::x86_64::*;
11
12use super::ParBlocks;
13use crate::{Block, Key};
14
15const fn set02(x3: u8, x2: u8, x1: u8, x0: u8) -> i32 {
16 (((x3) << 6) | ((x2) << 4) | ((x1) << 2) | (x0)) as i32
17}
18
19/// Helper for Display impls of aligned values.
20fn write_130(f: &mut fmt::Formatter<'_>, limbs: [u32; 5]) -> fmt::Result {
21 let r0 = u128::from(limbs[0]);
22 let r1 = u128::from(limbs[1]);
23 let r2 = u128::from(limbs[2]);
24 let r3 = u128::from(limbs[3]);
25 let r4 = u128::from(limbs[4]);
26
27 // Reduce into two u128s
28 let l0 = r0 + (r1 << 26) + (r2 << 52) + (r3 << 78);
29 let (l0, c) = l0.overflowing_add(r4 << 104);
30 let l1 = (r4 >> 24) + if c { 1 } else { 0 };
31
32 write!(f, "0x{:02x}{:032x}", l1, l0)
33}
34
35/// Helper for Display impls of unreduced values.
36fn write_130_wide(f: &mut fmt::Formatter<'_>, limbs: [u64; 5]) -> fmt::Result {
37 let r0 = u128::from(limbs[0]);
38 let r1 = u128::from(limbs[1]);
39 let r2 = u128::from(limbs[2]);
40 let r3 = u128::from(limbs[3]);
41 let r4 = u128::from(limbs[4]);
42
43 // Reduce into two u128s
44 let l0 = r0 + (r1 << 26) + (r2 << 52);
45 let (l0, c1) = l0.overflowing_add(r3 << 78);
46 let (l0, c2) = l0.overflowing_add(r4 << 104);
47 let l1 = (r3 >> 50) + (r4 >> 24) + if c1 { 1 } else { 0 } + if c2 { 1 } else { 0 };
48
49 write!(f, "0x{:02x}{:032x}", l1, l0)
50}
51
52/// Derives the Poly1305 addition and polynomial keys.
53#[target_feature(enable = "avx2")]
54pub(super) unsafe fn prepare_keys(key: &Key) -> (AdditionKey, PrecomputedMultiplier) {
55 // [k7, k6, k5, k4, k3, k2, k1, k0]
56 let key = _mm256_loadu_si256(key.as_ptr().cast());
57
58 // Prepare addition key: [0, k7, 0, k6, 0, k5, 0, k4]
59 let k = AdditionKey(_mm256_and_si256(
60 _mm256_permutevar8x32_epi32(key, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)),
61 _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1),
62 ));
63
64 // Prepare polynomial key R = k & 0xffffffc0ffffffc0ffffffc0fffffff:
65 let r = Aligned130::new(_mm256_and_si256(
66 key,
67 _mm256_set_epi32(0, 0, 0, 0, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc, 0x0fffffff),
68 ));
69
70 (k, r.into())
71}
72
73/// A 130-bit integer aligned across five 26-bit limbs.
74///
75/// The top three 32-bit words of the underlying 256-bit vector are ignored.
76#[derive(Clone, Copy, Debug)]
77pub(super) struct Aligned130(pub(super) __m256i);
78
79impl fmt::Display for Aligned130 {
80 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
81 let mut v0 = [0u8; 32];
82 unsafe {
83 _mm256_storeu_si256(v0.as_mut_ptr().cast(), self.0);
84 }
85
86 write!(f, "Aligned130(")?;
87 write_130(
88 f,
89 [
90 u32::from_le_bytes(v0[0..4].try_into().unwrap()),
91 u32::from_le_bytes(v0[4..8].try_into().unwrap()),
92 u32::from_le_bytes(v0[8..12].try_into().unwrap()),
93 u32::from_le_bytes(v0[12..16].try_into().unwrap()),
94 u32::from_le_bytes(v0[16..20].try_into().unwrap()),
95 ],
96 )?;
97 write!(f, ")")
98 }
99}
100
101impl Aligned130 {
102 /// Aligns a 16-byte Poly1305 block at 26-bit boundaries within 32-bit words, and sets
103 /// the high bit.
104 #[target_feature(enable = "avx2")]
105 pub(super) unsafe fn from_block(block: &Block) -> Self {
106 Aligned130::new(_mm256_or_si256(
107 _mm256_and_si256(
108 // Load the 128-bit block into a 256-bit vector.
109 _mm256_castsi128_si256(_mm_loadu_si128(block.as_ptr().cast())),
110 // Mask off the upper 128 bits (undefined by _mm256_castsi128_si256).
111 _mm256_set_epi64x(0, 0, -1, -1),
112 ),
113 // Set the high bit.
114 _mm256_set_epi64x(0, 1, 0, 0),
115 ))
116 }
117
118 /// Aligns a partial Poly1305 block at 26-bit boundaries within 32-bit words.
119 ///
120 /// Assumes that the high bit is already correctly set for the partial block.
121 #[target_feature(enable = "avx2")]
122 pub(super) unsafe fn from_partial_block(block: &Block) -> Self {
123 Aligned130::new(_mm256_and_si256(
124 // Load the 128-bit block into a 256-bit vector.
125 _mm256_castsi128_si256(_mm_loadu_si128(block.as_ptr().cast())),
126 // Mask off the upper 128 bits (undefined by _mm256_castsi128_si256).
127 _mm256_set_epi64x(0, 0, -1, -1),
128 ))
129 }
130
131 /// Splits a 130-bit integer into five 26-bit limbs.
132 #[target_feature(enable = "avx2")]
133 unsafe fn new(x: __m256i) -> Self {
134 // Starting from a 130-bit integer split across 32-bit words:
135 // [0, 0, 0, [0; 30] || x4[2..0], x3, x2, x1, x0]
136
137 // - Grab the low bits of each word:
138 // x1 = [
139 // [0; 32],
140 // [0; 32],
141 // [0; 32],
142 // [0; 6] || x4[ 2..0] || [0; 24],
143 // x3[14..0] || [0; 18],
144 // x2[20..0] || [0; 12],
145 // x1[26..0] || [0; 6],
146 // x0,
147 // ]
148 let xl = _mm256_sllv_epi32(x, _mm256_set_epi32(32, 32, 32, 24, 18, 12, 6, 0));
149
150 // Grab the high bits of each word, rotated up by one word:
151 // xh = [
152 // [0; 32],
153 // [0; 32],
154 // [0; 32],
155 // [0; 8] || x3[32.. 8]
156 // [0; 14] || x2[32..14]
157 // [0; 20] || x1[32..20]
158 // [0; 26] || x0[32..26],
159 // [0; 32],
160 // ]
161 let xh = _mm256_permutevar8x32_epi32(
162 _mm256_srlv_epi32(x, _mm256_set_epi32(32, 32, 32, 2, 8, 14, 20, 26)),
163 _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7),
164 );
165
166 // - Combine the low and high bits:
167 // [
168 // [0; 32],
169 // [0; 32],
170 // [0; 32],
171 // [0; 6] || x4[ 2..0] || x3[32.. 8]
172 // x3[14..0] || x2[32..14]
173 // x2[20..0] || x1[32..20]
174 // x1[26..0] || x0[32..26],
175 // x0,
176 // ]
177 // - Mask to 26 bits:
178 // [
179 // [0; 32],
180 // [0; 32],
181 // [0; 32],
182 // [0; 6] || x4[ 2..0] || x3[32.. 8]
183 // [0; 6] || x3[ 8..0] || x2[32..14]
184 // [0; 6] || x2[14..0] || x1[32..20]
185 // [0; 6] || x1[20..0] || x0[32..26],
186 // [0; 6] || x0[26..0],
187 // ]
188 Aligned130(_mm256_and_si256(
189 _mm256_or_si256(xl, xh),
190 _mm256_set_epi32(
191 0, 0, 0, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff,
192 ),
193 ))
194 }
195}
196
197impl Add<Aligned130> for Aligned130 {
198 type Output = Aligned130;
199
200 fn add(self, other: Aligned130) -> Aligned130 {
201 // With 26-bit limbs inside 32-bit words, there is plenty of space for unreduced
202 // addition.
203 unsafe { Aligned130(_mm256_add_epi32(self.0, other.0)) }
204 }
205}
206
207/// A pre-computed multiplier.
208#[derive(Clone, Copy, Debug)]
209pub(super) struct PrecomputedMultiplier {
210 pub(super) a: __m256i,
211 pub(super) a_5: __m256i,
212}
213
214impl From<Aligned130> for PrecomputedMultiplier {
215 fn from(r: Aligned130) -> Self {
216 unsafe {
217 // Precompute 5*R.
218 //
219 // The 5-limb representation (r_4, r_3, r_2, r_1, r_0) of R and
220 // (5·r4, 5·r3, 5·r2, 5·r1) are represented in two 256-bit vectors in the
221 // following manner:
222 // r1: [5·r_4, 5·r_3, 5·r_2, r_4, r_3, r_2, r_1, r_0]
223 // r1_5: [5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1]
224 let a_5 = _mm256_permutevar8x32_epi32(
225 _mm256_add_epi32(r.0, _mm256_slli_epi32(r.0, 2)),
226 _mm256_set_epi32(4, 3, 2, 1, 1, 1, 1, 1),
227 );
228 let a = _mm256_blend_epi32(r.0, a_5, 0b11100000);
229 let a_5 = _mm256_permute2x128_si256(a_5, a_5, 0);
230
231 PrecomputedMultiplier { a, a_5 }
232 }
233 }
234}
235
236impl Mul<PrecomputedMultiplier> for PrecomputedMultiplier {
237 type Output = Unreduced130;
238
239 fn mul(self, other: PrecomputedMultiplier) -> Unreduced130 {
240 // Pass through to `self.a` for multiplication.
241 Aligned130(self.a) * other
242 }
243}
244
245impl Mul<PrecomputedMultiplier> for Aligned130 {
246 type Output = Unreduced130;
247
248 /// Multiplies 2 values using lazy reduction.
249 ///
250 /// Context switches from 32 bit to 64 bit.
251 #[inline(always)]
252 fn mul(self, other: PrecomputedMultiplier) -> Unreduced130 {
253 unsafe {
254 // Starting with the following limb layout:
255 // x = [ 0, 0, 0, x_4, x_3, x_2, x_1, x_0]
256 // y = [5·r_4, 5·r_3, 5·r_2, r_4, r_3, r_2, r_1, r_0]
257 // z = [5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1]
258 let x = self.0;
259 let y = other.a;
260 let z = other.a_5;
261
262 // [ 0, x_4, 0, x_3, 0, x_2, 0, x_1] (32-bit words)
263 // * 5·r_4
264 // = [5·r_4·x_4, 5·r_4·x_3, 5·r_4·x_2, 5·r_4·x_1] (64-bit words)
265 let v0 = _mm256_mul_epu32(
266 _mm256_permutevar8x32_epi32(x, _mm256_set_epi64x(4, 3, 2, 1)),
267 _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(7, 7, 7, 7)),
268 );
269 // [ 0, x_3, 0, x_2, 0, x_1, 0, x_0] (32-bit words)
270 // * r_0
271 // = [ r_0·x_3, r_0·x_2, r_0·x_1, r_0·x_0] (64-bit words)
272 // + previous step
273 // = [
274 // r_0·x_3 + 5·r_4·x_4,
275 // r_0·x_2 + 5·r_4·x_3,
276 // r_0·x_1 + 5·r_4·x_2,
277 // r_0·x_0 + 5·r_4·x_1,
278 // ]
279 let v0 = _mm256_add_epi64(
280 v0,
281 _mm256_mul_epu32(
282 _mm256_permutevar8x32_epi32(x, _mm256_set_epi64x(3, 2, 1, 0)),
283 _mm256_broadcastd_epi32(_mm256_castsi256_si128(y)),
284 ),
285 );
286 // [ 0, x_1, 0, x_1, 0, x_3, 0, x_3]
287 // * [ 0, r_2, 0, r_1, 0, 5·r_3, 0, 5·r_2]
288 // = [r_2·x_1, r_1·x_1, 5·r_3·x_3, 5·r_2·x_3]
289 // + previous step
290 // = [
291 // r_0·x_3 + r_2·x_1 + 5·r_4·x_4,
292 // r_0·x_2 + r_1·x_1 + 5·r_4·x_3,
293 // r_0·x_1 + 5·r_3·x_3 + 5·r_4·x_2,
294 // r_0·x_0 + 5·r_2·x_3 + 5·r_4·x_1,
295 // ]
296 let v0 = _mm256_add_epi64(
297 v0,
298 _mm256_mul_epu32(
299 _mm256_permutevar8x32_epi32(x, _mm256_set_epi64x(1, 1, 3, 3)),
300 _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(2, 1, 6, 5)),
301 ),
302 );
303 // [x_3, x_2, x_1, x_0, x_1, x_0, 0, x_4]
304 // * [ 0, r_1, 0, r_2, 0, r_1, 5·r_1, 5·r_1]
305 // = [ r_1·x_2, r_2·x_0, r_1·x_0, 5·r_1·x_4]
306 // + previous step
307 // = [
308 // r_0·x_3 + r_1·x_2 + r_2·x_1 + 5·r_4·x_4,
309 // r_0·x_2 + r_1·x_1 + r_2·x_0 + 5·r_4·x_3,
310 // r_0·x_1 + r_1·x_0 + 5·r_3·x_3 + 5·r_4·x_2,
311 // r_0·x_0 + 5·r_1·x_4 + 5·r_2·x_3 + 5·r_4·x_1,
312 // ]
313 let v0 = _mm256_add_epi64(
314 v0,
315 _mm256_mul_epu32(
316 _mm256_permute4x64_epi64(x, set02(1, 0, 0, 2)),
317 _mm256_blend_epi32(
318 _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(1, 2, 1, 1)),
319 z,
320 0x03,
321 ),
322 ),
323 );
324 // [x_1, x_0, 0, x_4, 0, x_4, x_3, x_2]
325 // * [ 0, r_3, 0, 5·r_3, 0, 5·r_2, 0, 5·r_3]
326 // = [ r_3·x_0, 5·r_3·x_4, 5·r_2·x_4, 5·r_3·x_2]
327 // + previous step
328 // v0 = [
329 // r_0·x_3 + r_1·x_2 + r_2·x_1 + r_3·x_0 + 5·r_4·x_4,
330 // r_0·x_2 + r_1·x_1 + r_2·x_0 + 5·r_3·x_4 + 5·r_4·x_3,
331 // r_0·x_1 + r_1·x_0 + 5·r_2·x_4 + 5·r_3·x_3 + 5·r_4·x_2,
332 // r_0·x_0 + 5·r_1·x_4 + 5·r_2·x_3 + 5·r_3·x_2 + 5·r_4·x_1,
333 // ]
334 let v0 = _mm256_add_epi64(
335 v0,
336 _mm256_mul_epu32(
337 _mm256_permute4x64_epi64(x, set02(0, 2, 2, 1)),
338 _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(3, 6, 5, 6)),
339 ),
340 );
341
342 // [ 0, x_3, 0, x_2, 0, x_1, 0, x_0]
343 // * [ 0, r_1, 0, r_2, 0, r_3, 0, r_4]
344 // = [r_1·x_3, r_2·x_2, r_3·x_1, r_4·x_0]
345 let v1 = _mm256_mul_epu32(
346 _mm256_permutevar8x32_epi32(x, _mm256_set_epi64x(3, 2, 1, 0)),
347 _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(1, 2, 3, 4)),
348 );
349 // [r_3·x_1, r_4·x_0, r_1·x_3, r_2·x_2]
350 // + previous step
351 // = [
352 // r_1·x_3 + r_3·x_1,
353 // r_2·x_2 + r_4·x_0,
354 // r_1·x_3 + r_3·x_1,
355 // r_2·x_2 + r_4·x_0,
356 // ]
357 let v1 = _mm256_add_epi64(v1, _mm256_permute4x64_epi64(v1, set02(1, 0, 3, 2)));
358 // [
359 // r_2·x_2 + r_4·x_0,
360 // r_2·x_2 + r_4·x_0,
361 // r_2·x_2 + r_4·x_0,
362 // r_1·x_3 + r_3·x_1,
363 // ]
364 // + previous step
365 // = [
366 // r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
367 // 2·r_2·x_2 + 2·r_4·x_0,
368 // r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
369 // r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
370 // ]
371 let v1 = _mm256_add_epi64(v1, _mm256_permute4x64_epi64(v1, set02(0, 0, 0, 1)));
372 // [ x_1, x_0, x_1, x_0, x_1, x_0, 0, x_4]
373 // * [5·r_4, 5·r_3, 5·r_2, r_4, r_3, r_2, r_1, r_0]
374 // = [ 5·r_3·x_0, r_4·x_0, r_2·x_0, r_0·x_4]
375 // + previous step
376 // v1 = [
377 // 5·r_3·x_0 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
378 // 2·r_2·x_2 + 3·r_4·x_0,
379 // r_2·x_0 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
380 // r_0·x_4 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
381 // ]
382 let v1 = _mm256_add_epi64(
383 v1,
384 _mm256_mul_epu32(_mm256_permute4x64_epi64(x, set02(0, 0, 0, 2)), y),
385 );
386
387 // The result:
388 // v1 = [
389 // 5·r_3·x_0 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
390 // 2·r_2·x_2 + 3·r_4·x_0,
391 // r_2·x_0 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
392 // r_0·x_4 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0,
393 // ]
394 // v0 = [
395 // r_0·x_3 + r_1·x_2 + r_2·x_1 + r_3·x_0 + 5·r_4·x_4,
396 // r_0·x_2 + r_1·x_1 + r_2·x_0 + 5·r_3·x_4 + 5·r_4·x_3,
397 // r_0·x_1 + r_1·x_0 + 5·r_2·x_4 + 5·r_3·x_3 + 5·r_4·x_2,
398 // r_0·x_0 + 5·r_1·x_4 + 5·r_2·x_3 + 5·r_3·x_2 + 5·r_4·x_1,
399 // ]
400 // This corresponds to (3) in Goll Gueron 2015:
401 // v1 = [ _, _, _, t_4]
402 // v0 = [t_3, t_2, t_1, t_0]
403 Unreduced130 { v0, v1 }
404 }
405 }
406}
407
408/// The unreduced output of an `Aligned130` multiplication.
409///
410/// Represented internally with 64-bit limbs.
411#[derive(Copy, Clone, Debug)]
412pub(super) struct Unreduced130 {
413 v0: __m256i,
414 v1: __m256i,
415}
416
417impl fmt::Display for Unreduced130 {
418 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
419 let mut v0 = [0u8; 32];
420 let mut v1 = [0u8; 32];
421 unsafe {
422 _mm256_storeu_si256(v0.as_mut_ptr().cast(), self.v0);
423 _mm256_storeu_si256(v1.as_mut_ptr().cast(), self.v1);
424 }
425
426 write!(f, "Unreduced130(")?;
427 write_130_wide(
428 f,
429 [
430 u64::from_le_bytes(v0[0..8].try_into().unwrap()),
431 u64::from_le_bytes(v0[8..16].try_into().unwrap()),
432 u64::from_le_bytes(v0[16..24].try_into().unwrap()),
433 u64::from_le_bytes(v0[24..32].try_into().unwrap()),
434 u64::from_le_bytes(v1[0..8].try_into().unwrap()),
435 ],
436 )?;
437 write!(f, ")")
438 }
439}
440
441impl Unreduced130 {
442 /// Reduces x modulo 2^130 - 5.
443 ///
444 /// Context switches from 64 bit to 32 bit.
445 #[inline(always)]
446 pub(super) fn reduce(self) -> Aligned130 {
447 unsafe {
448 // Starting with the following limb layout:
449 // self.v1 = [ _, _, _, t_4]
450 // self.v0 = [t_3, t_2, t_1, t_0]
451 let (red_1, red_0) = adc(self.v1, self.v0);
452 let (red_1, red_0) = red(red_1, red_0);
453 let (red_1, red_0) = adc(red_1, red_0);
454
455 // - Switch context from 64-bit limbs to 32-bit limbs:
456 Aligned130(_mm256_blend_epi32(
457 _mm256_permutevar8x32_epi32(red_0, _mm256_set_epi32(0, 6, 4, 0, 6, 4, 2, 0)),
458 _mm256_permutevar8x32_epi32(red_1, _mm256_set_epi32(0, 6, 4, 0, 6, 4, 2, 0)),
459 0x90,
460 ))
461 }
462 }
463}
464
465/// Carry chain
466#[inline(always)]
467unsafe fn adc(v1: __m256i, v0: __m256i) -> (__m256i, __m256i) {
468 // [t_3, t_2 % 2^26, t_1 % 2^26, t_0 % 2^26]
469 // + [t_2 >> 26, t_1 >> 26, t_0 >> 26, 0 ]
470 // = [
471 // t_3 + t_2 >> 26,
472 // t_2 % 2^26 + t_1 >> 26,
473 // t_1 % 2^26 + t_0 >> 26,
474 // t_0 % 2^26,
475 // ]
476 let v0 = _mm256_add_epi64(
477 _mm256_and_si256(v0, _mm256_set_epi64x(-1, 0x3ffffff, 0x3ffffff, 0x3ffffff)),
478 _mm256_permute4x64_epi64(
479 _mm256_srlv_epi64(v0, _mm256_set_epi64x(64, 26, 26, 26)),
480 set02(2, 1, 0, 3),
481 ),
482 );
483 // [_, _, _, t_4]
484 // + [
485 // (t_2 % 2^26 + t_1 >> 26) >> 26,
486 // (t_1 % 2^26 + t_0 >> 26) >> 26,
487 // (t_0 % 2^26 ) >> 26,
488 // (t_3 + t_2 >> 26) >> 26,
489 // ]
490 // = [_, _, _, t_4 + (t_3 + t_2 >> 26) >> 26]
491 let v1 = _mm256_add_epi64(
492 v1,
493 _mm256_permute4x64_epi64(_mm256_srli_epi64(v0, 26), set02(2, 1, 0, 3)),
494 );
495 // [
496 // (t_3 + t_2 >> 26) % 2^26,
497 // t_2 % 2^26 + t_1 >> 26,
498 // t_1 % 2^26 + t_0 >> 26,
499 // t_0 % 2^26,
500 // ]
501 let chain = _mm256_and_si256(v0, _mm256_set_epi64x(0x3ffffff, -1, -1, -1));
502
503 (v1, chain)
504}
505
506/// Reduction modulus 2^130-5
507#[inline(always)]
508unsafe fn red(v1: __m256i, v0: __m256i) -> (__m256i, __m256i) {
509 // t = [0, 0, 0, t_4 >> 26]
510 let t = _mm256_srlv_epi64(v1, _mm256_set_epi64x(64, 64, 64, 26));
511 // v0 + 5·t = [t_3, t_2, t_1, t_0 + 5·(t_4 >> 26)]
512 let red_0 = _mm256_add_epi64(_mm256_add_epi64(v0, t), _mm256_slli_epi64(t, 2));
513 // [0, 0, 0, t_4 % 2^26]
514 let red_1 = _mm256_and_si256(v1, _mm256_set_epi64x(0, 0, 0, 0x3ffffff));
515 (red_1, red_0)
516}
517
518/// A pair of `Aligned130`s.
519#[derive(Clone, Debug)]
520pub(super) struct Aligned2x130 {
521 v0: Aligned130,
522 v1: Aligned130,
523}
524
525impl fmt::Display for Aligned2x130 {
526 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
527 writeln!(f, "Aligned2x130([")?;
528 writeln!(f, " {},", self.v0)?;
529 writeln!(f, " {},", self.v1)?;
530 write!(f, "])")
531 }
532}
533
534impl Aligned2x130 {
535 /// Aligns two 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and
536 /// sets the high bit for each block.
537 ///
538 /// # Panics
539 ///
540 /// Panics if `src.len() < 32`.
541 #[target_feature(enable = "avx2")]
542 pub(super) unsafe fn from_blocks(src: &[Block; 2]) -> Self {
543 Aligned2x130 {
544 v0: Aligned130::from_block(&src[0]),
545 v1: Aligned130::from_block(&src[1]),
546 }
547 }
548
549 /// Multiplies 2x2 and add both results simultaneously using lazy reduction.
550 ///
551 /// Context switches from 32 bit to 64 bit.
552 #[inline(always)]
553 pub(super) fn mul_and_sum(
554 self,
555 r1: PrecomputedMultiplier,
556 r2: PrecomputedMultiplier,
557 ) -> Unreduced130 {
558 unsafe {
559 // Starting with the following limb layout:
560 // x.v1 = [ 0, 0, 0, x1_4, x1_3, x1_2, x1_1, x1_0]
561 // x.v0 = [ 0, 0, 0, x0_4, x0_3, x0_2, x0_1, x0_0]
562 // r1 = [5·r1_4, 5·r1_3, 5·r1_2, r1_4, r1_3, r1_2, r1_1, r1_0]
563 // r15 = [5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1]
564 // r2 = [5·r2_4, 5·r2_3, 5·r2_2, r2_4, r2_3, r2_2, r2_1, r2_0]
565 // r25 = [5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1]
566 let x = self;
567 let r15 = r1.a_5;
568 let r25 = r2.a_5;
569 let r1 = r1.a;
570 let r2 = r2.a;
571
572 // v0 = [
573 // 5·x0_4·r2_4,
574 // 5·x0_3·r2_4,
575 // 5·x0_2·r2_4,
576 // 5·x0_1·r2_4,
577 // ]
578 let mut v0 = _mm256_mul_epu32(
579 // [_, x0_4, _, x0_3, _, x0_2, _, x0_1]
580 _mm256_permutevar8x32_epi32(x.v0.0, _mm256_set_epi64x(4, 3, 2, 1)),
581 // [_, 5·r2_4, _, 5·r2_4, _, 5·r2_4, _, 5·r2_4]
582 _mm256_permutevar8x32_epi32(r2, _mm256_set1_epi64x(7)),
583 );
584 // v1 = [
585 // 5·x1_4·r1_4,
586 // 5·x1_3·r1_4,
587 // 5·x1_2·r1_4,
588 // 5·x1_1·r1_4,
589 // ]
590 let mut v1 = _mm256_mul_epu32(
591 // [_, x1_4, _, x1_3, _, x1_2, _, x1_1]
592 _mm256_permutevar8x32_epi32(x.v1.0, _mm256_set_epi64x(4, 3, 2, 1)),
593 // [_, 5·r1_4, _, 5·r1_4, _, 5·r1_4, _, 5·r1_4]
594 _mm256_permutevar8x32_epi32(r1, _mm256_set1_epi64x(7)),
595 );
596
597 // v0 = [
598 // `x0_0·r2_3`+ 5·x0_4·r2_4,
599 // `5·x0_4·r2_3`+ 5·x0_3·r2_4,
600 // `5·x0_4·r2_2` + 5·x0_2·r2_4,
601 // `5·x0_2·r2_3`+ 5·x0_1·r2_4,
602 // ]
603 v0 = _mm256_add_epi64(
604 v0,
605 _mm256_mul_epu32(
606 // [_, x0_0, _, x0_4, _, x0_4, _, x0_2]
607 _mm256_permute4x64_epi64(x.v0.0, set02(0, 2, 2, 1)),
608 // [_, r2_3, _, 5·r2_3, _, 5·r2_2, _, 5·r2_3]
609 _mm256_permutevar8x32_epi32(r2, _mm256_set_epi64x(3, 6, 5, 6)),
610 ),
611 );
612 // v1 = [
613 // `x1_0·r1_3`+ 5·x1_4·r1_4,
614 // `5·x1_4·r1_3`+ 5·x1_3·r1_4,
615 // `5·x1_4·r1_2` + 5·x1_2·r1_4,
616 // `5·x1_2·r1_3`+ 5·x1_1·r1_4,
617 // ]
618 v1 = _mm256_add_epi64(
619 v1,
620 _mm256_mul_epu32(
621 // [_, x1_0, _, x1_4, _, x1_4, _, x1_2]
622 _mm256_permute4x64_epi64(x.v1.0, set02(0, 2, 2, 1)),
623 // [_, r1_3, _, 5·r1_3, _, 5·r1_2, _, 5·r1_3]
624 _mm256_permutevar8x32_epi32(r1, _mm256_set_epi64x(3, 6, 5, 6)),
625 ),
626 );
627 // v0 = [
628 // `x0_1·r2_2`+ x0_0·r2_3 + 5·x0_4·r2_4,
629 // `x0_1·r2_1` + 5·x0_4·r2_3 + 5·x0_3·r2_4,
630 // 5·x0_4·r2_2 +`5·x0_3·r2_3`+ 5·x0_2·r2_4,
631 // `5·x0_3·r2_2`+ 5·x0_2·r2_3 + 5·x0_1·r2_4,
632 // ]
633 v0 = _mm256_add_epi64(
634 v0,
635 _mm256_mul_epu32(
636 // [_, x0_1, _, x0_1, _, x0_3, _, x0_3]
637 _mm256_permutevar8x32_epi32(x.v0.0, _mm256_set_epi64x(1, 1, 3, 3)),
638 // [_, r2_2, _, r2_1, _, 5·r2_3, _, 5·r2_2]
639 _mm256_permutevar8x32_epi32(r2, _mm256_set_epi64x(2, 1, 6, 5)),
640 ),
641 );
642 // v1 = [
643 // `x1_1·r1_2`+ x1_0·r1_3 + 5·x1_4·r1_4,
644 // `x1_1·r1_1` + 5·x1_4·r1_3 + 5·x1_3·r1_4,
645 // 5·x1_4·r1_2 +`5·x1_3·r1_3`+ 5·x1_2·r1_4,
646 // `5·x1_3·r1_2`+ 5·x1_2·r1_3 + 5·x1_1·r1_4,
647 // ]
648 v1 = _mm256_add_epi64(
649 v1,
650 _mm256_mul_epu32(
651 // [_, x1_1, _, x1_1, _, x1_3, _, x1_3]
652 _mm256_permutevar8x32_epi32(x.v1.0, _mm256_set_epi64x(1, 1, 3, 3)),
653 // [_, r1_2, _, r1_1, _, 5·r1_3, _, 5·r1_2]
654 _mm256_permutevar8x32_epi32(r1, _mm256_set_epi64x(2, 1, 6, 5)),
655 ),
656 );
657 // v0 = [
658 // `x0_3·r2_0` + x0_1·r2_2 + x0_0·r2_3 + 5·x0_4·r2_4,
659 // `x0_2·r2_0`+ x0_1·r2_1 + 5·x0_4·r2_3 + 5·x0_3·r2_4,
660 // `x0_1·r2_0` + 5·x0_4·r2_2 + 5·x0_3·r2_3 + 5·x0_2·r2_4,
661 // `x0_0·r2_0` + 5·x0_3·r2_2 + 5·x0_2·r2_3 + 5·x0_1·r2_4,
662 // ]
663 v0 = _mm256_add_epi64(
664 v0,
665 _mm256_mul_epu32(
666 // [_, x0_3, _, x0_2, _, x0_1, _, x0_0]
667 _mm256_permutevar8x32_epi32(x.v0.0, _mm256_set_epi64x(3, 2, 1, 0)),
668 // [_, r2_0, _, r2_0, _, r2_0, _, r2_0]
669 _mm256_broadcastd_epi32(_mm256_castsi256_si128(r2)),
670 ),
671 );
672 // v1 = [
673 // `x1_3·r1_0` + x1_1·r1_2 + x1_0·r1_3 + 5·x1_4·r1_4,
674 // `x1_2·r1_0`+ x1_1·r1_1 + 5·x1_4·r1_3 + 5·x1_3·r1_4,
675 // `x1_1·r1_0` + 5·x1_4·r1_2 + 5·x1_3·r1_3 + 5·x1_2·r1_4,
676 // `x1_0·r1_0` + 5·x1_3·r1_2 + 5·x1_2·r1_3 + 5·x1_1·r1_4,
677 // ]
678 v1 = _mm256_add_epi64(
679 v1,
680 _mm256_mul_epu32(
681 // [_, x1_3, _, x1_2, _, x1_1, _, x1_0]
682 _mm256_permutevar8x32_epi32(x.v1.0, _mm256_set_epi64x(3, 2, 1, 0)),
683 // [_, r1_0, _, r1_0, _, r1_0, _, r1_0]
684 _mm256_broadcastd_epi32(_mm256_castsi256_si128(r1)),
685 ),
686 );
687
688 // t0 = [x0_3, x0_2, x0_1, x0_0, x0_1, x0_0, 0, x0_4]
689 // t1 = [x1_3, x1_2, x1_1, x1_0, x1_1, x1_0, 0, x1_4]
690 let mut t0 = _mm256_permute4x64_epi64(x.v0.0, set02(1, 0, 0, 2));
691 let mut t1 = _mm256_permute4x64_epi64(x.v1.0, set02(1, 0, 0, 2));
692
693 // v0 = [
694 // x0_3·r2_0 + `x0_2·r2_1`+ x0_1·r2_2 + x0_0·r2_3 + 5·x0_4·r2_4,
695 // x0_2·r2_0 + x0_1·r2_1 + `x0_0·r2_2`+ 5·x0_4·r2_3 + 5·x0_3·r2_4,
696 // x0_1·r2_0 + `x0_0·r2_1`+ 5·x0_4·r2_2 + 5·x0_3·r2_3 + 5·x0_2·r2_4,
697 // x0_0·r2_0 +`5·x0_4·r2_1`+ 5·x0_3·r2_2 + 5·x0_2·r2_3 + 5·x0_1·r2_4,
698 // ]
699 v0 = _mm256_add_epi64(
700 v0,
701 _mm256_mul_epu32(
702 // [_, x0_2, _, x0_0, _, x0_0, _, x0_4]
703 t0,
704 // [_, r2_1, _, r2_2, _, r2_1, _, 5·r2_1]
705 _mm256_blend_epi32(
706 // [r2_0, r2_1, r2_0, r2_2, r2_0, r2_1, r2_0, r2_1]
707 _mm256_permutevar8x32_epi32(r2, _mm256_set_epi64x(1, 2, 1, 1)),
708 r25,
709 0b00000011,
710 ),
711 ),
712 );
713 // v1 = [
714 // x1_3·r1_0 + `x1_2·r1_1`+ x1_1·r1_2 + x1_0·r1_3 + 5·x1_4·r1_4,
715 // x1_2·r1_0 + x1_1·r1_1 + `x1_0·r1_2`+ 5·x1_4·r1_3 + 5·x1_3·r1_4,
716 // x1_1·r1_0 + `x1_0·r1_1`+ 5·x1_4·r1_2 + 5·x1_3·r1_3 + 5·x1_2·r1_4,
717 // x1_0·r1_0 +`5·x1_4·r1_1`+ 5·x1_3·r1_2 + 5·x1_2·r1_3 + 5·x1_1·r1_4,
718 // ]
719 v1 = _mm256_add_epi64(
720 v1,
721 _mm256_mul_epu32(
722 // [_, x1_2, _, x1_0, _, x1_0, _, x1_4]
723 t1,
724 // [_, r1_1, _, r1_2, _, r1_1, _, 5·r1_1]
725 _mm256_blend_epi32(
726 // [r1_0, r1_1, r1_0, r1_2, r1_0, r1_1, r1_0, r1_1]
727 _mm256_permutevar8x32_epi32(r1, _mm256_set_epi64x(1, 2, 1, 1)),
728 r15,
729 0b00000011,
730 ),
731 ),
732 );
733 // v0 = [
734 // x0_3·r2_0 + x0_2·r2_1 + x0_1·r2_2 + x0_0·r2_3 + 5·x0_4·r2_4 + x1_3·r1_0 + x1_2·r1_1 + x1_1·r1_2 + x1_0·r1_3 + 5·x1_4·r1_4,
735 // x0_2·r2_0 + x0_1·r2_1 + x0_0·r2_2 + 5·x0_4·r2_3 + 5·x0_3·r2_4 + x1_2·r1_0 + x1_1·r1_1 + x1_0·r1_2 + 5·x1_4·r1_3 + 5·x1_3·r1_4,
736 // x0_1·r2_0 + x0_0·r2_1 + 5·x0_4·r2_2 + 5·x0_3·r2_3 + 5·x0_2·r2_4 + x1_1·r1_0 + x1_0·r1_1 + 5·x1_4·r1_2 + 5·x1_3·r1_3 + 5·x1_2·r1_4,
737 // x0_0·r2_0 + 5·x0_4·r2_1 + 5·x0_3·r2_2 + 5·x0_2·r2_3 + 5·x0_1·r2_4 + x1_0·r1_0 + 5·x1_4·r1_1 + 5·x1_3·r1_2 + 5·x1_2·r1_3 + 5·x1_1·r1_4,
738 // ]
739 v0 = _mm256_add_epi64(v0, v1);
740
741 // t0 = [
742 // 5·x0_2·r2_3,
743 // x0_0·r2_4,
744 // x0_0·r2_2,
745 // x0_4·r2_0,
746 // ]
747 // t1 = [
748 // 5·x1_2·r1_3,
749 // x1_0·r1_4,
750 // x1_0·r1_2,
751 // x1_4·r1_0,
752 // ]
753 t0 = _mm256_mul_epu32(t0, r2);
754 t1 = _mm256_mul_epu32(t1, r1);
755
756 // v1 = [
757 // 5·x0_2·r2_3 + 5·x1_2·r1_3,
758 // x0_0·r2_4 + x1_0·r1_4,
759 // x0_0·r2_2 + x1_0·r1_2,
760 // x0_4·r2_0 + x1_4·r1_0,
761 // ]
762 v1 = _mm256_add_epi64(t0, t1);
763
764 // t0 = [
765 // x0_3·r2_1,
766 // x0_2·r2_2,
767 // x0_1·r2_3,
768 // x0_0·r2_4,
769 // ]
770 t0 = _mm256_mul_epu32(
771 // [_, x0_3, _, x0_2, _, x0_1, _, x0_0]
772 _mm256_permutevar8x32_epi32(x.v0.0, _mm256_set_epi64x(3, 2, 1, 0)),
773 // [_, r2_1, _, r2_2, _, r2_3, _, r2_4]
774 _mm256_permutevar8x32_epi32(r2, _mm256_set_epi64x(1, 2, 3, 4)),
775 );
776 // t1 = [
777 // x1_3·r1_1,
778 // x1_2·r1_2,
779 // x1_1·r1_3,
780 // x1_0·r1_4,
781 // ]
782 t1 = _mm256_mul_epu32(
783 // [_, x1_3, _, x1_2, _, x1_1, _, x1_0]
784 _mm256_permutevar8x32_epi32(x.v1.0, _mm256_set_epi64x(3, 2, 1, 0)),
785 // [_, r1_1, _, r1_2, _, r1_3, _, r1_4]
786 _mm256_permutevar8x32_epi32(r1, _mm256_set_epi64x(1, 2, 3, 4)),
787 );
788 // t0 = [
789 // x0_3·r2_1 + x1_3·r1_1,
790 // x0_2·r2_2 + x1_2·r1_2,
791 // x0_1·r2_3 + x1_1·r1_3,
792 // x0_0·r2_4 + x1_0·r1_4,
793 // ]
794 t0 = _mm256_add_epi64(t0, t1);
795 // t0 = [
796 // x0_3·r2_1 + x0_1·r2_3 + x1_3·r1_1 + x1_1·r1_3,
797 // x0_2·r2_2 + x0_0·r2_4 + x1_2·r1_2 + x1_0·r1_4,
798 // x0_3·r2_1 + x0_1·r2_3 + x1_3·r1_1 + x1_1·r1_3,
799 // x0_2·r2_2 + x0_0·r2_4 + x1_2·r1_2 + x1_0·r1_4,
800 // ]
801 t0 = _mm256_add_epi64(t0, _mm256_permute4x64_epi64(t0, set02(1, 0, 3, 2)));
802 // t0 = [
803 // x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
804 // x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
805 // x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
806 // x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
807 // ]
808 t0 = _mm256_add_epi64(t0, _mm256_permute4x64_epi64(t0, set02(2, 3, 0, 1)));
809
810 // v1 = [
811 // 5·x0_2·r2_3 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + 5·x1_2·r1_3 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
812 // x0_0·r2_4 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_0·r1_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
813 // x0_0·r2_2 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_0·r1_2 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
814 // x0_4·r2_0 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_4·r1_0 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
815 // ]
816 v1 = _mm256_add_epi64(v1, t0);
817
818 // The result:
819 // v1 = [
820 // _, _, _,
821 // x0_4·r2_0 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_4·r1_0 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4,
822 // ]
823 // v0 = [
824 // x0_3·r2_0 + x0_2·r2_1 + x0_1·r2_2 + x0_0·r2_3 + 5·x0_4·r2_4 + x1_3·r1_0 + x1_2·r1_1 + x1_1·r1_2 + x1_0·r1_3 + 5·x1_4·r1_4,
825 // x0_2·r2_0 + x0_1·r2_1 + x0_0·r2_2 + 5·x0_4·r2_3 + 5·x0_3·r2_4 + x1_2·r1_0 + x1_1·r1_1 + x1_0·r1_2 + 5·x1_4·r1_3 + 5·x1_3·r1_4,
826 // x0_1·r2_0 + x0_0·r2_1 + 5·x0_4·r2_2 + 5·x0_3·r2_3 + 5·x0_2·r2_4 + x1_1·r1_0 + x1_0·r1_1 + 5·x1_4·r1_2 + 5·x1_3·r1_3 + 5·x1_2·r1_4,
827 // x0_0·r2_0 + 5·x0_4·r2_1 + 5·x0_3·r2_2 + 5·x0_2·r2_3 + 5·x0_1·r2_4 + x1_0·r1_0 + 5·x1_4·r1_1 + 5·x1_3·r1_2 + 5·x1_2·r1_3 + 5·x1_1·r1_4,
828 // ]
829 Unreduced130 { v0, v1 }
830 }
831 }
832}
833
834impl Add<Aligned130> for Aligned2x130 {
835 type Output = Aligned2x130;
836
837 /// Adds `other` into the lower integer of `self`.
838 fn add(self, other: Aligned130) -> Aligned2x130 {
839 Aligned2x130 {
840 v0: self.v0 + other,
841 v1: self.v1,
842 }
843 }
844}
845
846/// A multiplier that takes 130-bit integers `(x3, x2, x1, x0)` and computes
847/// `(x3·R^4, x2·R^3, x1·R^2, x0·R) mod 2^130 - 5`.
848#[derive(Copy, Clone, Debug)]
849pub(super) struct SpacedMultiplier4x130 {
850 v0: __m256i,
851 v1: __m256i,
852 r1: PrecomputedMultiplier,
853}
854
855impl SpacedMultiplier4x130 {
856 /// Returns `(multipler, R^4)` given `(R^1, R^2)`.
857 #[target_feature(enable = "avx2")]
858 pub(super) unsafe fn new(
859 r1: PrecomputedMultiplier,
860 r2: PrecomputedMultiplier,
861 ) -> (Self, PrecomputedMultiplier) {
862 let r3 = (r2 * r1).reduce();
863 let r4 = (r2 * r2).reduce();
864
865 // v0 = [r2_4, r2_3, r2_1, r3_4, r3_3, r3_2, r3_1, r3_0]
866 let v0 = _mm256_blend_epi32(
867 r3.0,
868 _mm256_permutevar8x32_epi32(r2.a, _mm256_set_epi32(4, 3, 1, 0, 0, 0, 0, 0)),
869 0b11100000,
870 );
871
872 // v1 = [r2_4, r2_2, r2_0, r4_4, r4_3, r4_2, r4_1, r4_0]
873 let v1 = _mm256_blend_epi32(
874 r4.0,
875 _mm256_permutevar8x32_epi32(r2.a, _mm256_set_epi32(4, 2, 0, 0, 0, 0, 0, 0)),
876 0b11100000,
877 );
878
879 let m = SpacedMultiplier4x130 { v0, v1, r1 };
880
881 (m, r4.into())
882 }
883}
884
885/// Four 130-bit integers aligned across five 26-bit limbs each.
886///
887/// Unlike `Aligned2x130` which wraps two `Aligned130`s, this struct represents the four
888/// integers as 20 limbs spread across three 256-bit vectors.
889#[derive(Copy, Clone, Debug)]
890pub(super) struct Aligned4x130 {
891 v0: __m256i,
892 v1: __m256i,
893 v2: __m256i,
894}
895
896impl fmt::Display for Aligned4x130 {
897 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
898 let mut v0 = [0u8; 32];
899 let mut v1 = [0u8; 32];
900 let mut v2 = [0u8; 32];
901 unsafe {
902 _mm256_storeu_si256(v0.as_mut_ptr().cast(), self.v0);
903 _mm256_storeu_si256(v1.as_mut_ptr().cast(), self.v1);
904 _mm256_storeu_si256(v2.as_mut_ptr().cast(), self.v2);
905 }
906
907 writeln!(f, "Aligned4x130([")?;
908 write!(f, " ")?;
909 write_130(
910 f,
911 [
912 u32::from_le_bytes(v0[0..4].try_into().unwrap()),
913 u32::from_le_bytes(v1[0..4].try_into().unwrap()),
914 u32::from_le_bytes(v0[4..8].try_into().unwrap()),
915 u32::from_le_bytes(v1[4..8].try_into().unwrap()),
916 u32::from_le_bytes(v2[0..4].try_into().unwrap()),
917 ],
918 )?;
919 writeln!(f, ",")?;
920 write!(f, " ")?;
921 write_130(
922 f,
923 [
924 u32::from_le_bytes(v0[8..12].try_into().unwrap()),
925 u32::from_le_bytes(v1[8..12].try_into().unwrap()),
926 u32::from_le_bytes(v0[12..16].try_into().unwrap()),
927 u32::from_le_bytes(v1[12..16].try_into().unwrap()),
928 u32::from_le_bytes(v2[8..12].try_into().unwrap()),
929 ],
930 )?;
931 writeln!(f, ",")?;
932 write!(f, " ")?;
933 write_130(
934 f,
935 [
936 u32::from_le_bytes(v0[16..20].try_into().unwrap()),
937 u32::from_le_bytes(v1[16..20].try_into().unwrap()),
938 u32::from_le_bytes(v0[20..24].try_into().unwrap()),
939 u32::from_le_bytes(v1[20..24].try_into().unwrap()),
940 u32::from_le_bytes(v2[16..20].try_into().unwrap()),
941 ],
942 )?;
943 writeln!(f, ",")?;
944 write!(f, " ")?;
945 write_130(
946 f,
947 [
948 u32::from_le_bytes(v0[24..28].try_into().unwrap()),
949 u32::from_le_bytes(v1[24..28].try_into().unwrap()),
950 u32::from_le_bytes(v0[28..32].try_into().unwrap()),
951 u32::from_le_bytes(v1[28..32].try_into().unwrap()),
952 u32::from_le_bytes(v2[24..28].try_into().unwrap()),
953 ],
954 )?;
955 writeln!(f, ",")?;
956 write!(f, "])")
957 }
958}
959
960impl Aligned4x130 {
961 /// Aligns four 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and
962 /// sets the high bit for each block.
963 ///
964 /// # Panics
965 ///
966 /// Panics if `src.len() < 64`.
967 #[target_feature(enable = "avx2")]
968 pub(super) unsafe fn from_blocks(src: &[Block; 4]) -> Self {
969 let (lo, hi) = src.split_at(2);
970 let blocks_23 = _mm256_loadu_si256(hi.as_ptr().cast());
971 let blocks_01 = _mm256_loadu_si256(lo.as_ptr().cast());
972
973 Self::from_loaded_blocks(blocks_01, blocks_23)
974 }
975
976 /// Aligns four 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and
977 /// sets the high bit for each block.
978 #[target_feature(enable = "avx2")]
979 pub(super) unsafe fn from_par_blocks(src: &ParBlocks) -> Self {
980 let (lo, hi) = src.split_at(2);
981 let blocks_23 = _mm256_loadu_si256(hi.as_ptr().cast());
982 let blocks_01 = _mm256_loadu_si256(lo.as_ptr().cast());
983
984 Self::from_loaded_blocks(blocks_01, blocks_23)
985 }
986
987 /// Aligns four 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and
988 /// sets the high bit for each block.
989 ///
990 /// The four blocks must be in the following 32-bit word layout:
991 /// [b33, b32, b31, b30, b23, b22, b21, b20]
992 /// [b13, b12, b11, b10, b03, b02, b01, b00]
993 #[target_feature(enable = "avx2")]
994 unsafe fn from_loaded_blocks(blocks_01: __m256i, blocks_23: __m256i) -> Self {
995 // 26-bit mask on each 32-bit word.
996 let mask_26 = _mm256_set1_epi32(0x3ffffff);
997 // Sets bit 24 of each 32-bit word.
998 let set_hibit = _mm256_set1_epi32(1 << 24);
999
1000 // - Unpack the upper and lower 64 bits:
1001 // [b33, b32, b13, b12, b23, b22, b03, b02]
1002 // [b31, b30, b11, b10, b21, b20, b01, b00]
1003 //
1004 // - Swap the middle two 64-bit words:
1005 // a0 = [b33, b32, b23, b22, b13, b12, b03, b02]
1006 // a1 = [b31, b30, b21, b20, b11, b10, b01, b00]
1007 let a0 = _mm256_permute4x64_epi64(
1008 _mm256_unpackhi_epi64(blocks_01, blocks_23),
1009 set02(3, 1, 2, 0),
1010 );
1011 let a1 = _mm256_permute4x64_epi64(
1012 _mm256_unpacklo_epi64(blocks_01, blocks_23),
1013 set02(3, 1, 2, 0),
1014 );
1015
1016 // - Take the upper 24 bits of each 64-bit word in a0, and set the high bits:
1017 // v2 = [
1018 // [0; 7] || 1 || [0; 31] || 1 || b33[32..8],
1019 // [0; 7] || 1 || [0; 31] || 1 || b23[32..8],
1020 // [0; 7] || 1 || [0; 31] || 1 || b13[32..8],
1021 // [0; 7] || 1 || [0; 31] || 1 || b03[32..8],
1022 // ]
1023 let v2 = _mm256_or_si256(_mm256_srli_epi64(a0, 40), set_hibit);
1024
1025 // - Combine the lower 46 bits of each 64-bit word in a0 with the upper 18
1026 // bits of each 64-bit word in a1:
1027 // a2 = [
1028 // b33[14..0] || b32 || b31[32..14],
1029 // b23[14..0] || b22 || b21[32..14],
1030 // b13[14..0] || b12 || b11[32..14],
1031 // b03[14..0] || b02 || b01[32..14],
1032 // ]
1033 let a2 = _mm256_or_si256(_mm256_srli_epi64(a1, 46), _mm256_slli_epi64(a0, 18));
1034
1035 // - Take the upper 38 bits of each 64-bit word in a1:
1036 // [
1037 // [0; 26] || b31 || b30[32..26],
1038 // [0; 26] || b21 || b20[32..26],
1039 // [0; 26] || b11 || b10[32..26],
1040 // [0; 26] || b01 || b00[32..26],
1041 // ]
1042 // - Blend in a2 on 32-bit words with alternating [a2 a1 ..] control pattern:
1043 // [
1044 // b33[14..0] || b32[32..14] || b31[26..0] || b30[32..26],
1045 // b23[14..0] || b22[32..14] || b21[26..0] || b20[32..26],
1046 // b13[14..0] || b12[32..14] || b11[26..0] || b10[32..26],
1047 // b03[14..0] || b02[32..14] || b01[26..0] || b00[32..26],
1048 // ]
1049 // - Apply the 26-bit mask to each 32-bit word:
1050 // v1 = [
1051 // [0; 6] || b33[8..0] || b32[32..14] || [0; 6] || b31[20..0] || b30[32..26],
1052 // [0; 6] || b23[8..0] || b22[32..14] || [0; 6] || b21[20..0] || b20[32..26],
1053 // [0; 6] || b13[8..0] || b12[32..14] || [0; 6] || b11[20..0] || b10[32..26],
1054 // [0; 6] || b03[8..0] || b02[32..14] || [0; 6] || b01[20..0] || b00[32..26],
1055 // ]
1056 let v1 = _mm256_and_si256(
1057 _mm256_blend_epi32(_mm256_srli_epi64(a1, 26), a2, 0xAA),
1058 mask_26,
1059 );
1060
1061 // - Take the lower 38 bits of each 64-bit word in a2:
1062 // [
1063 // b32[20..0] || b31[32..14] || [0; 26],
1064 // b22[20..0] || b21[32..14] || [0; 26],
1065 // b12[20..0] || b11[32..14] || [0; 26],
1066 // b02[20..0] || b01[32..14] || [0; 26],
1067 // ]
1068 // - Blend in a1 on 32-bit words with alternating [a2 a1 ..] control pattern:
1069 // [
1070 // b32[20..0] || b31[32..20] || b30,
1071 // b22[20..0] || b21[32..20] || b20,
1072 // b12[20..0] || b11[32..20] || b10,
1073 // b02[20..0] || b01[32..20] || b00,
1074 // ]
1075 // - Apply the 26-bit mask to each 32-bit word:
1076 // v0 = [
1077 // [0; 6] || b32[14..0] || b31[32..20] || [0; 6] || b30[26..0],
1078 // [0; 6] || b22[14..0] || b21[32..20] || [0; 6] || b20[26..0],
1079 // [0; 6] || b12[14..0] || b11[32..20] || [0; 6] || b10[26..0],
1080 // [0; 6] || b02[14..0] || b01[32..20] || [0; 6] || b00[26..0],
1081 // ]
1082 let v0 = _mm256_and_si256(
1083 _mm256_blend_epi32(a1, _mm256_slli_epi64(a2, 26), 0xAA),
1084 mask_26,
1085 );
1086
1087 // The result:
1088 // v2 = [ v1 = [ v0 = [
1089 // [0; 7] || 1 || [0; 24], [0; 6] || b33[ 8..0] || b32[32..14], [0; 6] || b32[14..0] || b31[32..20],
1090 // [0; 7] || 1 || b33[32..8], [0; 6] || b31[20..0] || b30[32..26], [0; 6] || b30[26..0],
1091 // [0; 7] || 1 || [0; 24], [0; 6] || b23[ 8..0] || b22[32..14], [0; 6] || b22[14..0] || b21[32..20],
1092 // [0; 7] || 1 || b23[32..8], [0; 6] || b21[20..0] || b20[32..26], [0; 6] || b20[26..0],
1093 // [0; 7] || 1 || [0; 24], [0; 6] || b13[ 8..0] || b12[32..14], [0; 6] || b12[14..0] || b11[32..20],
1094 // [0; 7] || 1 || b13[32..8], [0; 6] || b11[20..0] || b10[32..26], [0; 6] || b10[26..0],
1095 // [0; 7] || 1 || [0; 24], [0; 6] || b03[ 8..0] || b02[32..14], [0; 6] || b02[14..0] || b01[32..20],
1096 // [0; 7] || 1 || b03[32..8], [0; 6] || b01[20..0] || b00[32..26], [0; 6] || b00[26..0],
1097 // ] ] ]
1098 Aligned4x130 { v0, v1, v2 }
1099 }
1100}
1101
1102impl Add<Aligned4x130> for Aligned4x130 {
1103 type Output = Aligned4x130;
1104
1105 #[inline(always)]
1106 fn add(self, other: Aligned4x130) -> Aligned4x130 {
1107 // With 26-bit limbs inside 32-bit words, there is plenty of space for unreduced
1108 // addition.
1109 unsafe {
1110 Aligned4x130 {
1111 v0: _mm256_add_epi32(self.v0, other.v0),
1112 v1: _mm256_add_epi32(self.v1, other.v1),
1113 v2: _mm256_add_epi32(self.v2, other.v2),
1114 }
1115 }
1116 }
1117}
1118
1119impl Mul<PrecomputedMultiplier> for &Aligned4x130 {
1120 type Output = Unreduced4x130;
1121
1122 #[inline(always)]
1123 fn mul(self, other: PrecomputedMultiplier) -> Unreduced4x130 {
1124 unsafe {
1125 // Starting with the following limb layout:
1126 // x.v2 = [ _, x34, _, x24, _, x14, _, x04]
1127 // x.v1 = [ x33, x31, x23, x21, x13, x11, x03, x01]
1128 // x.v0 = [ x32, x30, x22, x20, x12, x10, x02, x00]
1129 // y = [5·r_4, 5·r_3, 5·r_2, r_4, r_3, r_2, r_1, r_0]
1130 // z = [5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1]
1131 let mut x = *self;
1132 let y = other.a;
1133 let z = other.a_5;
1134
1135 // Prepare a permutation that swaps the two limbs within a 64-bit window.
1136 let ord = _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1);
1137
1138 // t0 = [r_1, r_0, r_1, r_0, r_1, r_0, r_1, r_0] -> ·r_0
1139 // t1 = [r_3, r_2, r_3, r_2, r_3, r_2, r_3, r_2] -> ·r_2
1140 let mut t0 = _mm256_permute4x64_epi64(y, set02(0, 0, 0, 0));
1141 let mut t1 = _mm256_permute4x64_epi64(y, set02(1, 1, 1, 1));
1142
1143 // v0 = [x30·r_0, x20·r_0, x10·r_0, x00·r_0]
1144 // v1 = [x31·r_0, x21·r_0, x11·r_0, x01·r_0]
1145 // v4 = [x34·r_0, x24·r_0, x14·r_0, x04·r_0]
1146 // v2 = [x30·r_2, x20·r_2, x10·r_2, x00·r_2]
1147 // v3 = [x31·r_2, x21·r_2, x11·r_2, x01·r_2]
1148 let mut v0 = _mm256_mul_epu32(x.v0, t0); // xN0·r_0
1149 let mut v1 = _mm256_mul_epu32(x.v1, t0); // xN1·r_0
1150 let mut v4 = _mm256_mul_epu32(x.v2, t0); // xN4·r_0
1151 let mut v2 = _mm256_mul_epu32(x.v0, t1); // xN0·r_2
1152 let mut v3 = _mm256_mul_epu32(x.v1, t1); // xN1·r_2
1153
1154 // t0 = [r_0, r_1, r_0, r_1, r_0, r_1, r_0, r_1] -> ·r_1
1155 // t1 = [r_2, r_3, r_2, r_3, r_2, r_3, r_2, r_3] -> ·r_3
1156 t0 = _mm256_permutevar8x32_epi32(t0, ord);
1157 t1 = _mm256_permutevar8x32_epi32(t1, ord);
1158
1159 // v1 = [x31·r_0 + x30·r_1, x21·r_0 + x20·r_1, x11·r_0 + x10·r_1, x01·r_0 + x00·r_1]
1160 // v2 = [x31·r_1 + x30·r_2, x21·r_1 + x20·r_2, x11·r_1 + x10·r_2, x01·r_1 + x00·r_2]
1161 // v3 = [x31·r_2 + x30·r_3, x21·r_2 + x20·r_3, x11·r_2 + x10·r_3, x01·r_2 + x00·r_3]
1162 // v4 = [x34·r_0 + x31·r_3, x24·r_0 + x21·r_3, x14·r_0 + x11·r_3, x04·r_0 + x01·r_3]
1163 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v0, t0)); // + xN0·r_1
1164 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v1, t0)); // + xN1·r_1
1165 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v0, t1)); // + xN0·r_3
1166 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v1, t1)); // + xN1·r_3
1167
1168 // t2 = [5·r_2, r_4, 5·r_2, r_4, 5·r_2, r_4, 5·r_2, r_4] -> ·r_4
1169 let mut t2 = _mm256_permute4x64_epi64(y, set02(2, 2, 2, 2));
1170
1171 // v4 = [
1172 // x34·r_0 + x31·r_3 + x30·r_4,
1173 // x24·r_0 + x21·r_3 + x20·r_4,
1174 // x14·r_0 + x11·r_3 + x10·r_4,
1175 // x04·r_0 + x01·r_3 + x00·r_4,
1176 // ]
1177 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v0, t2)); // + xN0·r_4
1178
1179 // x.v0 = [x30, x32, x20, x22, x10, x12, x00, x02]
1180 // x.v1 = [x31, x33, x21, x23, x11, x13, x01, x03]
1181 // t2 = [r_4, 5·r_2, r_4, 5·r_2, r_4, 5·r_2, r_4, 5·r_2] -> ·5·r_2
1182 x.v0 = _mm256_permutevar8x32_epi32(x.v0, ord);
1183 x.v1 = _mm256_permutevar8x32_epi32(x.v1, ord);
1184 t2 = _mm256_permutevar8x32_epi32(t2, ord);
1185
1186 // v0 = [
1187 // x30·r_0 + 5·x33·r_2,
1188 // x20·r_0 + 5·x23·r_2,
1189 // x10·r_0 + 5·x13·r_2,
1190 // x00·r_0 + 5·x03·r_2,
1191 // ]
1192 // v1 = [
1193 // x31·r_0 + x30·r_1 + 5·x34·r_2,
1194 // x21·r_0 + x20·r_1 + 5·x24·r_2,
1195 // x11·r_0 + x10·r_1 + 5·x14·r_2,
1196 // x01·r_0 + x00·r_1 + 5·x04·r_2,
1197 // ]
1198 // v3 = [
1199 // x32·r_1 + x31·r_2 + x30·r_3,
1200 // x22·r_1 + x21·r_2 + x20·r_3,
1201 // x12·r_1 + x11·r_2 + x10·r_3,
1202 // x02·r_1 + x01·r_2 + x00·r_3,
1203 // ]
1204 // v4 = [
1205 // x34·r_0 + x33·r_1 + x31·r_3 + x30·r_4,
1206 // x24·r_0 + x23·r_1 + x21·r_3 + x20·r_4,
1207 // x14·r_0 + x13·r_1 + x11·r_3 + x10·r_4,
1208 // x04·r_0 + x03·r_1 + x01·r_3 + x00·r_4,
1209 // ]
1210 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v1, t2)); // + 5·xN3·r_2
1211 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v2, t2)); // + 5·xN4·r_2
1212 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v0, t0)); // + xN2·r_1
1213 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v1, t0)); // + xN3·r_1
1214
1215 // t0 = [r_1, r_0, r_1, r_0, r_1, r_0, r_1, r_0] -> ·r_0
1216 // t1 = [r_3, r_2, r_3, r_2, r_3, r_2, r_3, r_2] -> ·r_2
1217 t0 = _mm256_permutevar8x32_epi32(t0, ord);
1218 t1 = _mm256_permutevar8x32_epi32(t1, ord);
1219
1220 // v2 = [
1221 // x32·r_0 + x31·r_1 + x30·r_2,
1222 // x22·r_0 + x21·r_1 + x20·r_2,
1223 // x12·r_0 + x11·r_1 + x10·r_2,
1224 // x02·r_0 + x01·r_1 + x00·r_2,
1225 // ]
1226 // v3 = [
1227 // x33·r_0 + x32·r_1 + x31·r_2 + x30·r_3,
1228 // x23·r_0 + x22·r_1 + x21·r_2 + x20·r_3,
1229 // x13·r_0 + x12·r_1 + x11·r_2 + x10·r_3,
1230 // x03·r_0 + x02·r_1 + x01·r_2 + x00·r_3,
1231 // ]
1232 // v4 = [
1233 // x34·r_0 + x33·r_1 + x32·r_2 + x31·r_3 + x30·r_4,
1234 // x24·r_0 + x23·r_1 + x22·r_2 + x21·r_3 + x20·r_4,
1235 // x14·r_0 + x13·r_1 + x12·r_2 + x11·r_3 + x10·r_4,
1236 // x04·r_0 + x03·r_1 + x02·r_2 + x01·r_3 + x00·r_4,
1237 // ]
1238 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v0, t0)); // + xN2·r_0
1239 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v1, t0)); // + xN3·r_0
1240 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v0, t1)); // + xN2·r_2
1241
1242 // t0 = [5·r_4, 5·r_3, 5·r_4, 5·r_3, 5·r_4, 5·r_3, 5·r_4, 5·r_3] -> ·5·r_3
1243 t0 = _mm256_permute4x64_epi64(y, set02(3, 3, 3, 3));
1244
1245 // v0 = [
1246 // x30·r_0 + 5·x33·r_2 + 5·x32·r_3,
1247 // x20·r_0 + 5·x23·r_2 + 5·x22·r_3,
1248 // x10·r_0 + 5·x13·r_2 + 5·x12·r_3,
1249 // x00·r_0 + 5·x03·r_2 + 5·x02·r_3,
1250 // ]
1251 // v1 = [
1252 // x31·r_0 + x30·r_1 + 5·x34·r_2 + 5·x33·r_3,
1253 // x21·r_0 + x20·r_1 + 5·x24·r_2 + 5·x23·r_3,
1254 // x11·r_0 + x10·r_1 + 5·x14·r_2 + 5·x13·r_3,
1255 // x01·r_0 + x00·r_1 + 5·x04·r_2 + 5·x03·r_3,
1256 // ]
1257 // v2 = [
1258 // x32·r_0 + x31·r_1 + x30·r_2 + 5·x34·r_3,
1259 // x22·r_0 + x21·r_1 + x20·r_2 + 5·x24·r_3,
1260 // x12·r_0 + x11·r_1 + x10·r_2 + 5·x14·r_3,
1261 // x02·r_0 + x01·r_1 + x00·r_2 + 5·x04·r_3,
1262 // ]
1263 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v0, t0)); // + 5·xN2·r_3
1264 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v1, t0)); // + 5·xN3·r_3
1265 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v2, t0)); // + 5·xN4·r_3
1266
1267 // t0 = [5·r_3, 5·r_4, 5·r_3, 5·r_4, 5·r_3, 5·r_4, 5·r_3, 5·r_4] -> ·5·r_4
1268 t0 = _mm256_permutevar8x32_epi32(t0, ord);
1269
1270 // v1 = [
1271 // x31·r_0 + x30·r_1 + 5·x34·r_2 + 5·x33·r_3 + 5·x32·r_4,
1272 // x21·r_0 + x20·r_1 + 5·x24·r_2 + 5·x23·r_3 + 5·x22·r_4,
1273 // x11·r_0 + x10·r_1 + 5·x14·r_2 + 5·x13·r_3 + 5·x12·r_4,
1274 // x01·r_0 + x00·r_1 + 5·x04·r_2 + 5·x03·r_3 + 5·x02·r_4,
1275 // ]
1276 // v2 = [
1277 // x32·r_0 + x31·r_1 + x30·r_2 + 5·x34·r_3 + 5·x33·r_4,
1278 // x22·r_0 + x21·r_1 + x20·r_2 + 5·x24·r_3 + 5·x23·r_4,
1279 // x12·r_0 + x11·r_1 + x10·r_2 + 5·x14·r_3 + 5·x13·r_4,
1280 // x02·r_0 + x01·r_1 + x00·r_2 + 5·x04·r_3 + 5·x03·r_4,
1281 // ]
1282 // v3 = [
1283 // x33·r_0 + x32·r_1 + x31·r_2 + x30·r_3 + 5·x34·r_4,
1284 // x23·r_0 + x22·r_1 + x21·r_2 + x20·r_3 + 5·x24·r_4,
1285 // x13·r_0 + x12·r_1 + x11·r_2 + x10·r_3 + 5·x14·r_4,
1286 // x03·r_0 + x02·r_1 + x01·r_2 + x00·r_3 + 5·x04·r_4,
1287 // ]
1288 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v0, t0)); // + 5·xN2·r_4
1289 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v1, t0)); // + 5·xN3·r_4
1290 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v2, t0)); // + 5·xN4·r_4
1291
1292 // x.v1 = [x33, x31, x23, x21, x13, x11, x03, x01]
1293 x.v1 = _mm256_permutevar8x32_epi32(x.v1, ord);
1294
1295 // v0 = [
1296 // x30·r_0 + 5·x34·r_1 + 5·x33·r_2 + 5·x32·r_3 + 5·x31·r_4,
1297 // x20·r_0 + 5·x24·r_1 + 5·x23·r_2 + 5·x22·r_3 + 5·x21·r_4,
1298 // x10·r_0 + 5·x14·r_1 + 5·x13·r_2 + 5·x12·r_3 + 5·x11·r_4,
1299 // x00·r_0 + 5·x04·r_1 + 5·x03·r_2 + 5·x02·r_3 + 5·x01·r_4,
1300 // ]
1301 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v1, t0)); // + 5·xN1·r_4
1302 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v2, z)); // + 5·xN4·r_1
1303
1304 // The result:
1305 // v4 = [
1306 // x34·r_0 + x33·r_1 + x32·r_2 + x31·r_3 + x30·r_4,
1307 // x24·r_0 + x23·r_1 + x22·r_2 + x21·r_3 + x20·r_4,
1308 // x14·r_0 + x13·r_1 + x12·r_2 + x11·r_3 + x10·r_4,
1309 // x04·r_0 + x03·r_1 + x02·r_2 + x01·r_3 + x00·r_4,
1310 // ]
1311 // v3 = [
1312 // x33·r_0 + x32·r_1 + x31·r_2 + x30·r_3 + 5·x34·r_4,
1313 // x23·r_0 + x22·r_1 + x21·r_2 + x20·r_3 + 5·x24·r_4,
1314 // x13·r_0 + x12·r_1 + x11·r_2 + x10·r_3 + 5·x14·r_4,
1315 // x03·r_0 + x02·r_1 + x01·r_2 + x00·r_3 + 5·x04·r_4,
1316 // ]
1317 // v2 = [
1318 // x32·r_0 + x31·r_1 + x30·r_2 + 5·x34·r_3 + 5·x33·r_4,
1319 // x22·r_0 + x21·r_1 + x20·r_2 + 5·x24·r_3 + 5·x23·r_4,
1320 // x12·r_0 + x11·r_1 + x10·r_2 + 5·x14·r_3 + 5·x13·r_4,
1321 // x02·r_0 + x01·r_1 + x00·r_2 + 5·x04·r_3 + 5·x03·r_4,
1322 // ]
1323 // v1 = [
1324 // x31·r_0 + x30·r_1 + 5·x34·r_2 + 5·x33·r_3 + 5·x32·r_4,
1325 // x21·r_0 + x20·r_1 + 5·x24·r_2 + 5·x23·r_3 + 5·x22·r_4,
1326 // x11·r_0 + x10·r_1 + 5·x14·r_2 + 5·x13·r_3 + 5·x12·r_4,
1327 // x01·r_0 + x00·r_1 + 5·x04·r_2 + 5·x03·r_3 + 5·x02·r_4,
1328 // ]
1329 // v0 = [
1330 // x30·r_0 + 5·x34·r_1 + 5·x33·r_2 + 5·x32·r_3 + 5·x31·r_4,
1331 // x20·r_0 + 5·x24·r_1 + 5·x23·r_2 + 5·x22·r_3 + 5·x21·r_4,
1332 // x10·r_0 + 5·x14·r_1 + 5·x13·r_2 + 5·x12·r_3 + 5·x11·r_4,
1333 // x00·r_0 + 5·x04·r_1 + 5·x03·r_2 + 5·x02·r_3 + 5·x01·r_4,
1334 // ]
1335 Unreduced4x130 { v0, v1, v2, v3, v4 }
1336 }
1337 }
1338}
1339
1340impl Mul<SpacedMultiplier4x130> for Aligned4x130 {
1341 type Output = Unreduced4x130;
1342
1343 #[inline(always)]
1344 fn mul(self, m: SpacedMultiplier4x130) -> Unreduced4x130 {
1345 unsafe {
1346 // Starting with the following limb layout:
1347 // x.v2 = [ _, x34, _, x24, _, x14, _, x04]
1348 // x.v1 = [ x33, x31, x23, x21, x13, x11, x03, x01]
1349 // x.v0 = [ x32, x30, x22, x20, x12, x10, x02, x00]
1350 // m.v1 = [ r2_4, r2_2, r2_0, r4_4, r4_3, r4_2, r4_1, r4_0]
1351 // m.v0 = [ r2_4, r2_3, r2_1, r3_4, r3_3, r3_2, r3_1, r3_0]
1352 // r1 = [5·r1_4, 5·r1_3, 5·r1_2, r1_4, r1_3, r1_2, r1_1, r1_0]
1353 let mut x = self;
1354 let r1 = m.r1.a;
1355
1356 // v0 = [r2_0, r2_1, r4_4, r3_4, r4_1, r3_1, r4_0, r3_0]
1357 // v1 = [r2_4, r2_4, r2_2, r2_3, r4_3, r3_3, r4_2, r3_2]
1358 let v0 = _mm256_unpacklo_epi32(m.v0, m.v1);
1359 let v1 = _mm256_unpackhi_epi32(m.v0, m.v1);
1360
1361 // m_r_0 = [r1_1, r1_0, r2_1, r2_0, r3_1, r3_0, r4_1, r4_0] -> ·rN_0
1362 // m_r_2 = [r1_3, r1_2, r2_3, r2_2, r3_3, r3_2, r4_3, r4_2] -> ·rN_2
1363 // m_r_4 = [r1_1, r1_4, r2_1, r2_4, r3_1, r3_4, r4_1, r4_4] -> ·rN_4
1364 let ord = _mm256_set_epi32(1, 0, 6, 7, 2, 0, 3, 1);
1365 let m_r_0 = _mm256_blend_epi32(
1366 _mm256_permutevar8x32_epi32(r1, ord),
1367 _mm256_permutevar8x32_epi32(v0, ord),
1368 0b00111111,
1369 );
1370 let ord = _mm256_set_epi32(3, 2, 4, 5, 2, 0, 3, 1);
1371 let m_r_2 = _mm256_blend_epi32(
1372 _mm256_permutevar8x32_epi32(r1, ord),
1373 _mm256_permutevar8x32_epi32(v1, ord),
1374 0b00111111,
1375 );
1376 let ord = _mm256_set_epi32(1, 4, 6, 6, 2, 4, 3, 5);
1377 let m_r_4 = _mm256_blend_epi32(
1378 _mm256_blend_epi32(
1379 _mm256_permutevar8x32_epi32(r1, ord),
1380 _mm256_permutevar8x32_epi32(v1, ord),
1381 0b00010000,
1382 ),
1383 _mm256_permutevar8x32_epi32(v0, ord),
1384 0b00101111,
1385 );
1386
1387 // v0 = [x30·r1_0, x20·r2_0, x10·r3_0, x00·r4_0]
1388 // v1 = [x31·r1_0, x21·r2_0, x11·r3_0, x01·r4_0]
1389 // v2 = [x30·r1_2, x20·r2_2, x10·r3_2, x00·r4_2]
1390 // v3 = [x31·r1_2, x21·r2_2, x11·r3_2, x01·r4_2]
1391 // v4 = [x30·r1_4, x20·r2_4, x10·r3_4, x00·r4_4]
1392 let mut v0 = _mm256_mul_epu32(x.v0, m_r_0); // xM0·rN_0
1393 let mut v1 = _mm256_mul_epu32(x.v1, m_r_0); // xM1·rN_0
1394 let mut v2 = _mm256_mul_epu32(x.v0, m_r_2); // xM0·rN_2
1395 let mut v3 = _mm256_mul_epu32(x.v1, m_r_2); // xM1·rN_2
1396 let mut v4 = _mm256_mul_epu32(x.v0, m_r_4); // xM0·rN_4
1397
1398 // m_r_1 = [r1_0, r1_1, r2_0, r2_1, r3_0, r3_1, r4_0, r4_1] -> ·rN_1
1399 // m_r_3 = [r1_2, r1_3, r2_2, r2_3, r3_2, r3_3, r4_2, r4_3] -> ·rN_3
1400 let ord = _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1);
1401 let m_r_1 = _mm256_permutevar8x32_epi32(m_r_0, ord);
1402 let m_r_3 = _mm256_permutevar8x32_epi32(m_r_2, ord);
1403
1404 // v1 = [
1405 // x31·r1_0 + x30·r1_1,
1406 // x21·r2_0 + x20·r2_1,
1407 // x11·r3_0 + x10·r3_1,
1408 // x01·r4_0 + x00·r4_1,
1409 // ]
1410 // v2 = [
1411 // x31·r1_1 + x30·r1_2,
1412 // x21·r2_1 + x20·r2_2,
1413 // x11·r3_1 + x10·r3_2,
1414 // x01·r4_1 + x00·r4_2,
1415 // ]
1416 // v3 = [
1417 // x31·r1_2 + x30·r1_3,
1418 // x21·r2_2 + x20·r2_3,
1419 // x11·r3_2 + x10·r3_3,
1420 // x01·r4_2 + x00·r4_3,
1421 // ]
1422 // v4 = [
1423 // x34·r1_0 + x31·r1_3 + x30·r1_4,
1424 // x24·r2_0 + x21·r2_3 + x20·r2_4,
1425 // x14·r3_0 + x11·r3_3 + x10·r3_4,
1426 // x04·r4_0 + x01·r4_3 + x00·r4_4,
1427 // ]
1428 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v0, m_r_1)); // + xM0·rN_1
1429 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v1, m_r_1)); // + xM1·rN_1
1430 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v0, m_r_3)); // + xM0·rN_3
1431 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v1, m_r_3)); // + xM1·rN_3
1432 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v2, m_r_0)); // + xM4·rN_0
1433
1434 // x.v0 = [x30, x32, x20, x22, x10, x12, x00, x02]
1435 x.v0 = _mm256_permutevar8x32_epi32(x.v0, ord);
1436
1437 // v2 = [
1438 // x32·r1_0 + x31·r1_1 + x30·r1_2,
1439 // x22·r2_0 + x21·r2_1 + x20·r2_2,
1440 // x12·r3_0 + x11·r3_1 + x10·r3_2,
1441 // x02·r4_0 + x01·r4_1 + x00·r4_2,
1442 // ]
1443 // v3 = [
1444 // x32·r1_1 + x31·r1_2 + x30·r1_3,
1445 // x22·r2_1 + x21·r2_2 + x20·r2_3,
1446 // x12·r3_1 + x11·r3_2 + x10·r3_3,
1447 // x02·r4_1 + x01·r4_2 + x00·r4_3,
1448 // ]
1449 // v4 = [
1450 // x34·r1_0 + x32·r1_2 + x31·r1_3 + x30·r1_4,
1451 // x24·r2_0 + x22·r2_2 + x21·r2_3 + x20·r2_4,
1452 // x14·r3_0 + x12·r3_2 + x11·r3_3 + x10·r3_4,
1453 // x04·r4_0 + x02·r4_2 + x01·r4_3 + x00·r4_4,
1454 // ]
1455 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v0, m_r_0)); // + xM2·rN_0
1456 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v0, m_r_1)); // + xM2·rN_1
1457 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v0, m_r_2)); // + xM2·rN_2
1458
1459 // m_5r_3 = [5·r1_2, 5·r1_3, 5·r2_2, 5·r2_3, 5·r3_2, 5·r3_3, 5·r4_2, 5·r4_3] -> ·5·rN_3
1460 // m_5r_4 = [5·r1_1, 5·r1_4, 5·r2_1, 5·r2_4, 5·r3_1, 5·r3_4, 5·r4_1, 5·r4_4] -> ·5·rN_4
1461 let m_5r_3 = _mm256_add_epi32(m_r_3, _mm256_slli_epi32(m_r_3, 2));
1462 let m_5r_4 = _mm256_add_epi32(m_r_4, _mm256_slli_epi32(m_r_4, 2));
1463
1464 // v0 = [
1465 // x30·r1_0 + 5·x32·r1_3 + 5·x31·r1_4,
1466 // x20·r2_0 + 5·x22·r2_3 + 5·x21·r2_4,
1467 // x10·r3_0 + 5·x12·r3_3 + 5·x11·r3_4,
1468 // x00·r4_0 + 5·x02·r4_3 + 5·x01·r4_4,
1469 // ]
1470 // v1 = [
1471 // x31·r1_0 + x30·r1_1 + 5·x32·r1_4,
1472 // x21·r2_0 + x20·r2_1 + 5·x22·r2_4,
1473 // x11·r3_0 + x10·r3_1 + 5·x12·r3_4,
1474 // x01·r4_0 + x00·r4_1 + 5·x02·r4_4,
1475 // ]
1476 // v2 = [
1477 // x32·r1_0 + x31·r1_1 + x30·r1_2 + 5·x34·r1_3,
1478 // x22·r2_0 + x21·r2_1 + x20·r2_2 + 5·x24·r2_3,
1479 // x12·r3_0 + x11·r3_1 + x10·r3_2 + 5·x14·r3_3,
1480 // x02·r4_0 + x01·r4_1 + x00·r4_2 + 5·x04·r4_3,
1481 // ]
1482 // v3 = [
1483 // x32·r1_1 + x31·r1_2 + x30·r1_3 + 5·x34·r1_4,
1484 // x22·r2_1 + x21·r2_2 + x20·r2_3 + 5·x24·r2_4,
1485 // x12·r3_1 + x11·r3_2 + x10·r3_3 + 5·x14·r3_4,
1486 // x02·r4_1 + x01·r4_2 + x00·r4_3 + 5·x04·r4_4,
1487 // ]
1488 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v0, m_5r_3)); // + 5·xM2·rN_3
1489 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v1, m_5r_4)); // + 5·xM1·rN_4
1490 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v0, m_5r_4)); // + 5·xM2·rN_4
1491 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v2, m_5r_3)); // + 5·xM4·rN_3
1492 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v2, m_5r_4)); // + 5·xM4·rN_4
1493
1494 // x.v1 = [x31, x33, x21, x23, x11, x13, x01, x03]
1495 x.v1 = _mm256_permutevar8x32_epi32(x.v1, ord);
1496
1497 // v1 = [
1498 // x31·r1_0 + x30·r1_1 + 5·x33·r1_3 + 5·x32·r1_4,
1499 // x21·r2_0 + x20·r2_1 + 5·x23·r2_3 + 5·x22·r2_4,
1500 // x11·r3_0 + x10·r3_1 + 5·x13·r3_3 + 5·x12·r3_4,
1501 // x01·r4_0 + x00·r4_1 + 5·x03·r4_3 + 5·x02·r4_4,
1502 // ]
1503 // v2 = [
1504 // x32·r1_0 + x31·r1_1 + x30·r1_2 + 5·x34·r1_3 + 5·x33·r1_4,
1505 // x22·r2_0 + x21·r2_1 + x20·r2_2 + 5·x24·r2_3 + 5·x23·r2_4,
1506 // x12·r3_0 + x11·r3_1 + x10·r3_2 + 5·x14·r3_3 + 5·x13·r3_4,
1507 // x02·r4_0 + x01·r4_1 + x00·r4_2 + 5·x04·r4_3 + 5·x03·r4_4,
1508 // ]
1509 // v3 = [
1510 // x33·r1_0 + x32·r1_1 + x31·r1_2 + x30·r1_3 + 5·x34·r1_4,
1511 // x23·r2_0 + x22·r2_1 + x21·r2_2 + x20·r2_3 + 5·x24·r2_4,
1512 // x13·r3_0 + x12·r3_1 + x11·r3_2 + x10·r3_3 + 5·x14·r3_4,
1513 // x03·r4_0 + x02·r4_1 + x01·r4_2 + x00·r4_3 + 5·x04·r4_4,
1514 // ]
1515 // v4 = [
1516 // x34·r1_0 + x33·r1_1 + x32·r1_2 + x31·r1_3 + x30·r1_4,
1517 // x24·r2_0 + x23·r2_1 + x22·r2_2 + x21·r2_3 + x20·r2_4,
1518 // x14·r3_0 + x13·r3_1 + x12·r3_2 + x11·r3_3 + x10·r3_4,
1519 // x04·r4_0 + x03·r4_1 + x02·r4_2 + x01·r4_3 + x00·r4_4,
1520 // ]
1521 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v1, m_5r_3)); // + 5·xM3·rN_3
1522 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v1, m_5r_4)); // + 5·xM3·rN_4
1523 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v1, m_r_0)); // + xM3·rN_0
1524 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v1, m_r_1)); // + xM3·rN_1
1525
1526 // m_5r_1 = [5·r1_4, 5·r1_1, 5·r2_4, 5·r2_1, 5·r3_4, 5·r3_1, 5·r4_4, 5·r4_1] -> ·5·rN_1
1527 // m_5r_2 = [5·r1_3, 5·r1_2, 5·r2_3, 5·r2_2, 5·r3_3, 5·r3_2, 5·r4_3, 5·r4_2] -> ·5·rN_2
1528 let m_5r_1 = _mm256_permutevar8x32_epi32(m_5r_4, ord);
1529 let m_5r_2 = _mm256_permutevar8x32_epi32(m_5r_3, ord);
1530
1531 // v0 = [
1532 // x30·r1_0 + 5·x34·r1_1 + 5·x33·r1_2 + 5·x32·r1_3 + 5·x31·r1_4,
1533 // x20·r2_0 + 5·x24·r2_1 + 5·x23·r2_2 + 5·x22·r2_3 + 5·x21·r2_4,
1534 // x10·r3_0 + 5·x14·r3_1 + 5·x13·r3_2 + 5·x12·r3_3 + 5·x11·r3_4,
1535 // x00·r4_0 + 5·x04·r4_1 + 5·x03·r4_2 + 5·x02·r4_3 + 5·x01·r4_4,
1536 // ]
1537 // v1 = [
1538 // x31·r1_0 + x30·r1_1 + 5·x34·r1_2 + 5·x33·r1_3 + 5·x32·r1_4,
1539 // x21·r2_0 + x20·r2_1 + 5·x24·r2_2 + 5·x23·r2_3 + 5·x22·r2_4,
1540 // x11·r3_0 + x10·r3_1 + 5·x14·r3_2 + 5·x13·r3_3 + 5·x12·r3_4,
1541 // x01·r4_0 + x00·r4_1 + 5·x04·r4_2 + 5·x03·r4_3 + 5·x02·r4_4,
1542 // ]
1543 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v1, m_5r_2)); // + 5·xM3·rN_2
1544 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v2, m_5r_1)); // + 5·xM4·rN_1
1545 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v2, m_5r_2)); // + 5·xM4·rN_2
1546
1547 // The result:
1548 // v4 = [
1549 // x34·r1_0 + x33·r1_1 + x32·r1_2 + x31·r1_3 + x30·r1_4,
1550 // x24·r2_0 + x23·r2_1 + x22·r2_2 + x21·r2_3 + x20·r2_4,
1551 // x14·r3_0 + x13·r3_1 + x12·r3_2 + x11·r3_3 + x10·r3_4,
1552 // x04·r4_0 + x03·r4_1 + x02·r4_2 + x01·r4_3 + x00·r4_4,
1553 // ]
1554 // v3 = [
1555 // x33·r1_0 + x32·r1_1 + x31·r1_2 + x30·r1_3 + 5·x34·r1_4,
1556 // x23·r2_0 + x22·r2_1 + x21·r2_2 + x20·r2_3 + 5·x24·r2_4,
1557 // x13·r3_0 + x12·r3_1 + x11·r3_2 + x10·r3_3 + 5·x14·r3_4,
1558 // x03·r4_0 + x02·r4_1 + x01·r4_2 + x00·r4_3 + 5·x04·r4_4,
1559 // ]
1560 // v2 = [
1561 // x32·r1_0 + x31·r1_1 + x30·r1_2 + 5·x34·r1_3 + 5·x33·r1_4,
1562 // x22·r2_0 + x21·r2_1 + x20·r2_2 + 5·x24·r2_3 + 5·x23·r2_4,
1563 // x12·r3_0 + x11·r3_1 + x10·r3_2 + 5·x14·r3_3 + 5·x13·r3_4,
1564 // x02·r4_0 + x01·r4_1 + x00·r4_2 + 5·x04·r4_3 + 5·x03·r4_4,
1565 // ]
1566 // v1 = [
1567 // x31·r1_0 + x30·r1_1 + 5·x34·r1_2 + 5·x33·r1_3 + 5·x32·r1_4,
1568 // x21·r2_0 + x20·r2_1 + 5·x24·r2_2 + 5·x23·r2_3 + 5·x22·r2_4,
1569 // x11·r3_0 + x10·r3_1 + 5·x14·r3_2 + 5·x13·r3_3 + 5·x12·r3_4,
1570 // x01·r4_0 + x00·r4_1 + 5·x04·r4_2 + 5·x03·r4_3 + 5·x02·r4_4,
1571 // ]
1572 // v0 = [
1573 // x30·r1_0 + 5·x34·r1_1 + 5·x33·r1_2 + 5·x32·r1_3 + 5·x31·r1_4,
1574 // x20·r2_0 + 5·x24·r2_1 + 5·x23·r2_2 + 5·x22·r2_3 + 5·x21·r2_4,
1575 // x10·r3_0 + 5·x14·r3_1 + 5·x13·r3_2 + 5·x12·r3_3 + 5·x11·r3_4,
1576 // x00·r4_0 + 5·x04·r4_1 + 5·x03·r4_2 + 5·x02·r4_3 + 5·x01·r4_4,
1577 // ]
1578 Unreduced4x130 { v0, v1, v2, v3, v4 }
1579 }
1580 }
1581}
1582
1583/// The unreduced output of an Aligned4x130 multiplication.
1584#[derive(Clone, Debug)]
1585pub(super) struct Unreduced4x130 {
1586 v0: __m256i,
1587 v1: __m256i,
1588 v2: __m256i,
1589 v3: __m256i,
1590 v4: __m256i,
1591}
1592
1593impl fmt::Display for Unreduced4x130 {
1594 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1595 let mut v0 = [0u8; 32];
1596 let mut v1 = [0u8; 32];
1597 let mut v2 = [0u8; 32];
1598 let mut v3 = [0u8; 32];
1599 let mut v4 = [0u8; 32];
1600 unsafe {
1601 _mm256_storeu_si256(v0.as_mut_ptr().cast(), self.v0);
1602 _mm256_storeu_si256(v1.as_mut_ptr().cast(), self.v1);
1603 _mm256_storeu_si256(v2.as_mut_ptr().cast(), self.v2);
1604 _mm256_storeu_si256(v3.as_mut_ptr().cast(), self.v3);
1605 _mm256_storeu_si256(v4.as_mut_ptr().cast(), self.v4);
1606 }
1607
1608 writeln!(f, "Unreduced4x130([")?;
1609 write!(f, " ")?;
1610 write_130_wide(
1611 f,
1612 [
1613 u64::from_le_bytes(v0[0..8].try_into().unwrap()),
1614 u64::from_le_bytes(v1[0..8].try_into().unwrap()),
1615 u64::from_le_bytes(v2[0..8].try_into().unwrap()),
1616 u64::from_le_bytes(v3[0..8].try_into().unwrap()),
1617 u64::from_le_bytes(v4[0..8].try_into().unwrap()),
1618 ],
1619 )?;
1620 writeln!(f, ",")?;
1621 write!(f, " ")?;
1622 write_130_wide(
1623 f,
1624 [
1625 u64::from_le_bytes(v0[8..16].try_into().unwrap()),
1626 u64::from_le_bytes(v1[8..16].try_into().unwrap()),
1627 u64::from_le_bytes(v2[8..16].try_into().unwrap()),
1628 u64::from_le_bytes(v3[8..16].try_into().unwrap()),
1629 u64::from_le_bytes(v4[8..16].try_into().unwrap()),
1630 ],
1631 )?;
1632 writeln!(f, ",")?;
1633 write!(f, " ")?;
1634 write_130_wide(
1635 f,
1636 [
1637 u64::from_le_bytes(v0[16..24].try_into().unwrap()),
1638 u64::from_le_bytes(v1[16..24].try_into().unwrap()),
1639 u64::from_le_bytes(v2[16..24].try_into().unwrap()),
1640 u64::from_le_bytes(v3[16..24].try_into().unwrap()),
1641 u64::from_le_bytes(v4[16..24].try_into().unwrap()),
1642 ],
1643 )?;
1644 writeln!(f, ",")?;
1645 write!(f, " ")?;
1646 write_130_wide(
1647 f,
1648 [
1649 u64::from_le_bytes(v0[24..32].try_into().unwrap()),
1650 u64::from_le_bytes(v1[24..32].try_into().unwrap()),
1651 u64::from_le_bytes(v2[24..32].try_into().unwrap()),
1652 u64::from_le_bytes(v3[24..32].try_into().unwrap()),
1653 u64::from_le_bytes(v4[24..32].try_into().unwrap()),
1654 ],
1655 )?;
1656 writeln!(f, ",")?;
1657 write!(f, "])")
1658 }
1659}
1660
1661impl Unreduced4x130 {
1662 #[inline(always)]
1663 pub(super) fn reduce(self) -> Aligned4x130 {
1664 unsafe {
1665 // Starting with the following limb layout across 64-bit words:
1666 // x.v4 = [u34, u24, u14, u04]
1667 // x.v3 = [u33, u23, u13, u03]
1668 // x.v2 = [u32, u22, u12, u02]
1669 // x.v1 = [u31, u21, u11, u01]
1670 // x.v0 = [u30, u20, u10, u00]
1671 let x = self;
1672
1673 // 26-bit mask on each 64-bit word.
1674 let mask_26 = _mm256_set1_epi64x(0x3ffffff);
1675
1676 // Carry from x0 up into x1, returning their new values.
1677 let adc = |x1: __m256i, x0: __m256i| -> (__m256i, __m256i) {
1678 let y1 = _mm256_add_epi64(x1, _mm256_srli_epi64(x0, 26));
1679 let y0 = _mm256_and_si256(x0, mask_26);
1680 (y1, y0)
1681 };
1682
1683 // Reduce modulo 2^130 - 5 from x4 down into x0, returning their new values.
1684 let red = |x4: __m256i, x0: __m256i| -> (__m256i, __m256i) {
1685 let y0 = _mm256_add_epi64(
1686 x0,
1687 _mm256_mul_epu32(_mm256_srli_epi64(x4, 26), _mm256_set1_epi64x(5)),
1688 );
1689 let y4 = _mm256_and_si256(x4, mask_26);
1690 (y4, y0)
1691 };
1692
1693 // Reduce the four integers in parallel to below 2^130.
1694 let (red_1, red_0) = adc(x.v1, x.v0);
1695 let (red_4, red_3) = adc(x.v4, x.v3);
1696 let (red_2, red_1) = adc(x.v2, red_1);
1697 let (red_4, red_0) = red(red_4, red_0);
1698 let (red_3, red_2) = adc(red_3, red_2);
1699 let (red_1, red_0) = adc(red_1, red_0);
1700 let (red_4, red_3) = adc(red_4, red_3);
1701
1702 // At this point, all limbs are contained within the lower 32 bits of each
1703 // 64-bit word. The upper limb of each integer (in red_4) is positioned
1704 // correctly for Aligned4x130, but the other limbs need to be blended
1705 // together:
1706 // - v0 contains limbs 0 and 2.
1707 // - v1 contains limbs 1 and 3.
1708 Aligned4x130 {
1709 v0: _mm256_blend_epi32(red_0, _mm256_slli_epi64(red_2, 32), 0b10101010),
1710 v1: _mm256_blend_epi32(red_1, _mm256_slli_epi64(red_3, 32), 0b10101010),
1711 v2: red_4,
1712 }
1713 }
1714 }
1715
1716 /// Returns the unreduced sum of the four 130-bit integers.
1717 #[inline(always)]
1718 pub(super) fn sum(self) -> Unreduced130 {
1719 unsafe {
1720 // Starting with the following limb layout across 64-bit words:
1721 // x.v4 = [u34, u24, u14, u04]
1722 // x.v3 = [u33, u23, u13, u03]
1723 // x.v2 = [u32, u22, u12, u02]
1724 // x.v1 = [u31, u21, u11, u01]
1725 // x.v0 = [u30, u20, u10, u00]
1726 let x = self;
1727
1728 // v0 = [
1729 // u31 + u21,
1730 // u30 + u20,
1731 // u11 + u01,
1732 // u10 + u00,
1733 // ]
1734 let v0 = _mm256_add_epi64(
1735 _mm256_unpackhi_epi64(x.v0, x.v1),
1736 _mm256_unpacklo_epi64(x.v0, x.v1),
1737 );
1738
1739 // v1 = [
1740 // u33 + u23,
1741 // u32 + u22,
1742 // u13 + u03,
1743 // u12 + u02,
1744 // ]
1745 let v1 = _mm256_add_epi64(
1746 _mm256_unpackhi_epi64(x.v2, x.v3),
1747 _mm256_unpacklo_epi64(x.v2, x.v3),
1748 );
1749
1750 // v0 = [
1751 // u33 + u23 + u13 + u03,
1752 // u32 + u22 + u12 + u02,
1753 // u31 + u21 + u11 + u01,
1754 // u30 + u20 + u10 + u00,
1755 // ]
1756 let v0 = _mm256_add_epi64(
1757 _mm256_inserti128_si256(v0, _mm256_castsi256_si128(v1), 1),
1758 _mm256_inserti128_si256(v1, _mm256_extractf128_si256(v0, 1), 0),
1759 );
1760
1761 // v1 = [
1762 // u34 + u14,
1763 // u24 + u04,
1764 // u14 + u34,
1765 // u04 + u24,
1766 // ]
1767 let v1 = _mm256_add_epi64(x.v4, _mm256_permute4x64_epi64(x.v4, set02(1, 0, 3, 2)));
1768
1769 // v1 = [
1770 // u34 + u24 + u14 + u04,
1771 // u24 + u24 + u04 + u04,
1772 // u34 + u24 + u14 + u04,
1773 // u34 + u24 + u14 + u04,
1774 // ]
1775 let v1 = _mm256_add_epi64(v1, _mm256_permute4x64_epi64(v1, set02(0, 0, 0, 1)));
1776
1777 // The result:
1778 // v1 = [
1779 // u34 + u24 + u14 + u04,
1780 // u24 + u24 + u04 + u04,
1781 // u34 + u24 + u14 + u04,
1782 // u34 + u24 + u14 + u04,
1783 // ]
1784 // v0 = [
1785 // u33 + u23 + u13 + u03,
1786 // u32 + u22 + u12 + u02,
1787 // u31 + u21 + u11 + u01,
1788 // u30 + u20 + u10 + u00,
1789 // ]
1790 // This corresponds to:
1791 // v1 = [ _, _, _, t_4]
1792 // v0 = [t_3, t_2, t_1, t_0]
1793 Unreduced130 { v0, v1 }
1794 }
1795 }
1796}
1797
1798#[derive(Clone, Copy, Debug)]
1799pub(super) struct AdditionKey(__m256i);
1800
1801impl Add<Aligned130> for AdditionKey {
1802 type Output = IntegerTag;
1803
1804 /// Computes x + k mod 2^128
1805 #[inline(always)]
1806 fn add(self, x: Aligned130) -> IntegerTag {
1807 unsafe {
1808 // Starting with the following limb layout:
1809 // x = [0, _, _, x4, x3, x2, x1, x0]
1810 // k = [0, k7, 0, k6, 0, k5, 0, k4]
1811 let mut x = _mm256_and_si256(x.0, _mm256_set_epi32(0, 0, 0, -1, -1, -1, -1, -1));
1812 let k = self.0;
1813
1814 /// Reduce to an integer below 2^130.
1815 unsafe fn propagate_carry(x: __m256i) -> __m256i {
1816 // t = [
1817 // 0,
1818 // 0,
1819 // 0,
1820 // x3 >> 26,
1821 // x2 >> 26,
1822 // x1 >> 26,
1823 // x0 >> 26,
1824 // x4 >> 26,
1825 // ];
1826 let t = _mm256_permutevar8x32_epi32(
1827 _mm256_srli_epi32(x, 26),
1828 _mm256_set_epi32(7, 7, 7, 3, 2, 1, 0, 4),
1829 );
1830
1831 // [
1832 // 0,
1833 // 0,
1834 // 0,
1835 // x4 % 2^26,
1836 // x3 % 2^26,
1837 // x2 % 2^26,
1838 // x1 % 2^26,
1839 // x0 % 2^26,
1840 // ]
1841 // + t + [0, 0, 0, 0, 0, 0, 0, 4·(x4 >> 26)]
1842 // = [
1843 // 0,
1844 // 0,
1845 // 0,
1846 // x4 % 2^26 + x3 >> 26,
1847 // x3 % 2^26 + x2 >> 26,
1848 // x2 % 2^26 + x1 >> 26,
1849 // x1 % 2^26 + x0 >> 26,
1850 // x0 % 2^26 + 5·(x4 >> 26),
1851 // ] => [0, 0, 0, x4, x3, x2, x1, x0]
1852 _mm256_add_epi32(
1853 _mm256_add_epi32(
1854 _mm256_and_si256(
1855 x,
1856 _mm256_set_epi32(
1857 0, 0, 0, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff,
1858 ),
1859 ),
1860 t,
1861 ),
1862 _mm256_permutevar8x32_epi32(
1863 _mm256_slli_epi32(t, 2),
1864 _mm256_set_epi32(7, 7, 7, 7, 7, 7, 7, 0),
1865 ),
1866 )
1867 }
1868
1869 // Reduce modulus 2^130-5:
1870 // - Reduce to an integer below 2^130:
1871 // TODO: Is it more efficient to unpack the limbs for this?
1872 for _ in 0..5 {
1873 x = propagate_carry(x);
1874 }
1875
1876 // - Compute x + -p by adding 5 and carrying up to the top limb:
1877 // g = [0, 0, 0, g4, g3, g2, g1, g0]
1878 let mut g = _mm256_add_epi32(x, _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 5));
1879 // TODO: Is it more efficient to unpack the limbs for this?
1880 for _ in 0..4 {
1881 g = propagate_carry(g);
1882 }
1883 let g = _mm256_sub_epi32(g, _mm256_set_epi32(0, 0, 0, 1 << 26, 0, 0, 0, 0));
1884
1885 // - Check whether g4 overflowed:
1886 let mask = _mm256_permutevar8x32_epi32(
1887 _mm256_sub_epi32(_mm256_srli_epi32(g, 32 - 1), _mm256_set1_epi32(1)),
1888 _mm256_set1_epi32(4),
1889 );
1890
1891 // - Select x if g4 overflowed, else g:
1892 let x = _mm256_or_si256(
1893 _mm256_and_si256(x, _mm256_xor_si256(mask, _mm256_set1_epi32(-1))),
1894 _mm256_and_si256(g, mask),
1895 );
1896
1897 // Align back to 32 bits per digit. We drop the top two bits of the top limb,
1898 // because we only care about the lower 128 bits from here onward, and don't
1899 // need to track overflow or reduce.
1900 // [
1901 // 0,
1902 // 0,
1903 // 0,
1904 // 0,
1905 // x4[24..0] || x3[26..18],
1906 // x3[18..0] || x2[26..12],
1907 // x2[12..0] || x1[26.. 6],
1908 // x1[ 6..0] || x0[26.. 0],
1909 // ]
1910 let x = _mm256_or_si256(
1911 _mm256_srlv_epi32(x, _mm256_set_epi32(32, 32, 32, 32, 18, 12, 6, 0)),
1912 _mm256_permutevar8x32_epi32(
1913 _mm256_sllv_epi32(x, _mm256_set_epi32(32, 32, 32, 8, 14, 20, 26, 32)),
1914 _mm256_set_epi32(7, 7, 7, 7, 4, 3, 2, 1),
1915 ),
1916 );
1917
1918 // Add key
1919 // [
1920 // (x4[24..0] || x3[26..18]) + k7,
1921 // (x3[18..0] || x2[26..12]) + k6,
1922 // (x2[12..0] || x1[26.. 6]) + k5,
1923 // (x1[ 6..0] || x0[26.. 0]) + k4,
1924 // ]
1925 let mut x = _mm256_add_epi64(
1926 _mm256_permutevar8x32_epi32(x, _mm256_set_epi32(7, 3, 7, 2, 7, 1, 7, 0)),
1927 k,
1928 );
1929
1930 // Ensure that all carries are handled
1931 unsafe fn propagate_carry_32(x: __m256i) -> __m256i {
1932 // [
1933 // (l4 % 2^32) + (l3 >> 32),
1934 // (l3 % 2^32) + (l2 >> 32),
1935 // (l2 % 2^32) + (l1 >> 32),
1936 // (l1 % 2^32),
1937 // ]
1938 _mm256_add_epi64(
1939 _mm256_and_si256(x, _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1)),
1940 _mm256_permute4x64_epi64(
1941 _mm256_and_si256(
1942 _mm256_srli_epi64(x, 32),
1943 _mm256_set_epi64x(0, -1, -1, -1),
1944 ),
1945 set02(2, 1, 0, 3),
1946 ),
1947 )
1948 }
1949 for _ in 0..3 {
1950 x = propagate_carry_32(x);
1951 }
1952
1953 // Now that all limbs are at most 32 bits, realign from 64- to 32-bit limbs.
1954 // [
1955 // 0,
1956 // 0,
1957 // 0,
1958 // 0,
1959 // ((x4[24..0] || x3[26..18]) + k7) % 2^32 + ((x3[18..0] || x2[26..12]) + k6) >> 32,
1960 // ((x3[18..0] || x2[26..12]) + k6) % 2^32 + ((x2[12..0] || x1[26.. 6]) + k5) >> 32,
1961 // ((x2[12..0] || x1[26.. 6]) + k5) % 2^32 + ((x1[ 6..0] || x0[26.. 0]) + k4) >> 32,
1962 // ((x1[ 6..0] || x0[26.. 0]) + k4) % 2^32,
1963 // ]
1964 let x = _mm256_permutevar8x32_epi32(x, _mm256_set_epi32(7, 7, 7, 7, 6, 4, 2, 0));
1965
1966 // Reduce modulus 2^128
1967 IntegerTag(_mm256_castsi256_si128(x))
1968 }
1969 }
1970}
1971
1972pub(super) struct IntegerTag(__m128i);
1973
1974impl From<AdditionKey> for IntegerTag {
1975 fn from(k: AdditionKey) -> Self {
1976 unsafe {
1977 // There was no polynomial to add.
1978 IntegerTag(_mm256_castsi256_si128(_mm256_permutevar8x32_epi32(
1979 k.0,
1980 _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0),
1981 )))
1982 }
1983 }
1984}
1985
1986impl IntegerTag {
1987 pub(super) fn write(self, tag: &mut [u8]) {
1988 unsafe {
1989 _mm_storeu_si128(tag.as_mut_ptr().cast(), self.0);
1990 }
1991 }
1992}