polyval/backend/intrinsics/
x86.rs

1//! VPCLMULQDQ optimized POLYVAL implementation using R/F Algorithm
2//! Adapted from the implementation in the Apache 2.0 + MIT-licensed HPCrypt library
3//! Copyright (c) 2024 HPCrypt Contributors
4//!
5//! Uses the R/F algorithm from "Efficient GHASH Implementation Using CLMUL":
6//! - 4 CLMULs per block for multiplication (R and F terms)
7//! - 1 CLMUL for reduction (Lemma 3)
8//! - 4-block aggregated processing with single reduction
9//!
10//! Key equations:
11//! - D = swap(H) ⊕ (H0 × P1)
12//! - R = M0×D1 ⊕ M1×H1
13//! - F = M0×D0 ⊕ M1×H0
14//! - Result = R ⊕ F1 ⊕ (x^64×F0) ⊕ (P1×F0)
15//!
16//! POLYVAL operates in GF(2^128) with polynomial x^128 + x^127 + x^126 + x^121 + 1
17//! Unlike GHASH, POLYVAL uses little-endian byte ordering (no byte swap needed).
18//!
19//! <https://eprint.iacr.org/2025/2171.pdf>
20
21#![allow(unsafe_op_in_unsafe_fn)]
22
23use super::ExpandedKey;
24use crate::{Block, ParBlocks, field_element::FieldElement};
25
26#[cfg(target_arch = "x86")]
27use core::arch::x86::*;
28#[cfg(target_arch = "x86_64")]
29use core::arch::x86_64::*;
30
31/// P1 polynomial: x^63 + x^62 + x^57 = 0xC200000000000000
32const P1: u64 = 0xC200000000000000;
33
34cpufeatures::new!(clmul, "vpclmulqdq");
35pub(crate) use clmul::InitToken;
36
37/// Byte array which is the inner type of `FieldElement`
38type ByteArray = [u8; 16];
39
40impl FieldElement {
41    #[target_feature(enable = "sse2")]
42    #[inline]
43    unsafe fn from_m128i(reg: __m128i) -> Self {
44        let mut out = ByteArray::default();
45        _mm_storeu_si128(out.as_mut_ptr().cast(), reg);
46        out.into()
47    }
48
49    #[target_feature(enable = "sse2")]
50    #[inline]
51    unsafe fn to_m128i(self) -> __m128i {
52        load_bytes(&self.into())
53    }
54}
55
56/// Convert 16 bytes into `__m128i`.
57///
58/// # Safety
59/// Requires SSE2 support
60#[target_feature(enable = "sse2")]
61#[inline]
62unsafe fn load_bytes(bytes: &ByteArray) -> __m128i {
63    _mm_loadu_si128(bytes.as_ptr().cast())
64}
65
66/// Update with a single block (5 CLMULs)
67///
68/// # Safety
69/// Requires VPCLMULQDQ support
70// TODO(tarcieri): use `enable = "vpclmulqdq"` when MSRV 1.89+
71#[target_feature(enable = "avx", enable = "pclmulqdq")]
72#[inline]
73pub(super) unsafe fn proc_block(
74    key: &ExpandedKey,
75    acc: FieldElement,
76    block: &Block,
77) -> FieldElement {
78    let data = load_bytes(&block.0);
79
80    // XOR with accumulator
81    let y = _mm_xor_si128(acc.to_m128i(), data);
82
83    // Multiply by H using R/F algorithm
84    FieldElement::from_m128i(gf128_mul_rf(y, key.h1.to_m128i(), key.d1.to_m128i()))
85}
86
87/// Process 4 blocks with R/F algorithm and aggregated reduction
88///
89/// Uses 16 CLMULs for multiplication (4 per block) + 1 CLMUL for reduction = 17 CLMULs total
90#[target_feature(enable = "avx", enable = "pclmulqdq")]
91#[inline]
92pub(super) unsafe fn proc_par_blocks(
93    key: &ExpandedKey,
94    acc: FieldElement,
95    par_blocks: &ParBlocks,
96) -> FieldElement {
97    // Load all 4 blocks (no byte swap for POLYVAL)
98    let m0 = load_bytes(&par_blocks[0].0);
99    let m1 = load_bytes(&par_blocks[1].0);
100    let m2 = load_bytes(&par_blocks[2].0);
101    let m3 = load_bytes(&par_blocks[3].0);
102
103    // XOR first block with accumulator
104    let y0 = _mm_xor_si128(acc.to_m128i(), m0);
105
106    // R/F multiply all 4 blocks (16 CLMULs)
107    let (r0, f0) = rf_mul_unreduced(y0, key.h4.to_m128i(), key.d4.to_m128i());
108    let (r1, f1) = rf_mul_unreduced(m1, key.h3.to_m128i(), key.d3.to_m128i());
109    let (r2, f2) = rf_mul_unreduced(m2, key.h2.to_m128i(), key.d2.to_m128i());
110    let (r3, f3) = rf_mul_unreduced(m3, key.h1.to_m128i(), key.d1.to_m128i());
111
112    // Aggregate R and F values
113    let r = _mm_xor_si128(_mm_xor_si128(r0, r1), _mm_xor_si128(r2, r3));
114    let f = _mm_xor_si128(_mm_xor_si128(f0, f1), _mm_xor_si128(f2, f3));
115
116    // Single reduction (1 CLMUL)
117    FieldElement::from_m128i(reduce_rf(r, f))
118}
119
120/// Create a new POLYVAL key with R/F algorithm
121///
122/// # Safety
123/// Requires VPCLMULQDQ support
124#[target_feature(enable = "avx", enable = "pclmulqdq")]
125pub(super) unsafe fn expand_key(h: &[u8; 16]) -> ExpandedKey {
126    let h1 = load_bytes(h);
127    let d1 = compute_d(h1);
128
129    // Compute powers using R/F multiplication
130    let h2 = gf128_mul_rf(h1, h1, d1);
131    let d2 = compute_d(h2);
132
133    let h3 = gf128_mul_rf(h2, h1, d1);
134    let d3 = compute_d(h3);
135
136    let h4 = gf128_mul_rf(h2, h2, d2);
137    let d4 = compute_d(h4);
138
139    ExpandedKey {
140        h1: FieldElement::from_m128i(h1),
141        d1: FieldElement::from_m128i(d1),
142        h2: FieldElement::from_m128i(h2),
143        d2: FieldElement::from_m128i(d2),
144        h3: FieldElement::from_m128i(h3),
145        d3: FieldElement::from_m128i(d3),
146        h4: FieldElement::from_m128i(h4),
147        d4: FieldElement::from_m128i(d4),
148    }
149}
150
151/// Compute D from H using the R/F algorithm
152///
153/// D = swap(H) ⊕ (H0 × P1)
154#[target_feature(enable = "avx", enable = "pclmulqdq")]
155#[inline]
156unsafe fn compute_d(h: __m128i) -> __m128i {
157    // TODO(tarcieri): P1.cast_signed() when MSRV 1.87+
158    #[allow(clippy::cast_possible_wrap)]
159    let p = _mm_set_epi64x(P1 as i64, 0);
160
161    // Swap halves: [H1 : H0] -> [H0 : H1]
162    let h_swap = _mm_shuffle_epi32(h, 0x4e);
163
164    // T = H0 × P1
165    let t = _mm_clmulepi64_si128(h, p, 0x10);
166
167    // D = swap(H) ⊕ T
168    _mm_xor_si128(h_swap, t)
169}
170
171/// R/F multiplication using 4 CLMULs per block
172///
173/// Given M = [M1 : M0] and precomputed H = [H1 : H0], D = [D1 : D0]:
174/// - R = M0×D1 ⊕ M1×H1 (2 CLMULs)
175/// - F = M0×D0 ⊕ M1×H0 (2 CLMULs)
176///
177/// Returns (R, F) for later reduction
178#[target_feature(enable = "avx", enable = "pclmulqdq")]
179#[inline]
180unsafe fn rf_mul_unreduced(m: __m128i, h: __m128i, d: __m128i) -> (__m128i, __m128i) {
181    // R = M0×D1 ⊕ M1×H1
182    let r0 = _mm_clmulepi64_si128(m, d, 0x10); // M0 × D1
183    let r1 = _mm_clmulepi64_si128(m, h, 0x11); // M1 × H1
184    let r = _mm_xor_si128(r0, r1);
185
186    // F = M0×D0 ⊕ M1×H0
187    let f0 = _mm_clmulepi64_si128(m, d, 0x00); // M0 × D0
188    let f1 = _mm_clmulepi64_si128(m, h, 0x01); // M1 × H0
189    let f = _mm_xor_si128(f0, f1);
190
191    (r, f)
192}
193
194/// Reduction using Lemma 3: Result = R ⊕ F1 ⊕ (x^64×F0) ⊕ (P1×F0)
195///
196/// Uses 1 CLMUL for reduction
197#[target_feature(enable = "avx", enable = "pclmulqdq")]
198#[inline]
199unsafe fn reduce_rf(r: __m128i, f: __m128i) -> __m128i {
200    // TODO(tarcieri): P1.cast_signed() when MSRV 1.87+
201    #[allow(clippy::cast_possible_wrap)]
202    let p1 = _mm_set_epi64x(0, P1 as i64);
203
204    // F1 in low position
205    let f1 = _mm_srli_si128(f, 8);
206
207    // x^64×F0 (shift F0 to high position)
208    let f0_shifted = _mm_slli_si128(f, 8);
209
210    // P1×F0
211    let p1_f0 = _mm_clmulepi64_si128(f, p1, 0x00);
212
213    // Result = R ⊕ F1 ⊕ (x^64×F0) ⊕ (P1×F0)
214    let result = _mm_xor_si128(r, f1);
215    let result = _mm_xor_si128(result, f0_shifted);
216    _mm_xor_si128(result, p1_f0)
217}
218
219/// Complete R/F multiplication with reduction (5 CLMULs total)
220#[target_feature(enable = "avx", enable = "pclmulqdq")]
221#[inline]
222unsafe fn gf128_mul_rf(m: __m128i, h: __m128i, d: __m128i) -> __m128i {
223    let (r, f) = rf_mul_unreduced(m, h, d);
224    reduce_rf(r, f)
225}
polyval/backend/intrinsics/x86.rs

polyval/backend/intrinsics/
x86.rs