// polyval/backend/clmul.rs

1//! Intel `CLMUL`-accelerated implementation for modern x86/x86_64 CPUs
2//! (i.e. Intel Sandy Bridge-compatible or newer)
3
4#[cfg(target_arch = "x86")]
5use core::arch::x86::*;
6#[cfg(target_arch = "x86_64")]
7use core::arch::x86_64::*;
8
9use universal_hash::{
10    consts::{U1, U16},
11    crypto_common::{BlockSizeUser, KeySizeUser, ParBlocksSizeUser},
12    KeyInit, Reset, UhfBackend,
13};
14
15use crate::{Block, Key, Tag};
16
/// **POLYVAL**: GHASH-like universal hash over GF(2^128).
#[derive(Clone)]
pub struct Polyval {
    // Hash key `H`, loaded from the 16-byte key in `new_with_init_block`.
    h: __m128i,
    // Running accumulator: updated per block in `mul`, zeroed in `reset`,
    // and emitted as the tag in `finalize`.
    y: __m128i,
}
23
// POLYVAL takes a 16-byte (128-bit) key, matching the `__m128i` key field.
impl KeySizeUser for Polyval {
    type KeySize = U16;
}
27
28impl Polyval {
29    /// Initialize POLYVAL with the given `H` field element and initial block
30    pub fn new_with_init_block(h: &Key, init_block: u128) -> Self {
31        unsafe {
32            // `_mm_loadu_si128` performs an unaligned load
33            #[allow(clippy::cast_ptr_alignment)]
34            Self {
35                h: _mm_loadu_si128(h.as_ptr() as *const __m128i),
36                y: _mm_loadu_si128(&init_block.to_be_bytes()[..] as *const _ as *const __m128i),
37            }
38        }
39    }
40}
41
42impl KeyInit for Polyval {
43    /// Initialize POLYVAL with the given `H` field element
44    fn new(h: &Key) -> Self {
45        Self::new_with_init_block(h, 0)
46    }
47}
48
// POLYVAL processes 16-byte (128-bit) blocks — one GF(2^128) field element each.
impl BlockSizeUser for Polyval {
    type BlockSize = U16;
}
52
// This backend processes one block at a time (no parallel block batching).
impl ParBlocksSizeUser for Polyval {
    type ParBlocksSize = U1;
}
56
impl UhfBackend for Polyval {
    /// Absorb one 16-byte block into the running accumulator.
    fn proc_block(&mut self, x: &Block) {
        // SAFETY: `mul` is gated on the `pclmulqdq` target feature.
        // NOTE(review): CPU support is presumably verified before this
        // backend is constructed (e.g. by feature detection in the parent
        // module) — that check is not visible from this file; confirm.
        unsafe {
            self.mul(x);
        }
    }
}
64
impl Polyval {
    /// Get GHASH output
    pub(crate) fn finalize(self) -> Tag {
        // SAFETY: reinterprets the 16-byte accumulator as a `Tag`.
        // NOTE(review): this assumes `Tag` (declared elsewhere) is exactly
        // 16 bytes, same size as `__m128i` — transmute is a compile error
        // otherwise, so size mismatches cannot slip through silently.
        unsafe { core::mem::transmute(self.y) }
    }
}
71
impl Polyval {
    /// Fold block `x` into the accumulator and multiply by `H` in GF(2^128):
    /// `y = (y ^ x) * H`.
    ///
    /// Implemented with the CLMUL (`pclmulqdq`) carry-less multiply
    /// instruction: a 3-multiplication Karatsuba decomposition over the
    /// 64-bit halves, followed by a shift/XOR polynomial reduction.
    #[inline]
    #[target_feature(enable = "pclmulqdq")]
    unsafe fn mul(&mut self, x: &Block) {
        let h = self.h;

        // `_mm_loadu_si128` performs an unaligned load
        #[allow(clippy::cast_ptr_alignment)]
        let x = _mm_loadu_si128(x.as_ptr() as *const __m128i);
        // Fold the new block into the accumulator before multiplying.
        let y = _mm_xor_si128(self.y, x);

        // Karatsuba setup for `h`: low half, high half, and their XOR.
        // `_mm_shuffle_epi32(v, 0x0E)` moves the high 64 bits of `v` into
        // the low 64-bit lane.
        let h0 = h;
        let h1 = _mm_shuffle_epi32(h, 0x0E);
        let h2 = _mm_xor_si128(h0, h1);
        let y0 = y;

        // Multiply values partitioned to 64-bit parts
        let y1 = _mm_shuffle_epi32(y, 0x0E);
        let y2 = _mm_xor_si128(y0, y1);
        let t0 = _mm_clmulepi64_si128(y0, h0, 0x00); // lo(y) * lo(h)
        let t1 = _mm_clmulepi64_si128(y, h, 0x11); // hi(y) * hi(h)
        let t2 = _mm_clmulepi64_si128(y2, h2, 0x00); // (lo^hi)(y) * (lo^hi)(h)
        // Karatsuba middle term: t2 ^= t0 ^ t1.
        let t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1));
        // Recombine into the 256-bit product as four 64-bit words
        // v3:v2:v1:v0 (each in the low lane of its register).
        let v0 = t0;
        let v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2);
        let v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
        let v3 = _mm_shuffle_epi32(t1, 0x0E);

        // Polynomial reduction
        // The shift amounts (1, 2, 7 right; 63, 62, 57 left) fold the upper
        // half of the product back into the lower half modulo the POLYVAL
        // field polynomial; the left shifts carry the bits shifted out of
        // each right-shifted word into the adjacent word.
        let v2 = xor5(
            v2,
            v0,
            _mm_srli_epi64(v0, 1),
            _mm_srli_epi64(v0, 2),
            _mm_srli_epi64(v0, 7),
        );

        let v1 = xor4(
            v1,
            _mm_slli_epi64(v0, 63),
            _mm_slli_epi64(v0, 62),
            _mm_slli_epi64(v0, 57),
        );

        // Second reduction round, folding v1 into v3/v2 the same way.
        let v3 = xor5(
            v3,
            v1,
            _mm_srli_epi64(v1, 1),
            _mm_srli_epi64(v1, 2),
            _mm_srli_epi64(v1, 7),
        );

        let v2 = xor4(
            v2,
            _mm_slli_epi64(v1, 63),
            _mm_slli_epi64(v1, 62),
            _mm_slli_epi64(v1, 57),
        );

        // New accumulator: low 64 bits from v2, high 64 bits from v3.
        self.y = _mm_unpacklo_epi64(v2, v3);
    }
}
134
impl Reset for Polyval {
    /// Reset the accumulator to zero; the key `H` is left intact.
    fn reset(&mut self) {
        // SAFETY: `_mm_setzero_si128` is an SSE2 intrinsic, available on
        // every CPU that supports CLMUL (which this backend requires).
        unsafe {
            self.y = _mm_setzero_si128();
        }
    }
}
142
// With the `zeroize` feature enabled, wipe key material and hash state from
// memory when the hasher is dropped, so secrets don't linger on the stack/heap.
#[cfg(feature = "zeroize")]
impl Drop for Polyval {
    fn drop(&mut self) {
        use zeroize::Zeroize;
        self.h.zeroize();
        self.y.zeroize();
    }
}
151
/// XOR four 128-bit vectors together: `e1 ^ e2 ^ e3 ^ e4`.
#[inline(always)]
unsafe fn xor4(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i) -> __m128i {
    // XOR is associative, so a left-to-right fold produces the same bits
    // as any other grouping.
    let mut acc = _mm_xor_si128(e1, e2);
    acc = _mm_xor_si128(acc, e3);
    _mm_xor_si128(acc, e4)
}
156
/// XOR five 128-bit vectors together: `e1 ^ e2 ^ e3 ^ e4 ^ e5`.
#[inline(always)]
unsafe fn xor5(e1: __m128i, e2: __m128i, e3: __m128i, e4: __m128i, e5: __m128i) -> __m128i {
    // XOR is associative, so a left-to-right fold produces the same bits
    // as any other grouping.
    let mut acc = _mm_xor_si128(e1, e2);
    acc = _mm_xor_si128(acc, e3);
    acc = _mm_xor_si128(acc, e4);
    _mm_xor_si128(acc, e5)
}