//! Utility functions for processing eight AES blocks in parallel using
//! SSE2 and AES-NI intrinsics.

// TODO(tarcieri): check performance impact / generated assembly changes
#![allow(clippy::needless_range_loop)]

use super::arch::*;
use crate::{Block, Block8};

/// Eight 128-bit SIMD registers, each holding one 16-byte AES block, so that
/// eight blocks can be processed in parallel.
pub type U128x8 = [__m128i; 8];

/// Test-only helper: assert that the raw contents of each register in `a`
/// match the corresponding reference value in `b`, given as big-endian `u64` pairs.
#[cfg(test)]
pub(crate) fn check(a: &[__m128i], b: &[[u64; 2]]) {
    for (v1, v2) in a.iter().zip(b) {
        let t1: [u64; 2] = unsafe { core::mem::transmute(*v1) };
        let t2 = [v2[0].to_be(), v2[1].to_be()];
        assert_eq!(t1, t2);
    }
}

/// Load eight consecutive 16-byte blocks into SIMD registers using unaligned loads.
#[inline(always)]
pub(crate) fn load8(blocks: *const Block8) -> U128x8 {
    unsafe {
        let p = blocks as *const Block;
        [
            _mm_loadu_si128(p.add(0) as *const __m128i),
            _mm_loadu_si128(p.add(1) as *const __m128i),
            _mm_loadu_si128(p.add(2) as *const __m128i),
            _mm_loadu_si128(p.add(3) as *const __m128i),
            _mm_loadu_si128(p.add(4) as *const __m128i),
            _mm_loadu_si128(p.add(5) as *const __m128i),
            _mm_loadu_si128(p.add(6) as *const __m128i),
            _mm_loadu_si128(p.add(7) as *const __m128i),
        ]
    }
}

/// Store eight SIMD registers back to memory as eight consecutive 16-byte blocks.
#[inline(always)]
pub(crate) fn store8(blocks: *mut Block8, b: U128x8) {
    unsafe {
        let p = blocks as *mut Block;
        _mm_storeu_si128(p.add(0) as *mut __m128i, b[0]);
        _mm_storeu_si128(p.add(1) as *mut __m128i, b[1]);
        _mm_storeu_si128(p.add(2) as *mut __m128i, b[2]);
        _mm_storeu_si128(p.add(3) as *mut __m128i, b[3]);
        _mm_storeu_si128(p.add(4) as *mut __m128i, b[4]);
        _mm_storeu_si128(p.add(5) as *mut __m128i, b[5]);
        _mm_storeu_si128(p.add(6) as *mut __m128i, b[6]);
        _mm_storeu_si128(p.add(7) as *mut __m128i, b[7]);
    }
}

/// XOR the round key into all eight blocks (AddRoundKey).
#[inline(always)]
pub(crate) fn xor8(b: &mut U128x8, key: __m128i) {
    unsafe {
        b[0] = _mm_xor_si128(b[0], key);
        b[1] = _mm_xor_si128(b[1], key);
        b[2] = _mm_xor_si128(b[2], key);
        b[3] = _mm_xor_si128(b[3], key);
        b[4] = _mm_xor_si128(b[4], key);
        b[5] = _mm_xor_si128(b[5], key);
        b[6] = _mm_xor_si128(b[6], key);
        b[7] = _mm_xor_si128(b[7], key);
    }
}

/// Perform one full AES encryption round on all eight blocks.
#[inline(always)]
pub(crate) fn aesenc8(buffer: &mut U128x8, key: __m128i) {
    for i in 0..8 {
        buffer[i] = unsafe { _mm_aesenc_si128(buffer[i], key) };
    }
}

/// Perform the final AES encryption round (no MixColumns) on all eight blocks.
#[inline(always)]
pub(crate) fn aesenclast8(buffer: &mut U128x8, key: __m128i) {
    for i in 0..8 {
        buffer[i] = unsafe { _mm_aesenclast_si128(buffer[i], key) };
    }
}

/// Perform one full AES decryption round on all eight blocks.
#[inline(always)]
pub(crate) fn aesdec8(buffer: &mut U128x8, key: __m128i) {
    for i in 0..8 {
        buffer[i] = unsafe { _mm_aesdec_si128(buffer[i], key) };
    }
}

/// Perform the final AES decryption round (no InvMixColumns) on all eight blocks.
#[inline(always)]
pub(crate) fn aesdeclast8(buffer: &mut U128x8, key: __m128i) {
    for i in 0..8 {
        buffer[i] = unsafe { _mm_aesdeclast_si128(buffer[i], key) };
    }
}
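
// A minimal sketch (not part of this module's API) of how these helpers are
// typically composed into an 8-block AES-128 encryption. `round_keys` is a
// hypothetical expanded key schedule of 11 round keys (`[__m128i; 11]`); the
// actual key expansion lives elsewhere in this crate.
//
//     let mut state = load8(blocks);
//     xor8(&mut state, round_keys[0]);           // initial AddRoundKey
//     for &rk in &round_keys[1..10] {
//         aesenc8(&mut state, rk);               // nine full rounds
//     }
//     aesenclast8(&mut state, round_keys[10]);   // final round, no MixColumns
//     store8(blocks, state);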