1#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
32#![cfg(feature = "x86")]
33#![allow(
34    clippy::wildcard_imports,
35    clippy::cast_possible_truncation,
36    clippy::too_many_arguments,
37    clippy::inline_always,
38    clippy::doc_markdown,
39    dead_code
40)]
41
42#[cfg(target_arch = "x86")]
43use core::arch::x86::*;
44#[cfg(target_arch = "x86_64")]
45use core::arch::x86_64::*;
46
47use crate::color_convert::scalar::{CB_CF, CR_CF, C_G_CB_COEF_2, C_G_CR_COEF_1, YUV_RND, Y_CF};
48
/// A 256-bit AVX register viewed either as the raw SIMD type or as its
/// sixteen signed 16-bit lanes (both views are exactly 32 bytes).
///
/// Reading a union field is `unsafe`; only read the view matching the data
/// that was last written.
pub union YmmRegister {
    // Raw AVX2 register.
    mm256: __m256i,
    // The same 32 bytes reinterpreted as 16 i16 values.
    array: [i16; 16]
}
55
// Pre-packed 32-bit coefficient pairs consumed by `_mm256_madd_epi16`, which
// multiplies the two i16 halves of each 32-bit element by the matching
// coefficient halves and sums the products.
//
// The byte-wise packing below relies on little-endian layout, which is
// guaranteed by the x86/x86_64 `cfg` gate at the top of this module.
//
// Low 16 bits = CR_CF, high 16 bits = 0: madd yields `cr * CR_CF`.
const R_AVX_COEF: i32 = i32::from_ne_bytes([CR_CF.to_ne_bytes()[0], CR_CF.to_ne_bytes()[1], 0, 0]);
// Low 16 bits = 0, high 16 bits = CB_CF: madd yields `cb * CB_CF`.
const B_AVX_COEF: i32 = i32::from_ne_bytes([0, 0, CB_CF.to_ne_bytes()[0], CB_CF.to_ne_bytes()[1]]);
// Low = C_G_CR_COEF_1, high = C_G_CB_COEF_2: madd sums both green chroma terms.
const G_COEF_AVX_COEF: i32 = i32::from_ne_bytes([
    C_G_CR_COEF_1.to_ne_bytes()[0],
    C_G_CR_COEF_1.to_ne_bytes()[1],
    C_G_CB_COEF_2.to_ne_bytes()[0],
    C_G_CB_COEF_2.to_ne_bytes()[1]
]);
64
/// Convert 16 YCbCr samples to interleaved RGB, writing 48 bytes to `out`
/// and advancing `*offset` by 48.
///
/// Safe-looking wrapper over the `#[target_feature(enable = "avx2")]`
/// implementation. AVX2 availability is NOT checked here.
#[inline(always)]
pub fn ycbcr_to_rgb_avx2(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
) {
    unsafe {
        // SAFETY: relies on the caller only selecting this code path when the
        // CPU supports AVX2 — presumably enforced by a dispatch layer
        // elsewhere in the crate; TODO confirm against call sites.
        ycbcr_to_rgb_avx2_1(y, cb, cr, out, offset);
    }
}
95
96#[inline]
97#[target_feature(enable = "avx2")]
98unsafe fn ycbcr_to_rgb_avx2_1(
99    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
100) {
101    let (mut r, mut g, mut b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr);
102
103    r = _mm256_packus_epi16(r, _mm256_setzero_si256());
104    g = _mm256_packus_epi16(g, _mm256_setzero_si256());
105    b = _mm256_packus_epi16(b, _mm256_setzero_si256());
106
107    r = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>(r);
108    g = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>(g);
109    b = _mm256_permute4x64_epi64::<{ shuffle(3, 1, 2, 0) }>(b);
110
111    let sh_r = _mm256_setr_epi8(
112        0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14,
113        9, 4, 15, 10, 5
114    );
115    let sh_g = _mm256_setr_epi8(
116        5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3,
117        14, 9, 4, 15, 10
118    );
119    let sh_b = _mm256_setr_epi8(
120        10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8,
121        3, 14, 9, 4, 15
122    );
123
124    let r0 = _mm256_shuffle_epi8(r, sh_r);
125    let g0 = _mm256_shuffle_epi8(g, sh_g);
126    let b0 = _mm256_shuffle_epi8(b, sh_b);
127
128    let m0 = _mm256_setr_epi8(
129        0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1,
130        0, 0, -1, 0, 0
131    );
132    let m1 = _mm256_setr_epi8(
133        0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
134        -1, 0, 0, -1, 0
135    );
136
137    let p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, g0, m0), b0, m1);
138    let p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, b0, m0), r0, m1);
139    let p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, r0, m0), g0, m1);
140
141    let rgb0 = _mm256_permute2x128_si256::<32>(p0, p1);
142    let rgb1 = _mm256_permute2x128_si256::<48>(p2, p0);
143
144    _mm256_storeu_si256(out.as_mut_ptr().cast(), rgb0);
145    _mm_storeu_si128(out[32..].as_mut_ptr().cast(), _mm256_castsi256_si128(rgb1));
146
147    *offset += 48;
148}
149
150#[inline]
152#[target_feature(enable = "avx2")]
153unsafe fn ycbcr_to_rgb_baseline_no_clamp(
159    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16]
160) -> (__m256i, __m256i, __m256i) {
161    let y_c = _mm256_loadu_si256(y.as_ptr().cast());
164    let cb_c = _mm256_loadu_si256(cb.as_ptr().cast());
165    let cr_c = _mm256_loadu_si256(cr.as_ptr().cast());
166
167    let y_coeff = _mm256_set1_epi32(i32::from(Y_CF));
174    let cr_coeff = _mm256_set1_epi32(R_AVX_COEF);
175    let cb_coeff = _mm256_set1_epi32(B_AVX_COEF);
176    let cg_coeff = _mm256_set1_epi32(G_COEF_AVX_COEF);
177    let v_rnd = _mm256_set1_epi32(i32::from(YUV_RND));
178    let uv_bias = _mm256_set1_epi16(128);
179
180    let v_0 = _mm256_slli_epi16::<8>(cb_c);
182    let u_v_8 = _mm256_or_si256(v_0, cr_c);
183
184    let mut u_v_lo = _mm256_unpacklo_epi8(u_v_8, _mm256_setzero_si256());
185    let mut u_v_hi = _mm256_unpackhi_epi8(u_v_8, _mm256_setzero_si256());
186
187    let mut y_lo = _mm256_unpacklo_epi16(y_c, _mm256_setzero_si256());
188    let mut y_hi = _mm256_unpackhi_epi16(y_c, _mm256_setzero_si256());
189
190    u_v_lo = _mm256_sub_epi16(u_v_lo, uv_bias);
191    u_v_hi = _mm256_sub_epi16(u_v_hi, uv_bias);
192
193    y_lo = _mm256_madd_epi16(y_lo, y_coeff);
194    y_hi = _mm256_madd_epi16(y_hi, y_coeff);
195
196    let mut r_lo = _mm256_madd_epi16(u_v_lo, cr_coeff);
197    let mut r_hi = _mm256_madd_epi16(u_v_hi, cr_coeff);
198
199    let mut g_lo = _mm256_madd_epi16(u_v_lo, cg_coeff);
200    let mut g_hi = _mm256_madd_epi16(u_v_hi, cg_coeff);
201
202    y_lo = _mm256_add_epi32(y_lo, v_rnd);
205    y_hi = _mm256_add_epi32(y_hi, v_rnd);
206
207    let mut b_lo = _mm256_madd_epi16(u_v_lo, cb_coeff);
208    let mut b_hi = _mm256_madd_epi16(u_v_hi, cb_coeff);
209
210    r_lo = _mm256_add_epi32(r_lo, y_lo);
211    r_hi = _mm256_add_epi32(r_hi, y_hi);
212
213    g_lo = _mm256_add_epi32(g_lo, y_lo);
214    g_hi = _mm256_add_epi32(g_hi, y_hi);
215
216    b_lo = _mm256_add_epi32(b_lo, y_lo);
217    b_hi = _mm256_add_epi32(b_hi, y_hi);
218
219    r_lo = _mm256_srai_epi32::<14>(r_lo);
220    r_hi = _mm256_srai_epi32::<14>(r_hi);
221
222    g_lo = _mm256_srai_epi32::<14>(g_lo);
223    g_hi = _mm256_srai_epi32::<14>(g_hi);
224
225    b_lo = _mm256_srai_epi32::<14>(b_lo);
226    b_hi = _mm256_srai_epi32::<14>(b_hi);
227
228    let r = _mm256_packus_epi32(r_lo, r_hi);
229    let g = _mm256_packus_epi32(g_lo, g_hi);
230    let b = _mm256_packus_epi32(b_lo, b_hi);
231
232    return (r, g, b);
233}
234
/// Convert 16 YCbCr samples to interleaved RGBA, writing 64 bytes at
/// `out[*offset..]` and advancing `*offset` by 64.
///
/// Safe-looking wrapper over the `#[target_feature(enable = "avx2")]`
/// implementation. AVX2 availability is NOT checked here.
///
/// # Panics
/// Panics if `out[*offset..]` is shorter than 64 bytes.
#[inline(always)]
pub fn ycbcr_to_rgba_avx2(
    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16], out: &mut [u8], offset: &mut usize
) {
    unsafe {
        // SAFETY: relies on the caller only selecting this code path when the
        // CPU supports AVX2 — presumably enforced by a dispatch layer
        // elsewhere in the crate; TODO confirm against call sites.
        ycbcr_to_rgba_unsafe(y, cb, cr, out, offset);
    }
}
243
244#[inline]
245#[target_feature(enable = "avx2")]
246#[rustfmt::skip]
247unsafe fn ycbcr_to_rgba_unsafe(
248    y: &[i16; 16], cb: &[i16; 16], cr: &[i16; 16],
249    out: &mut [u8],
250    offset: &mut usize,
251)
252{
253    let tmp:& mut [u8; 64] = out.get_mut(*offset..*offset + 64).expect("Slice to small cannot write").try_into().unwrap();
255
256    let (r, g, b) = ycbcr_to_rgb_baseline_no_clamp(y, cb, cr);
257
258    let c = _mm256_packus_epi16(r, g); let d = _mm256_packus_epi16(b, _mm256_set1_epi16(255)); let e = _mm256_unpacklo_epi8(c, d); let f = _mm256_unpackhi_epi8(c, d); let g = _mm256_unpacklo_epi8(e, f); let h = _mm256_unpackhi_epi8(e, f);
271    
272    let i = _mm256_permute2x128_si256::<{ shuffle(3, 2, 1, 0) }>(g, h);
274    
275    let j = _mm256_permute2x128_si256::<{ shuffle(1, 2, 3, 0) }>(g, h);
276    
277    let k = _mm256_permute2x128_si256::<{ shuffle(3, 2, 0, 1) }>(g, h);
278    
279    let l = _mm256_permute2x128_si256::<{ shuffle(0, 3, 2, 1) }>(g, h);
280    
281    let m = _mm256_blend_epi32::<0b1111_0000>(i, j);
282    
283    let n = _mm256_blend_epi32::<0b1111_0000>(k, l);
284    
285    _mm256_storeu_si256(tmp.as_mut_ptr().cast(), m);
288    
289    _mm256_storeu_si256(tmp[32..].as_mut_ptr().cast(), n);
290
291    *offset += 64;
292}
293
/// Build an x86 `_MM_SHUFFLE`-style immediate from four 2-bit lane selectors.
///
/// `z` occupies the highest bit pair and `w` the lowest, giving the bit
/// layout `zz_yy_xx_ww`.
#[inline]
const fn shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
    w | (x << 2) | (y << 4) | (z << 6)
}